Adding RTCD for DNN code

Starting with compute_linear()
This commit is contained in:
Jean-Marc Valin 2023-11-13 18:26:31 -05:00
parent b0620c0bf9
commit 2e034f6f31
No known key found for this signature in database
GPG key ID: 531A52533318F00A
31 changed files with 539 additions and 165 deletions

View file

@ -50,18 +50,30 @@ if CPU_X86
if HAVE_RTCD if HAVE_RTCD
CELT_SOURCES += $(CELT_SOURCES_X86_RTCD) CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
SILK_SOURCES += $(SILK_SOURCES_X86_RTCD) SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
endif
endif endif
if HAVE_SSE if HAVE_SSE
CELT_SOURCES += $(CELT_SOURCES_SSE) CELT_SOURCES += $(CELT_SOURCES_SSE)
endif endif
if HAVE_SSE2 if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE2) CELT_SOURCES += $(CELT_SOURCES_SSE2)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
endif
endif endif
if HAVE_SSE4_1 if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE4_1) CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
endif
endif endif
if HAVE_AVX2 if HAVE_AVX2
CELT_SOURCES += $(CELT_SOURCES_AVX2) CELT_SOURCES += $(CELT_SOURCES_AVX2)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
endif
endif endif
endif endif
@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
endif endif
if HAVE_SSE2 if HAVE_SSE2
SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
$(DNN_SOURCES_SSE2:.c=.lo)
$(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) $(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
endif endif
if HAVE_SSE4_1 if HAVE_SSE4_1
SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
$(DNN_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_SSE4_1:.c=.lo) \ $(SILK_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
$(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) $(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
endif endif
if HAVE_AVX2 if HAVE_AVX2
AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
$(DNN_SOURCES_AVX2:.c=.lo)
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS) $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
endif endif

View file

@ -47,7 +47,7 @@
# endif # endif
# if defined(OPUS_X86_MAY_HAVE_AVX2) # if defined(OPUS_X86_MAY_HAVE_AVX2)
# define MAY_HAVE_AVX2(name) name ## _avx # define MAY_HAVE_AVX2(name) name ## _avx2
# else # else
# define MAY_HAVE_AVX2(name) name ## _c # define MAY_HAVE_AVX2(name) name ## _c
# endif # endif

View file

@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init)
*init = 1; *init = 1;
} }
void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents) void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
{ {
int i; int i;
RDOVAEDecState dec; RDOVAEDecState dec;
memset(&dec, 0, sizeof(dec)); memset(&dec, 0, sizeof(dec));
dred_rdovae_dec_init_states(&dec, model, state); dred_rdovae_dec_init_states(&dec, model, state, arch);
for (i = 0; i < 2*nb_latents; i += 2) for (i = 0; i < 2*nb_latents; i += 2)
{ {
dred_rdovae_decode_qframe( dred_rdovae_decode_qframe(
&dec, &dec,
model, model,
&features[2*i*DRED_NUM_FEATURES], &features[2*i*DRED_NUM_FEATURES],
&latents[(i/2)*DRED_LATENT_DIM]); &latents[(i/2)*DRED_LATENT_DIM],
arch);
} }
} }
void dred_rdovae_dec_init_states( void dred_rdovae_dec_init_states(
RDOVAEDecState *h, /* io: state buffer handle */ RDOVAEDecState *h, /* io: state buffer handle */
const RDOVAEDec *model, const RDOVAEDec *model,
const float *initial_state /* i: initial state */ const float *initial_state, /* i: initial state */
int arch
) )
{ {
float hidden[DEC_HIDDEN_INIT_OUT_SIZE]; float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE]; float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
int counter=0; int counter=0;
compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH); compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH); compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE); OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
counter += DEC_GRU1_STATE_SIZE; counter += DEC_GRU1_STATE_SIZE;
OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE); OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe(
RDOVAEDecState *dec_state, /* io: state buffer handle */ RDOVAEDecState *dec_state, /* io: state buffer handle */
const RDOVAEDec *model, const RDOVAEDec *model,
float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */ float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
const float *input /* i: latent vector */ const float *input, /* i: latent vector */
int arch
) )
{ {
float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe(
int output_index = 0; int output_index = 0;
/* run encoder stack and concatenate output in buffer*/ /* run encoder stack and concatenate output in buffer*/
compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH); compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += DEC_DENSE1_OUT_SIZE; output_index += DEC_DENSE1_OUT_SIZE;
compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer); compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state); compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
output_index += DEC_GRU1_OUT_SIZE; output_index += DEC_GRU1_OUT_SIZE;
conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV1_OUT_SIZE; output_index += DEC_CONV1_OUT_SIZE;
compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer); compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state); compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
output_index += DEC_GRU2_OUT_SIZE; output_index += DEC_GRU2_OUT_SIZE;
conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV2_OUT_SIZE; output_index += DEC_CONV2_OUT_SIZE;
compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer); compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state); compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
output_index += DEC_GRU3_OUT_SIZE; output_index += DEC_GRU3_OUT_SIZE;
conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV3_OUT_SIZE; output_index += DEC_CONV3_OUT_SIZE;
compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer); compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state); compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
output_index += DEC_GRU4_OUT_SIZE; output_index += DEC_GRU4_OUT_SIZE;
conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV4_OUT_SIZE; output_index += DEC_CONV4_OUT_SIZE;
compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer); compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state); compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
output_index += DEC_GRU5_OUT_SIZE; output_index += DEC_GRU5_OUT_SIZE;
conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV5_OUT_SIZE; output_index += DEC_CONV5_OUT_SIZE;
compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR); compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
} }

View file

@ -46,8 +46,8 @@ struct RDOVAEDecStruct {
float conv5_state[DEC_CONV5_STATE_SIZE]; float conv5_state[DEC_CONV5_STATE_SIZE];
}; };
void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state); void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z); void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents); void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);
#endif #endif

View file

@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe(
const RDOVAEEnc *model, const RDOVAEEnc *model,
float *latents, /* o: latent vector */ float *latents, /* o: latent vector */
float *initial_state, /* o: initial state */ float *initial_state, /* o: initial state */
const float *input /* i: double feature frame (concatenated) */ const float *input, /* i: double feature frame (concatenated) */
int arch
) )
{ {
float padded_latents[DRED_PADDED_LATENT_DIM]; float padded_latents[DRED_PADDED_LATENT_DIM];
@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe(
int output_index = 0; int output_index = 0;
/* run encoder stack and concatenate output in buffer*/ /* run encoder stack and concatenate output in buffer*/
compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH); compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += ENC_DENSE1_OUT_SIZE; output_index += ENC_DENSE1_OUT_SIZE;
compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer); compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
output_index += ENC_GRU1_OUT_SIZE; output_index += ENC_GRU1_OUT_SIZE;
conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized); conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += ENC_CONV1_OUT_SIZE; output_index += ENC_CONV1_OUT_SIZE;
compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer); compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
output_index += ENC_GRU2_OUT_SIZE; output_index += ENC_GRU2_OUT_SIZE;
conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV2_OUT_SIZE; output_index += ENC_CONV2_OUT_SIZE;
compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer); compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
output_index += ENC_GRU3_OUT_SIZE; output_index += ENC_GRU3_OUT_SIZE;
conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV3_OUT_SIZE; output_index += ENC_CONV3_OUT_SIZE;
compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer); compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
output_index += ENC_GRU4_OUT_SIZE; output_index += ENC_GRU4_OUT_SIZE;
conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV4_OUT_SIZE; output_index += ENC_CONV4_OUT_SIZE;
compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer); compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
output_index += ENC_GRU5_OUT_SIZE; output_index += ENC_GRU5_OUT_SIZE;
conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV5_OUT_SIZE; output_index += ENC_CONV5_OUT_SIZE;
compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR); compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM); OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);
/* next, calculate initial state */ /* next, calculate initial state */
compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH); compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR); compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM); OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
} }

View file

@ -46,7 +46,7 @@ struct RDOVAEEncStruct {
float conv5_state[2*ENC_CONV5_STATE_SIZE]; float conv5_state[2*ENC_CONV5_STATE_SIZE];
}; };
void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input); void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);
#endif #endif

View file

@ -42,6 +42,7 @@
#include "lpcnet.h" #include "lpcnet.h"
#include "lpcnet_private.h" #include "lpcnet_private.h"
#include "os_support.h" #include "os_support.h"
#include "cpu_support.h"
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) { static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@ -135,7 +136,9 @@ int main(int argc, char **argv) {
FILE *fnoise = NULL; FILE *fnoise = NULL;
float noise_gain = 0; float noise_gain = 0;
long noise_size=0; long noise_size=0;
int arch;
srand(getpid()); srand(getpid());
arch = opus_select_arch();
st = lpcnet_encoder_create(); st = lpcnet_encoder_create();
argv0=argv[0]; argv0=argv[0];
if (argc == 5 && strcmp(argv[1], "-btrain")==0) { if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@ -244,7 +247,7 @@ int main(int argc, char **argv) {
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5; for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */ /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]); for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
compute_frame_features(st, x); compute_frame_features(st, x, arch);
if (fpcm) { if (fpcm) {
compute_noise(noisebuf, noise_std); compute_noise(noisebuf, noise_std);

View file

@ -36,6 +36,7 @@
#include "pitch.h" #include "pitch.h"
#include "nnet.h" #include "nnet.h"
#include "lpcnet_private.h" #include "lpcnet_private.h"
#include "cpu_support.h"
#define FARGAN_FEATURES (NB_FEATURES) #define FARGAN_FEATURES (NB_FEATURES)
@ -52,9 +53,9 @@ static void compute_fargan_cond(FARGANState *st, float *cond, const float *featu
OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE); OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
OPUS_COPY(dense_in, features, NB_FEATURES); OPUS_COPY(dense_in, features, NB_FEATURES);
compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH); compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
} }
static void fargan_deemphasis(float *pcm, float *deemph_mem) { static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
celt_assert(st->cont_initialized); celt_assert(st->cont_initialized);
model = &st->model; model = &st->model;
compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR); compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
gain = exp(gain); gain = exp(gain);
gain_1 = 1.f/(1e-5f + gain); gain_1 = 1.f/(1e-5f + gain);
@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4); OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs); celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in); compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);
compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID); compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in); compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state); compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in); compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state); compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in); compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state); compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);
OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE); OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE); OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH); compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out); compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);
compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH); compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE); OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@ -174,13 +175,13 @@ void fargan_init(FARGANState *st)
{ {
int ret; int ret;
OPUS_CLEAR(st, 1); OPUS_CLEAR(st, 1);
st->arch = opus_select_arch();
#ifndef USE_WEIGHTS_FILE #ifndef USE_WEIGHTS_FILE
ret = init_fargan(&st->model, fargan_arrays); ret = init_fargan(&st->model, fargan_arrays);
#else #else
ret = 0; ret = 0;
#endif #endif
celt_assert(ret == 0); celt_assert(ret == 0);
/* FIXME: perform arch detection. */
} }
int fargan_load_model(FARGANState *st, const unsigned char *data, int len) { int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {

View file

@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf)
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success * @retval 0 Success
*/ */
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]); int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame. /** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success * @retval 0 Success
*/ */
int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]); int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Gets the size of an <code>LPCNetState</code> structure. /** Gets the size of an <code>LPCNetState</code> structure.
* @returns The size in bytes. * @returns The size in bytes.

View file

@ -37,6 +37,7 @@
#include "freq.h" #include "freq.h"
#include "os_support.h" #include "os_support.h"
#include "fargan.h" #include "fargan.h"
#include "cpu_support.h"
#ifdef USE_WEIGHTS_FILE #ifdef USE_WEIGHTS_FILE
# if __unix__ # if __unix__
@ -99,12 +100,14 @@ void usage(void) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
int mode=0; int mode=0;
int arch;
FILE *fin, *fout; FILE *fin, *fout;
#ifdef USE_WEIGHTS_FILE #ifdef USE_WEIGHTS_FILE
int len; int len;
unsigned char *data; unsigned char *data;
const char *filename = "weights_blob.bin"; const char *filename = "weights_blob.bin";
#endif #endif
arch = opus_select_arch();
if (argc < 4) usage(); if (argc < 4) usage();
if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES; if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS; else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@ -137,7 +140,7 @@ int main(int argc, char **argv) {
size_t ret; size_t ret;
ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin); ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
if (feof(fin) || ret != LPCNET_FRAME_SIZE) break; if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
lpcnet_compute_single_frame_features(net, pcm, features); lpcnet_compute_single_frame_features(net, pcm, features, arch);
fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout); fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
} }
lpcnet_encoder_destroy(net); lpcnet_encoder_destroy(net);

View file

@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const
#define celt_log10(x) (0.3010299957f*celt_log2(x)) #define celt_log10(x) (0.3010299957f*celt_log2(x))
void compute_frame_features(LPCNetEncState *st, const float *in) { void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
float aligned_in[FRAME_SIZE]; float aligned_in[FRAME_SIZE];
int i; int i;
float Ly[NB_BANDS]; float Ly[NB_BANDS];
@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER); OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE); OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER); OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch); celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
for (i=0;i<FRAME_SIZE;i++) { for (i=0;i<FRAME_SIZE;i++) {
st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt; st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i]; st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
{ {
double ener1; double ener1;
float *buf = st->exc_buf; float *buf = st->exc_buf;
celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch); celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE); ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1); ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
/*printf("%f\n", st->frame_weight[sub]);*/ /*printf("%f\n", st->frame_weight[sub]);*/
@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
} }
/*printf("\n");*/ /*printf("\n");*/
} }
st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features); st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
} }
void process_single_frame(LPCNetEncState *st, FILE *ffeat) { void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
} }
} }
static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) { static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE); preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
compute_frame_features(st, x); compute_frame_features(st, x, arch);
process_single_frame(st, NULL); process_single_frame(st, NULL);
OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES); OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
return 0; return 0;
} }
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) { int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i; int i;
float x[FRAME_SIZE]; float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i]; for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
lpcnet_compute_single_frame_features_impl(st, x, features); lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0; return 0;
} }
int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) { int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i; int i;
float x[FRAME_SIZE]; float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i]; for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
lpcnet_compute_single_frame_features_impl(st, x, features); lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0; return 0;
} }

View file

@ -33,6 +33,7 @@
#include "plc_data.h" #include "plc_data.h"
#include "os_support.h" #include "os_support.h"
#include "common.h" #include "common.h"
#include "cpu_support.h"
#ifndef M_PI #ifndef M_PI
#define M_PI 3.141592653 #define M_PI 3.141592653
@ -54,6 +55,7 @@ void lpcnet_plc_reset(LPCNetPLCState *st) {
int lpcnet_plc_init(LPCNetPLCState *st) { int lpcnet_plc_init(LPCNetPLCState *st) {
int ret; int ret;
st->arch = opus_select_arch();
fargan_init(&st->fargan); fargan_init(&st->fargan);
lpcnet_encoder_init(&st->enc); lpcnet_encoder_init(&st->enc);
st->analysis_pos = PLC_BUF_SIZE; st->analysis_pos = PLC_BUF_SIZE;
@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
float dense_out[PLC_DENSE1_OUT_SIZE]; float dense_out[PLC_DENSE1_OUT_SIZE];
PLCNetState *net = &st->plc_net; PLCNetState *net = &st->plc_net;
celt_assert(st->loaded); celt_assert(st->loaded);
_lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in); _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out); compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state); compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
_lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state); _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
} }
static int get_fec_or_pred(LPCNetPLCState *st, float *out) { static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) {
float plc_features[2*NB_BANDS+NB_FEATURES+1]; float plc_features[2*NB_BANDS+NB_FEATURES+1];
for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i]; for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
burg_cepstral_analysis(plc_features, x); burg_cepstral_analysis(plc_features, x);
lpcnet_compute_single_frame_features_float(&st->enc, x, st->features); lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
if ((st->analysis_gap && count > 0) || count > 1) { if ((st->analysis_gap && count > 0) || count > 1) {
queue_features(st, st->features); queue_features(st, st->features);
OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES); OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);

View file

@ -24,7 +24,6 @@
struct LPCNetEncState{ struct LPCNetEncState{
PitchDNNState pitchdnn; PitchDNNState pitchdnn;
int arch;
float analysis_mem[OVERLAP_SIZE]; float analysis_mem[OVERLAP_SIZE];
float mem_preemph; float mem_preemph;
kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ]; kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@ -67,7 +66,7 @@ struct LPCNetPLCState {
void preemphasis(float *y, float *mem, const float *x, float coef, int N); void preemphasis(float *y, float *mem, const float *x, float coef, int N);
void compute_frame_features(LPCNetEncState *st, const float *in); void compute_frame_features(LPCNetEncState *st, const float *in, int arch);
void lpcnet_reset_signal(LPCNetState *lpcnet); void lpcnet_reset_signal(LPCNetState *lpcnet);
void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features); void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N,
void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload); void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N); void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
void process_single_frame(LPCNetEncState *st, FILE *ffeat); void process_single_frame(LPCNetEncState *st, FILE *ffeat);
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
void process_single_frame(LPCNetEncState *st, FILE *ffeat); void process_single_frame(LPCNetEncState *st, FILE *ffeat);

View file

@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x)
return x < 0 ? 0 : x; return x < 0 ? 0 : x;
} }
static void compute_linear(const LinearLayer *linear, float *out, const float *in) void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
{ {
int i, M, N; compute_linear(layer, output, input, arch);
const float *bias;
celt_assert(in != out);
bias = linear->bias;
M = linear->nb_inputs;
N = linear->nb_outputs;
if (linear->float_weights != NULL) {
if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
else sgemv(out, linear->float_weights, N, M, N, in);
} else if (linear->weights != NULL) {
if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
/* Only use SU biases on for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
bias = linear->subias;
#endif
}
else OPUS_CLEAR(out, N);
if (bias != NULL) {
for (i=0;i<N;i++) out[i] += bias[i];
}
if (linear->diag) {
/* Diag is only used for GRU recurrent weights. */
celt_assert(3*M == N);
for (i=0;i<M;i++) {
out[i] += linear->diag[i]*in[i];
out[i+M] += linear->diag[i+M]*in[i];
out[i+2*M] += linear->diag[i+2*M]*in[i];
}
}
}
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
{
compute_linear(layer, output, input);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
} }
#define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS) #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in) void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
{ {
int i; int i;
int N; int N;
@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
h = &zrh[2*N]; h = &zrh[2*N];
celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL); celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
celt_assert(in != state); celt_assert(in != state);
compute_linear(input_weights, zrh, in); compute_linear(input_weights, zrh, in, arch);
compute_linear(recurrent_weights, recur, state); compute_linear(recurrent_weights, recur, state, arch);
for (i=0;i<2*N;i++) for (i=0;i<2*N;i++)
zrh[i] += recur[i]; zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID); compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
state[i] = h[i]; state[i] = h[i];
} }
void compute_glu(const LinearLayer *layer, float *output, const float *input) void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
{ {
int i; int i;
float act2[MAX_INPUTS]; float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs); celt_assert(layer->nb_inputs == layer->nb_outputs);
compute_linear(layer, act2, input); compute_linear(layer, act2, input, arch);
compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID); compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
if (input == output) { if (input == output) {
/* Give a vectorization hint to the compiler for the in-place case. */ /* Give a vectorization hint to the compiler for the in-place case. */
@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation
} }
} }
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input) void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
{ {
LinearLayer matrix; LinearLayer matrix;
celt_assert(input != output); celt_assert(input != output);
@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
matrix.nb_inputs = layer->nb_inputs; matrix.nb_inputs = layer->nb_inputs;
matrix.nb_outputs = layer->nb_neurons; matrix.nb_outputs = layer->nb_neurons;
matrix.scale = NULL; matrix.scale = NULL;
compute_linear(&matrix, output, input); compute_linear(&matrix, output, input, arch);
compute_activation(output, output, layer->nb_neurons, layer->activation); compute_activation(output, output, layer->nb_neurons, layer->activation);
} }
@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
#endif #endif
#define MAX_IDX_SIZE 8192 #define MAX_IDX_SIZE 8192
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input) void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
{ {
LinearLayer in_matrix, rec_matrix; LinearLayer in_matrix, rec_matrix;
int i, M, N; int i, M, N;
@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat
rec_matrix.float_weights = NULL; rec_matrix.float_weights = NULL;
#endif #endif
rec_matrix.weights_idx = NULL; rec_matrix.weights_idx = NULL;
compute_generic_gru(&in_matrix, &rec_matrix, state, input); compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
} }
#define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS #define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS
void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation) void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
{ {
float tmp[MAX_CONV_INPUTS_ALL]; float tmp[MAX_CONV_INPUTS_ALL];
celt_assert(input != output); celt_assert(input != output);
celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL); celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp); compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
} }
void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation) void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
{ {
float tmp[MAX_CONV_INPUTS_ALL]; float tmp[MAX_CONV_INPUTS_ALL];
int ksize = layer->nb_inputs/input_size; int ksize = layer->nb_inputs/input_size;
@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size); else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp); compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
else { else {

View file

@ -126,18 +126,18 @@ typedef struct {
int dim; int dim;
} EmbeddingLayer; } EmbeddingLayer;
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation); void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in); void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation); void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation); void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
void compute_glu(const LinearLayer *layer, float *output, const float *input); void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation); void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_activation(float *output, const float *input, int N, int activation); void compute_activation(float *output, const float *input, int N, int activation);
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input); void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input); void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);
@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
/* Portable C implementation of the linear (matrix-vector) kernel; the
   arch-specific headers below may override the compute_linear() macro
   with an optimized or run-time-dispatched version. */
void compute_linear_c(const LinearLayer *linear, float *out, const float *in);

/* x86 overrides: may define OVERRIDE_COMPUTE_LINEAR and a faster compute_linear(). */
#if defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/dnn_x86.h"
#endif

/* Fallback: no override was installed, so route every arch to the C kernel.
   The (void)(arch) silences unused-argument warnings while keeping the
   4-argument calling convention uniform across builds. */
#ifndef OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
#endif

/* Build-time hint: on x86-64 compiled without SSE4.1/AVX2 support, warn that
   the DNN code will run noticeably slower than it could. */
#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
#if defined(_MSC_VER)
#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
#else
#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
#endif
#endif
#endif /* NNET_H_ */ #endif /* NNET_H_ */

76
dnn/nnet_arch.h Normal file
View file

@ -0,0 +1,76 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef NNET_ARCH_H
#define NNET_ARCH_H
#include "nnet.h"
#include "arch.h"
#include "os_support.h"
#include "vec.h"
/* Two-level concatenation so that RTCD_ARCH is macro-expanded before the
   token paste; RTCD_SUF(compute_linear_) yields e.g. compute_linear_sse2
   when this header is included with RTCD_ARCH defined to sse2. */
#define CAT_SUFFIX2(a,b) a ## b
#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
{
int i, M, N;
const float *bias;
celt_assert(in != out);
bias = linear->bias;
M = linear->nb_inputs;
N = linear->nb_outputs;
if (linear->float_weights != NULL) {
if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
else sgemv(out, linear->float_weights, N, M, N, in);
} else if (linear->weights != NULL) {
if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
/* Only use SU biases on for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
bias = linear->subias;
#endif
}
else OPUS_CLEAR(out, N);
if (bias != NULL) {
for (i=0;i<N;i++) out[i] += bias[i];
}
if (linear->diag) {
/* Diag is only used for GRU recurrent weights. */
celt_assert(3*M == N);
for (i=0;i<M;i++) {
out[i] += linear->diag[i]*in[i];
out[i+M] += linear->diag[i+M]*in[i];
out[i+2*M] += linear->diag[i+2*M]*in[i];
}
}
}
#endif

35
dnn/nnet_default.c Normal file
View file

@ -0,0 +1,35 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* Instantiate the generic C version of the DNN kernels from nnet_arch.h,
   producing compute_linear_c (the portable RTCD fallback). */
#define RTCD_ARCH c

#include "nnet_arch.h"

View file

@ -12,7 +12,8 @@
float compute_pitchdnn( float compute_pitchdnn(
PitchDNNState *st, PitchDNNState *st,
const float *if_features, const float *if_features,
const float *xcorr_features const float *xcorr_features,
int arch
) )
{ {
float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE]; float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@ -28,16 +29,16 @@ float compute_pitchdnn(
float count=0; float count=0;
PitchDNN *model = &st->model; PitchDNN *model = &st->model;
/* IF */ /* IF */
compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH); compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH); compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
/* xcorr*/ /* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES); OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH); compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH); compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH); compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out); compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR); compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
for (i=0;i<180;i++) { for (i=0;i<180;i++) {
if (output[i] > maxval) { if (output[i] > maxval) {
pos = i; pos = i;
@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st)
ret = 0; ret = 0;
#endif #endif
celt_assert(ret == 0); celt_assert(ret == 0);
/* FIXME: perform arch detection. */
} }
int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) { int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {

View file

@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len);
float compute_pitchdnn( float compute_pitchdnn(
PitchDNNState *st, PitchDNNState *st,
const float *if_features, const float *if_features,
const float *xcorr_features const float *xcorr_features,
int arch
); );
#endif #endif

View file

@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a,
return res; return res;
} }
#if defined(_MSC_VER)
#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
#else
#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
#endif
#else #else

78
dnn/x86/dnn_x86.h Normal file
View file

@ -0,0 +1,78 @@
/* Copyright (c) 2011-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DNN_X86_H
#define DNN_X86_H

#include "cpu_support.h"
#include "opus_types.h"

/* Prototypes for the arch-specific compute_linear kernels, each compiled
   from nnet_arch.h with the matching RTCD_ARCH suffix. */
#if defined(OPUS_X86_MAY_HAVE_SSE2)
void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
#endif

#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
#endif

#if defined(OPUS_X86_MAY_HAVE_AVX2)
void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
#endif

/* Dispatch policy, from strongest compile-time guarantee down to run-time
   detection:
   - PRESUME_<isa>: the build targets that ISA unconditionally, so call the
     matching kernel directly and ignore the runtime arch value.
   - Otherwise, with RTCD enabled and at least one optional ISA compiled in,
     dispatch through a per-arch function-pointer table indexed by
     (arch & OPUS_ARCHMASK). */
#if defined(OPUS_X86_PRESUME_AVX2)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))

#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))

#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))

#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))

/* Run-time dispatch table (defined in the x86 RTCD tables). */
extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
                    const LinearLayer *linear,
                    float *out,
                    const float *in
                    );
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) \
    ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))

#endif

#endif /* DNN_X86_H */

38
dnn/x86/nnet_avx2.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* This translation unit must be built with AVX2 codegen enabled (the build
   system adds the flags); fail loudly if it is not. */
#ifndef __AVX2__
#error nnet_avx2.c is being compiled without AVX2 enabled
#endif

/* Instantiate the DNN kernels from nnet_arch.h as compute_linear_avx2. */
#define RTCD_ARCH avx2

#include "nnet_arch.h"

38
dnn/x86/nnet_sse2.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
/* Sanity check: this translation unit must be compiled with SSE2 code
   generation enabled (the build system is expected to add the SSE2
   compiler flags for this file); failing here points at a build-system
   misconfiguration rather than a source bug. */
#ifndef __SSE2__
#error nnet_sse2.c is being compiled without SSE2 enabled
#endif
/* RTCD_ARCH selects the per-architecture suffix used by nnet_arch.h when
   it instantiates the nnet kernels, producing the sse2 variants here. */
#define RTCD_ARCH sse2
#include "nnet_arch.h"

38
dnn/x86/nnet_sse4_1.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
/* Sanity check: this translation unit must be compiled with SSE4.1 code
   generation enabled (the build system is expected to add the SSE4.1
   compiler flags for this file); failing here points at a build-system
   misconfiguration rather than a source bug. */
#ifndef __SSE4_1__
#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
#endif
/* RTCD_ARCH selects the per-architecture suffix used by nnet_arch.h when
   it instantiates the nnet kernels, producing the sse4_1 variants here. */
#define RTCD_ARCH sse4_1
#include "nnet_arch.h"

54
dnn/x86/x86_dnn_map.c Normal file
View file

@ -0,0 +1,54 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "x86/x86cpu.h"
#include "nnet.h"
#if defined(OPUS_HAVE_RTCD)
/* Only emit the table when dispatch is actually needed: at least one SIMD
   level may be present at run time and AVX2 is not presumed at build time
   (if AVX2 is presumed, the header binds compute_linear directly). */
#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
/* Run-time dispatch table for compute_linear(), indexed by
   (arch & OPUS_ARCHMASK).  Entry order is a contract: it must match the
   x86 arch numbering used by the CPU detection code — presumably
   0 = non-SSE, 1 = SSE, 2 = SSE2, 3 = SSE4.1, 4 = AVX2, per the comments
   below; confirm against x86/x86cpu.h.  The MAY_HAVE_*() macros resolve
   to the SIMD implementation when that ISA may be available, or to the C
   fallback otherwise. */
void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
const LinearLayer *linear,
float *out,
const float *in
) = {
compute_linear_c, /* non-sse */
compute_linear_c, /* sse */
MAY_HAVE_SSE2(compute_linear),
MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */
MAY_HAVE_AVX2(compute_linear) /* avx */
};
#endif
#endif

View file

@ -12,7 +12,9 @@ dnn/vec.h \
dnn/vec_avx.h \ dnn/vec_avx.h \
dnn/vec_neon.h \ dnn/vec_neon.h \
dnn/pitchdnn.h \ dnn/pitchdnn.h \
dnn/pitchdnn_data.h dnn/pitchdnn_data.h \
dnn/x86/dnn_x86.h \
dnn/nnet_arch.h
DRED_HEAD = \ DRED_HEAD = \
silk/dred_coding.h \ silk/dred_coding.h \

View file

@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \
dnn/lpcnet_plc.c \ dnn/lpcnet_plc.c \
dnn/lpcnet_tables.c \ dnn/lpcnet_tables.c \
dnn/nnet.c \ dnn/nnet.c \
dnn/nnet_default.c \
dnn/plc_data.c \ dnn/plc_data.c \
dnn/parse_lpcnet_weights.c \ dnn/parse_lpcnet_weights.c \
dnn/pitchdnn.c \ dnn/pitchdnn.c \
@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \
silk/dred_encoder.c \ silk/dred_encoder.c \
silk/dred_coding.c \ silk/dred_coding.c \
silk/dred_decoder.c silk/dred_decoder.c
DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c

View file

@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels)
dred_encoder_reset(enc); dred_encoder_reset(enc);
} }
static void dred_process_frame(DREDEnc *enc) static void dred_process_frame(DREDEnc *enc, int arch)
{ {
float feature_buffer[2 * 36]; float feature_buffer[2 * 36];
float input_buffer[2*DRED_NUM_FEATURES] = {0}; float input_buffer[2*DRED_NUM_FEATURES] = {0};
@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc)
OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM); OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);
/* calculate LPCNet features */ /* calculate LPCNet features */
lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer); lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36); lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);
/* prepare input buffer (discard LPC coefficients) */ /* prepare input buffer (discard LPC coefficients) */
OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES); OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES); OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);
/* run RDOVAE encoder */ /* run RDOVAE encoder */
dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer); dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES); enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
} }
@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float
} }
} }
void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay) void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
{ {
int curr_offset16k; int curr_offset16k;
int frame_size16k = frame_size * 16000 / enc->Fs; int frame_size16k = frame_size * 16000 / enc->Fs;
@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE) if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
{ {
curr_offset16k += 320; curr_offset16k += 320;
dred_process_frame(enc); dred_process_frame(enc, arch);
enc->input_buffer_fill -= 2*DRED_FRAME_SIZE; enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill); OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
/* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */ /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */

View file

@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc);
void dred_deinit_encoder(DREDEnc *enc); void dred_deinit_encoder(DREDEnc *enc);
void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay); void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes); int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);

View file

@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *
OPUS_COPY(dst, src, 1); OPUS_COPY(dst, src, 1);
if (dst->process_stage == 2) if (dst->process_stage == 2)
return OPUS_OK; return OPUS_OK;
DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents); DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
dst->process_stage = 2; dst->process_stage = 2;
return OPUS_OK; return OPUS_OK;
#else #else

View file

@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifdef ENABLE_DRED #ifdef ENABLE_DRED
if ( st->dred_duration > 0 && st->dred_encoder.loaded ) { if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
/* DRED Encoder */ /* DRED Encoder */
dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer ); dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
} else { } else {
st->dred_encoder.latents_buffer_fill = 0; st->dred_encoder.latents_buffer_fill = 0;
} }