mirror of https://github.com/xiph/opus.git
synced 2025-05-29 14:49:14 +00:00

Remove trailing whitespace in dnn

parent 26ab10d0c8, commit f36685fc97
37 changed files with 231 additions and 246 deletions
@@ -115,7 +115,7 @@ This codebase is also meant for research and it is possible to train new models.
and move the generated nnet_data.* files to the src/ directory.
Then you just need to rebuild the software and use lpcnet_demo as explained above.

# Speech Material for Training

Suitable training material can be obtained from [Open Speech and Language Resources](https://www.openslr.org/). See the datasets.txt file for details on suitable training data.

@@ -123,5 +123,4 @@ Suitable training material can be obtained from [Open Speech and Language Resour

1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/)
1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/
@@ -171,4 +171,3 @@ The corresponding citations for all these datasets are:
journal={arXiv preprint arXiv:2104.01497},
year={2021}
}
@@ -9,4 +9,3 @@ if not exist %model% (
tar -xvzf %model%
move .\src\*.c .
move .\src\*.h .
@@ -98,7 +98,7 @@ void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *f
/* Simulate error on excitation. */
e += noise[k*FRAME_SIZE+i];
e = IMIN(255, IMAX(0, e));

RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
st->sig_mem[0] = p + ulaw2lin(e);
st->exc_mem = e;

@@ -241,7 +241,7 @@ int main(int argc, char **argv) {
if (fpcm) {
compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std);
}

process_single_frame(st, ffeat);
if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);
st->pcount++;

@@ -260,4 +260,3 @@ int main(int argc, char **argv) {
lpcnet_encoder_destroy(st);
return 0;
}
@@ -326,4 +326,3 @@ void apply_window(float *x) {
x[WINDOW_SIZE - 1 - i] *= half_window[i];
}
}
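The apply_window() hunk above tapers both ends of the analysis frame with the same half-window (the visible line handles the trailing half; the leading half is symmetric). A minimal NumPy sketch of that pattern; WINDOW_SIZE and the sine-based half_window here are illustrative stand-ins, not the exact values defined in the C code:

```python
import numpy as np

WINDOW_SIZE = 320            # assumption for illustration; the real value comes from the C headers
HALF = WINDOW_SIZE // 2

# hypothetical half window; the hunk only shows that one half-window tapers both frame ends
half_window = np.sin(0.5 * np.pi * np.sin(0.5 * np.pi * (np.arange(HALF) + 0.5) / HALF) ** 2)

def apply_window(x):
    """Taper both ends of x in place, mirroring x[i] *= half_window[i] and
    x[WINDOW_SIZE - 1 - i] *= half_window[i] from the C loop."""
    for i in range(HALF):
        x[i] *= half_window[i]
        x[WINDOW_SIZE - 1 - i] *= half_window[i]
    return x

frame = np.ones(WINDOW_SIZE, dtype=np.float32)
apply_window(frame)          # the ends taper toward zero, the middle stays near one
```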
@@ -61,7 +61,7 @@ void rc2lpc(float *lpc, const float *rc)
float ntmp[LPC_ORDER] = {0.0};
RNN_COPY(tmp, rc, LPC_ORDER);
for(i = 0; i < LPC_ORDER ; i++)
{
for(j = 0; j <= i-1; j++)
{
ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1];
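The rc2lpc() loop above is the standard step-up recursion from reflection coefficients to direct-form LPC coefficients: at order i, ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1]. A small NumPy sketch of the same recursion, with LPC_ORDER assumed to be 16:

```python
import numpy as np

LPC_ORDER = 16   # assumption; the usual LPCNet order

def rc2lpc(rc):
    """Step-up recursion: turn reflection coefficients rc[0..M-1] into
    direct-form LPC coefficients, mirroring the C loop shown above."""
    tmp = np.asarray(rc, dtype=np.float32).copy()
    for i in range(LPC_ORDER):
        ntmp = tmp.copy()
        for j in range(i):                       # j = 0 .. i-1
            ntmp[j] = tmp[j] + tmp[i] * tmp[i - j - 1]
        tmp = ntmp                               # the C code copies ntmp back into tmp
    return tmp

lpc = rc2lpc(0.1 * np.ones(LPC_ORDER))
```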
@@ -106,7 +106,7 @@ void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b
_lpcnet_compute_dense(&lpcnet->model.gru_b_dense_feature, gru_b_condition, condition);
#ifdef END2END
rc2lpc(lpc, rc);
#elif FEATURES_DELAY>0
memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
lpc_from_cepstrum(lpcnet->old_lpc[0], features);
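In run_frame_network() above, when FEATURES_DELAY > 0 the LPC set actually used is the oldest entry of a small FIFO: it is copied out (the memcpy), the buffer is shifted by one frame (the memmove), and a fresh set derived from the current cepstral features is pushed in. A sketch of that buffering in Python; FEATURES_DELAY, LPC_ORDER and lpc_from_cepstrum are placeholders, not the real definitions:

```python
import numpy as np

FEATURES_DELAY = 2   # assumption for illustration
LPC_ORDER = 16       # assumption

def lpc_from_cepstrum(features):
    """Placeholder for the real cepstrum-to-LPC conversion."""
    return np.zeros(LPC_ORDER, dtype=np.float32)

old_lpc = np.zeros((FEATURES_DELAY, LPC_ORDER), dtype=np.float32)

def delayed_lpc(features):
    # use the oldest stored LPC set (memcpy in the C code)
    lpc = old_lpc[FEATURES_DELAY - 1].copy()
    # shift the FIFO by one frame (memmove in the C code)
    old_lpc[1:] = old_lpc[:-1].copy()
    # compute a new set from the current features and push it in
    old_lpc[0] = lpc_from_cepstrum(features)
    return lpc
```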
dnn/nnet.c (10 lines changed)
@@ -170,7 +170,7 @@ int sample_mdense(const MDenseLayer *layer, const float *input, const float *sam
C = layer->nb_channels;
celt_assert(N*C <= MAX_MDENSE_TMP);
stride = M*C;

celt_assert(N <= DUAL_FC_OUT_SIZE);

/* Computing all the random thresholds in advance. These thresholds are directly

@@ -188,7 +188,7 @@ int sample_mdense(const MDenseLayer *layer, const float *input, const float *sam
int bit;
int i;
float sum1, sum2;

i = (1<<b) | val;

sum1 = layer->bias[i];

@@ -426,7 +426,7 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *in
#ifdef USE_SU_BIAS
bias = &gru->subias[3*N];
#else
bias = &gru->bias[3*N];
#endif
for (k=0;k<2;k++)
{

@@ -478,7 +478,7 @@ void compute_embedding(const EmbeddingLayer *layer, float *output, int input)
for (i=0;i<layer->dim;i++)
{
output[i] = layer->embedding_weights[input*layer->dim + i];
}
}

void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3) {

@@ -499,5 +499,5 @@ void accum_embedding(const EmbeddingLayer *layer, float *output, int input)
for (i=0;i<layer->dim;i++)
{
output[i] += layer->embedding_weights[input*layer->dim + i];
}
}
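compute_embedding() and accum_embedding() above are plain row lookups into a flat weight table: entry `input` occupies `dim` consecutive floats, read out or accumulated element by element. A NumPy sketch of those two operations; the compute_gru_a_input() part is only a plausible reading of the signature shown here (its body is not part of this hunk), and all dimensions are made up:

```python
import numpy as np

DIM = 8          # embedding width, illustrative only
VOCAB = 256      # number of table entries, illustrative only
rng = np.random.default_rng(0)
weights = rng.standard_normal(VOCAB * DIM).astype(np.float32)   # flat table, as in EmbeddingLayer

def compute_embedding(idx):
    # output[i] = embedding_weights[input*dim + i]
    return weights[idx * DIM:(idx + 1) * DIM].copy()

def accum_embedding(out, idx):
    # output[i] += embedding_weights[input*dim + i]
    out += weights[idx * DIM:(idx + 1) * DIM]
    return out

def compute_gru_a_input(base, val1, val2, val3):
    # Hypothetical reading of the C signature: a base input with three embedded
    # values accumulated on top of it.
    out = np.asarray(base, dtype=np.float32).copy()
    for v in (val1, val2, val3):
        accum_embedding(out, v)
    return out

x = compute_gru_a_input(np.zeros(DIM), 3, 17, 42)
```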
@@ -45,7 +45,7 @@ int parse_record(const unsigned char **data, int *len, WeightArray *array) {
array->type = h->type;
array->size = h->size;
array->data = (*data)+WEIGHT_BLOCK_SIZE;

*data += h->block_size+WEIGHT_BLOCK_SIZE;
*len -= h->block_size+WEIGHT_BLOCK_SIZE;
return array->size;

@@ -103,7 +103,7 @@ static const void *find_idx_check(const WeightArray *arrays, const char *name, i
if (remain < nb_blocks+1) return NULL;
for (i=0;i<nb_blocks;i++) {
int pos = *idx++;
if (pos+3 >= nb_in || (pos&0x3)) return NULL;
}
nb_out -= 8;
remain -= nb_blocks+1;
@@ -63,7 +63,7 @@ int test_sgemv_accum16() {
out[i] = 0;
out_fast[i] = 0;
}

for(i=0; i<COLS; i++) {
x[i] = i+1;
}

@@ -101,7 +101,7 @@ int test_sparse_sgemv_accum16() {
out[i] = 0;
out_fast[i] = 0;
}

sparse_sgemv_accum16(out, w, rows, indx, x);
sparse_sgemv_accum16_fast(out_fast, w, rows, indx, x);

@@ -126,5 +126,3 @@ int main() {
int test2 = test_sparse_sgemv_accum16();
return test1 || test2;
}
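The vector tests above follow one pattern: zero two output buffers, fill the input with 1, 2, 3, ..., run the reference kernel and the optimized kernel on the same data, and return non-zero if they disagree. A NumPy sketch of that pattern; both kernels below are stand-ins, not the real sgemv_accum16 implementations:

```python
import numpy as np

ROWS, COLS = 64, 48          # illustrative sizes; the C test uses its own ROWS/COLS

def sgemv_accum_ref(out, w, x):
    # reference: out += W @ x, with W stored row-major as a flat array
    out += w.reshape(ROWS, COLS) @ x

def sgemv_accum_fast(out, w, x):
    # stand-in for the optimized kernel under test
    out += w.reshape(ROWS, COLS) @ x

def test_sgemv_accum16():
    out = np.zeros(ROWS, dtype=np.float32)
    out_fast = np.zeros(ROWS, dtype=np.float32)
    x = np.arange(1, COLS + 1, dtype=np.float32)        # x[i] = i+1, as in the C test
    w = np.ones(ROWS * COLS, dtype=np.float32)
    sgemv_accum_ref(out, w, x)
    sgemv_accum_fast(out_fast, w, x)
    return int(np.max(np.abs(out - out_fast)) > 1e-4)   # non-zero return means failure

print(test_sgemv_accum16())
```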
@@ -80,14 +80,14 @@ extern const opus_uint16 dred_p0_q15[{levels * N}];

def c_export(args, model):

message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"

enc_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_enc_data"), message=message)
dec_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_dec_data"), message=message)
stats_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_stats_data"), message=message)
constants_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_constants"), message=message, header_only=True)

# some custom includes
for writer in [enc_writer, dec_writer, stats_writer]:
writer.header.write(

@@ -99,10 +99,10 @@ f"""
#include "nnet.h"
"""
)

# encoder
encoder_dense_layers = [
('core_encoder.module.dense_1' , 'enc_dense1', 'TANH'),
('core_encoder.module.dense_2' , 'enc_dense3', 'TANH'),
('core_encoder.module.dense_3' , 'enc_dense5', 'TANH'),
('core_encoder.module.dense_4' , 'enc_dense7', 'TANH'),

@@ -110,31 +110,31 @@ f"""
('core_encoder.module.state_dense_1' , 'gdense1' , 'TANH'),
('core_encoder.module.state_dense_2' , 'gdense2' , 'TANH')
]

for name, export_name, activation in encoder_dense_layers:
layer = model.get_submodule(name)
dump_torch_weights(enc_writer, layer, name=export_name, activation=activation, verbose=True)

encoder_gru_layers = [
('core_encoder.module.gru_1' , 'enc_dense2', 'TANH'),
('core_encoder.module.gru_2' , 'enc_dense4', 'TANH'),
('core_encoder.module.gru_3' , 'enc_dense6', 'TANH')
]

enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
for name, export_name, activation in encoder_gru_layers])

encoder_conv_layers = [
('core_encoder.module.conv1' , 'bits_dense' , 'LINEAR')
]

enc_max_conv_inputs = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True) for name, export_name, activation in encoder_conv_layers])

del enc_writer

# decoder
decoder_dense_layers = [
('core_decoder.module.gru_1_init' , 'state1', 'TANH'),
@@ -151,25 +151,25 @@ f"""
for name, export_name, activation in decoder_dense_layers:
layer = model.get_submodule(name)
dump_torch_weights(dec_writer, layer, name=export_name, activation=activation, verbose=True)

decoder_gru_layers = [
('core_decoder.module.gru_1' , 'dec_dense2', 'TANH'),
('core_decoder.module.gru_2' , 'dec_dense4', 'TANH'),
('core_decoder.module.gru_3' , 'dec_dense6', 'TANH')
]

dec_max_rnn_units = max([dump_torch_weights(dec_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
for name, export_name, activation in decoder_gru_layers])

del dec_writer

# statistical model
qembedding = model.statistical_model.quant_embedding
dump_statistical_model(stats_writer, qembedding)

del stats_writer

# constants
constants_writer.header.write(
f"""

@@ -193,12 +193,12 @@ f"""

"""
)

del constants_writer


def numpy_export(args, model):

exchange_name_to_name = {
'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',

@@ -225,20 +225,20 @@ def numpy_export(args, model):
'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
}

name_to_exchange_name = {value : key for key, value in exchange_name_to_name.items()}

for name, exchange_name in name_to_exchange_name.items():
print(f"printing layer {name}...")
dump_torch_weights(os.path.join(args.output_dir, exchange_name), model.get_submodule(name))


if __name__ == "__main__":

os.makedirs(args.output_dir, exist_ok=True)

# load model from checkpoint
checkpoint = torch.load(args.checkpoint, map_location='cpu')
model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])

@@ -249,7 +249,7 @@ if __name__ == "__main__":

if len(unmatched_keys) > 0:
print(f"warning: the following keys were unmatched {unmatched_keys}")

if args.format == 'C':
c_export(args, model)
elif args.format == 'numpy':
@@ -84,7 +84,7 @@ total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
# load signal
if args.input.endswith('.raw') or args.input.endswith('.pcm'):
signal = np.fromfile(args.input, dtype='int16')

elif args.input.endswith('.wav'):
fs, signal = wavfile.read(args.input)
else:

@@ -94,7 +94,7 @@ else:
padded_signal_length = len(signal) + total_delay
tail = padded_signal_length % frame_size
right_padding = (frame_size - tail) % frame_size

signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))

padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'

@@ -152,7 +152,7 @@ with torch.no_grad():
zi = torch.clone(z[:, i - 2 * input_length + 2: i + 1 : 2, :])
zi, rates = model.quantize(zi, quant_ids)
zi = model.unquantize(zi, quant_ids)

features = model.decode(zi, states[:, i : i + 1, :])
packets.append(features.squeeze(0).numpy())
packet_size = 8 * int((torch.sum(rates) + 7 + state_size) / 8)

@@ -176,7 +176,7 @@ if args.lossfile != None:
count = 2
for i in range(num_packets):
if (loss[i] == 0) or (i == num_packets - 1):

fec_out[ptr:ptr+count,:] = packets[i][foffset:, :]

ptr += count

@@ -190,14 +190,14 @@ if args.lossfile != None:
fec_out_full[:, : fec_out.shape[-1]] = fec_out

fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')


if args.debug_output:
import itertools

batches = [4]
offsets = [0, 2 * args.num_redundancy_frames - 4]

# sanity checks
# 1. concatenate features at offset 0
for batch, offset in itertools.product(batches, offsets):

@@ -210,4 +210,3 @@ if args.debug_output:

print(f"writing debug output {packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32'}")
test_features_full.tofile(packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32')
@@ -90,7 +90,7 @@ if __name__ == "__main__":
cond_size = args.cond_size
cond_size2 = args.cond_size2
state_dim = args.state_dim

# model
checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)

@@ -105,9 +105,9 @@ if __name__ == "__main__":
'encoder_stack_layer8_dense',
'encoder_state_layer1_dense',
'encoder_state_layer2_dense',
'decoder_state1_dense',
'decoder_state2_dense',
'decoder_state3_dense',
'decoder_stack_layer1_dense',
'decoder_stack_layer3_dense',
'decoder_stack_layer5_dense',

@@ -122,7 +122,7 @@ if __name__ == "__main__":
'encoder_stack_layer6_gru',
'decoder_stack_layer2_gru',
'decoder_stack_layer4_gru',
'decoder_stack_layer6_gru'
]

conv1d_layer_names = [
@@ -43,7 +43,7 @@ int get_fec_frame(const char * const filename, float *features, int packet_index
long offset;

FILE *fid = fopen(filename, "rb");

/* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;

@@ -88,7 +88,7 @@ int get_fec_rate(const char * const filename, int packet_index)
int16_t rate;

FILE *fid = fopen(filename, "rb");

/* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
@@ -33,25 +33,25 @@ import numpy as np

def write_fec_packets(filename, packets, rates=None):
""" writes packets in binary format """

assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2

# derive some sizes
num_packets = len(packets)
subframes_per_packet = packets[0].shape[-2]
num_features = packets[0].shape[-1]

# size of float is 4
subframe_size = num_features * 4
packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate

version = 1
# header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features)
header_size = 14

with open(filename, 'wb') as f:

# header
f.write(np.int16(version).tobytes())
f.write(np.int16(header_size).tobytes())

@@ -60,28 +60,28 @@ def write_fec_packets(filename, packets, rates=None):
f.write(np.int16(subframe_size).tobytes())
f.write(np.int16(subframes_per_packet).tobytes())
f.write(np.int16(num_features).tobytes())

# packets
for i, packet in enumerate(packets):
if type(rates) == type(None):
rate = 0
else:
rate = rates[i]

f.write(np.int16(rate).tobytes())

features = np.flip(packet, axis=-2)
f.write(features.astype(np.float32).tobytes())


def read_fec_packets(filename):
""" reads packets from binary format """

assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2

with open(filename, 'rb') as f:

# header
version = np.frombuffer(f.read(2), dtype=np.int16).item()
header_size = np.frombuffer(f.read(2), dtype=np.int16).item()

@@ -90,19 +90,19 @@ def read_fec_packets(filename):
subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
num_features = np.frombuffer(f.read(2), dtype=np.int16).item()

dummy_features = np.zeros((subframes_per_packet, num_features), dtype=np.float32)

# packets
rates = []
packets = []
for i in range(num_packets):

rate = np.frombuffer(f.read(2), dtype=np.int16).item
rates.append(rate)

features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
packet = np.flip(features, axis=-2)
packets.append(packet)

return packets
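The sizes used by write_fec_packets() follow directly from the dtypes: the header is seven int16 fields (14 bytes) and each packet is one int16 rate followed by subframes_per_packet x num_features float32 values. A quick check of that arithmetic with illustrative dimensions:

```python
import numpy as np

num_features = 20            # illustrative; LPCNet-style feature vectors
subframes_per_packet = 4     # illustrative

subframe_size = num_features * np.dtype(np.float32).itemsize   # 80 bytes per subframe
packet_size = subframe_size * subframes_per_packet + 2         # +2 bytes for the int16 rate
header_size = 7 * np.dtype(np.int16).itemsize                  # 14 bytes, matching the code

print(subframe_size, packet_size, header_size)                 # 80 322 14
```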
@@ -40,7 +40,7 @@ class RDOVAEDataset(torch.utils.data.Dataset):
lambda_max=0.0135,
quant_levels=16,
enc_stride=2):

self.sequence_length = sequence_length
self.lambda_min = lambda_min
self.lambda_max = lambda_max

@@ -50,7 +50,7 @@ class RDOVAEDataset(torch.utils.data.Dataset):

if sequence_length % enc_stride:
raise ValueError(f"RDOVAEDataset.__init__: enc_stride {enc_stride} does not divide sequence length {sequence_length}")

self.features = np.reshape(np.fromfile(feature_file, dtype=np.float32), (-1, num_features))
self.features = self.features[:, :num_used_features]
self.num_sequences = self.features.shape[0] // sequence_length

@@ -65,4 +65,3 @@ class RDOVAEDataset(torch.utils.data.Dataset):
rate_lambda = self.lambda_min * np.exp(q_ids.astype(np.float32) / self.denominator).astype(np.float32)

return features, rate_lambda, q_ids
@@ -42,7 +42,7 @@ def soft_pvq(x, k):

# L2 normalization
x_norm2 = x / (1e-15 + torch.norm(x, dim=-1, keepdim=True))

with torch.no_grad():
# quantization loop, no need to track gradients here

@@ -84,19 +84,19 @@ def cache_parameters(func):
return cache[args]
else:
cache[args] = func(*args)

return cache[args]
return cached_func

@cache_parameters
def pvq_codebook_size(n, k):

if k == 0:
return 1

if n == 0:
return 0

return pvq_codebook_size(n - 1, k) + pvq_codebook_size(n, k - 1) + pvq_codebook_size(n - 1, k - 1)
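pvq_codebook_size() above counts PVQ codewords with the recursion N(n, k) = N(n-1, k) + N(n, k-1) + N(n-1, k-1), N(n, 0) = 1, N(0, k) = 0; encode() further down takes log2 of this count to get the initial-state size in bits. A self-contained sketch in which functools.lru_cache stands in for the cache_parameters decorator, with illustrative dimensions:

```python
import math as m
from functools import lru_cache

@lru_cache(maxsize=None)
def pvq_codebook_size(n, k):
    # same base cases and recursion as in the hunk above
    if k == 0:
        return 1
    if n == 0:
        return 0
    return (pvq_codebook_size(n - 1, k)
            + pvq_codebook_size(n, k - 1)
            + pvq_codebook_size(n - 1, k - 1))

# e.g. a 24-dimensional state quantized with 82 pulses (illustrative numbers)
state_dim, pvq_num_pulses = 24, 82
state_size_bits = m.log2(pvq_codebook_size(state_dim, pvq_num_pulses))
print(round(state_size_bits, 1))
```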
@@ -121,7 +121,7 @@ def hard_rate_estimate(z, r, theta, reduce=True):
p0 = 1 - r ** (0.5 + 0.5 * theta)
alpha = torch.relu(1 - torch.abs(z_q)) ** 2
rate = - torch.sum(
(alpha * torch.log2(p0 * r ** torch.abs(z_q) + 1e-6)
+ (1 - alpha) * torch.log2(0.5 * (1 - p0) * (1 - r) * r ** (torch.abs(z_q) - 1) + 1e-6)),
dim=-1
)

@@ -154,7 +154,7 @@ def noise_quantize(x):

def distortion_loss(y_true, y_pred, rate_lambda=None):
""" custom distortion loss for LPCNet features """

if y_true.size(-1) != 20:
raise ValueError('distortion loss is designed to work with 20 features')

@@ -169,7 +169,7 @@ def distortion_loss(y_true, y_pred, rate_lambda=None):
loss = loss / torch.sqrt(rate_lambda)

loss = torch.mean(loss)

return loss
@@ -181,23 +181,23 @@ import random
def random_split(start, stop, num_splits=3, min_len=3):
get_min_len = lambda x : min([x[i+1] - x[i] for i in range(len(x) - 1)])
candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]

while get_min_len(candidate) < min_len:
candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]

return candidate


# weight initialization and clipping
def init_weights(module):

if isinstance(module, nn.GRU):
for p in module.named_parameters():
if p[0].startswith('weight_hh_'):
nn.init.orthogonal_(p[1])

def weight_clip_factory(max_value):
""" weight clipping function concerning sum of abs values of adjecent weights """
def clip_weight_(w):

@@ -213,13 +213,13 @@ def weight_clip_factory(max_value):
1))
with torch.no_grad():
w[:, :stop] *= factor

def clip_weights(module):
if isinstance(module, nn.GRU) or isinstance(module, nn.Linear):
for name, w in module.named_parameters():
if name.startswith('weight'):
clip_weight_(w)

return clip_weights

# RDOVAE module and submodules
@@ -229,12 +229,12 @@ class CoreEncoder(nn.Module):
STATE_HIDDEN = 128
FRAMES_PER_STEP = 2
CONV_KERNEL_SIZE = 4

def __init__(self, feature_dim, output_dim, cond_size, cond_size2, state_size=24):
""" core encoder for RDOVAE

Computes latents, initial states, and rate estimates from features and lambda parameter

"""

super(CoreEncoder, self).__init__()

@@ -289,7 +289,7 @@ class CoreEncoder(nn.Module):

# concatenation of all hidden layer outputs
x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)

# init state for decoder
states = torch.tanh(self.state_dense_1(x9))
states = torch.tanh(self.state_dense_2(states))

@@ -309,9 +309,9 @@ class CoreDecoder(nn.Module):

def __init__(self, input_dim, output_dim, cond_size, cond_size2, state_size=24):
""" core decoder for RDOVAE

Computes features from latents, initial state, and quantization index

"""

super(CoreDecoder, self).__init__()
@@ -324,7 +324,7 @@ class CoreDecoder(nn.Module):
self.state_size = state_size

self.input_size = self.input_dim

self.concat_size = 4 * self.cond_size + 4 * self.cond_size2

# layers

@@ -348,7 +348,7 @@ class CoreDecoder(nn.Module):
self.apply(init_weights)

def forward(self, z, initial_state):

gru_1_state = torch.tanh(self.gru_1_init(initial_state).permute(1, 0, 2))
gru_2_state = torch.tanh(self.gru_2_init(initial_state).permute(1, 0, 2))
gru_3_state = torch.tanh(self.gru_3_init(initial_state).permute(1, 0, 2))

@@ -374,9 +374,9 @@ class CoreDecoder(nn.Module):
class StatisticalModel(nn.Module):
def __init__(self, quant_levels, latent_dim):
""" Statistical model for latent space

Computes scaling, deadzone, r, and theta

"""

super(StatisticalModel, self).__init__()

@@ -388,7 +388,7 @@ class StatisticalModel(nn.Module):

# quantization embedding
self.quant_embedding = nn.Embedding(quant_levels, self.embedding_dim)

# initialize embedding to 0
with torch.no_grad():
self.quant_embedding.weight[:] = 0

@@ -406,7 +406,7 @@ class StatisticalModel(nn.Module):
r_soft = torch.sigmoid(x[..., 3 * self.latent_dim : 4 * self.latent_dim])
theta_hard = torch.sigmoid(x[..., 4 * self.latent_dim : 5 * self.latent_dim])
r_hard = torch.sigmoid(x[..., 5 * self.latent_dim : 6 * self.latent_dim])

return {
'quant_embedding' : x,
@@ -443,34 +443,34 @@ class RDOVAE(nn.Module):
self.state_dim = state_dim
self.pvq_num_pulses = pvq_num_pulses
self.state_dropout_rate = state_dropout_rate

# submodules encoder and decoder share the statistical model
self.statistical_model = StatisticalModel(quant_levels, latent_dim)
self.core_encoder = nn.DataParallel(CoreEncoder(feature_dim, latent_dim, cond_size, cond_size2, state_size=state_dim))
self.core_decoder = nn.DataParallel(CoreDecoder(latent_dim, feature_dim, cond_size, cond_size2, state_size=state_dim))

self.enc_stride = CoreEncoder.FRAMES_PER_STEP
self.dec_stride = CoreDecoder.FRAMES_PER_STEP

if clip_weights:
self.weight_clip_fn = weight_clip_factory(0.496)
else:
self.weight_clip_fn = None

if self.dec_stride % self.enc_stride != 0:
raise ValueError(f"get_decoder_chunks_generic: encoder stride does not divide decoder stride")

def clip_weights(self):
if not type(self.weight_clip_fn) == type(None):
self.apply(self.weight_clip_fn)

def get_decoder_chunks(self, z_frames, mode='split', chunks_per_offset = 4):

enc_stride = self.enc_stride
dec_stride = self.dec_stride

stride = dec_stride // enc_stride

chunks = []

for offset in range(stride):

@@ -529,7 +529,7 @@ class RDOVAE(nn.Module):
z_q = hard_quantize(z) / statistical_model['quant_scale']
z_n = noise_quantize(z) / statistical_model['quant_scale']
states_q = soft_pvq(states, self.pvq_num_pulses)

if self.state_dropout_rate > 0:
drop = torch.rand(states_q.size(0)) < self.state_dropout_rate
mask = torch.ones_like(states_q)

@@ -552,7 +552,7 @@ class RDOVAE(nn.Module):
# decoder with soft quantized input
z_dec_reverse = torch.flip(z_n[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
outputs_sq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))

return {
'outputs_hard_quant' : outputs_hq,
@@ -563,24 +563,24 @@ class RDOVAE(nn.Module):

def encode(self, features):
""" encoder with quantization and rate estimation """

z, states = self.core_encoder(features)

# quantization of initial states
states = soft_pvq(states, self.pvq_num_pulses)
state_size = m.log2(pvq_codebook_size(self.state_dim, self.pvq_num_pulses))

return z, states, state_size

def decode(self, z, initial_state):
""" decoder (flips sequences by itself) """

z_reverse = torch.flip(z, [1])
features_reverse = self.core_decoder(z_reverse, initial_state)
features = torch.flip(features_reverse, [1])

return features

def quantize(self, z, q_ids):
""" quantization of latent vectors """

@@ -602,13 +602,12 @@ class RDOVAE(nn.Module):
z = zq / stats['quant_scale']

return z

def freeze_model(self):

# freeze all parameters
for p in self.parameters():
p.requires_grad = False

for p in self.statistical_model.parameters():
p.requires_grad = True
@@ -89,7 +89,7 @@ adam_eps = 1e-8

checkpoint['batch_size'] = batch_size
checkpoint['lr'] = lr
checkpoint['lr_decay_factor'] = lr_decay_factor
checkpoint['split_mode'] = split_mode
checkpoint['epochs'] = epochs
checkpoint['sequence_length'] = sequence_length

@@ -130,10 +130,10 @@ checkpoint['state_dict'] = model.state_dict()
if args.train_decoder_only:
if args.initial_checkpoint is None:
print("warning: training decoder only without providing initial checkpoint")

for p in model.core_encoder.module.parameters():
p.requires_grad = False

for p in model.statistical_model.parameters():
p.requires_grad = False

@@ -180,15 +180,15 @@ if __name__ == '__main__':

# zero out gradients
optimizer.zero_grad()

# push inputs to device
features = features.to(device)
q_ids = q_ids.to(device)
rate_lambda = rate_lambda.to(device)

rate_lambda_upsamp = torch.repeat_interleave(rate_lambda, 2, 1)

# run model
model_output = model(features, q_ids)

@@ -224,17 +224,17 @@ if __name__ == '__main__':

# total loss
total_loss = rate_loss + (distortion_loss_hard_quant + distortion_loss_soft_quant) / 2

if args.enable_first_frame_loss:
total_loss = total_loss + 0.5 * torch.relu(first_frame_loss - distortion_loss_hard_quant)

total_loss.backward()

optimizer.step()

model.clip_weights()

scheduler.step()

# collect running stats
@@ -3,7 +3,7 @@ Modification of Tensorflow's Embedding Layer:
1. Not restricted to be the first layer of a model
2. Differentiable (allows non-integer lookups)
- For non integer lookup, this layer linearly interpolates between the adjacent embeddings in the following way to preserver gradient flow
- E = (1 - frac(x))*embed(floor(x)) + frac(x)*embed(ceil(x))
"""

import tensorflow as tf

@@ -26,13 +26,13 @@ class diff_Embed(Layer):
self.pcm_init = pcm_init
self.initializer = initializer

def build(self, input_shape):
w_init = tf.random_normal_initializer()
if self.pcm_init:
w_init = self.initializer
self.w = tf.Variable(initial_value=w_init(shape=(self.dict_size, self.units),dtype='float32'),trainable=True)

def call(self, inputs):
alpha = inputs - tf.math.floor(inputs)
alpha = tf.expand_dims(alpha,axis = -1)
alpha = tf.tile(alpha,[1,1,1,self.units])
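The docstring of this file states the rule that call() implements: E = (1 - frac(x))*embed(floor(x)) + frac(x)*embed(ceil(x)), a linear interpolation between adjacent rows that keeps non-integer lookups differentiable. A small NumPy sketch of that rule with a random stand-in table:

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((256, 16)).astype(np.float32)   # dict_size x units, illustrative

def diff_embed(x):
    """Fractional embedding lookup: linear interpolation between adjacent rows."""
    lo = np.floor(x).astype(int)
    hi = np.ceil(x).astype(int)
    frac = (x - lo)[..., None]
    return (1.0 - frac) * W[lo] + frac * W[hi]

e = diff_embed(np.array([3.0, 3.25, 3.75]))   # blends rows 3 and 4
```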
@@ -309,13 +309,13 @@ if __name__ == "__main__":
else:
hf.write('/* This is *not* an end-to-end model */\n')
hf.write('/* #define END2END */\n\n')

# LPC weighting factor
if type(args.lpc_gamma) == type(None):
lpc_gamma = get_parameter(model, 'lpc_gamma', 1)
else:
lpc_gamma = args.lpc_gamma

hf.write('/* LPC weighting factor */\n')
hf.write('#define LPC_GAMMA ' + str(lpc_gamma) +'f\n\n')

@@ -376,7 +376,7 @@ if __name__ == "__main__":

hf.write('typedef struct {\n')
for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
hf.write('} NNetState;\n\n')

model_struct.write('} LPCNetModel;\n\n')
@@ -283,7 +283,7 @@ hf.write('#define PLC_MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))

hf.write('typedef struct {\n')
for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
hf.write('} PLCNetState;\n\n')

model_struct.write('} PLCModel;\n\n')
@@ -173,7 +173,7 @@ f"""
[
dump_conv1d_layer(encoder.get_layer(name), source_fid, header_fid)
for name in encoder_conv1d_names
]
)

# dump Dense layers

@@ -232,13 +232,13 @@ f"""
'dec_dense7',
'dec_dense8',
'dec_final'
]

decoder_gru_names = [
'dec_dense2',
'dec_dense4',
'dec_dense6'
]

source_fid = open("dred_rdovae_dec_data.c", 'w')
header_fid = open("dred_rdovae_dec_data.h", 'w')
|
@ -97,7 +97,7 @@ total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
|
||||||
# load signal
|
# load signal
|
||||||
if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'):
|
if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'):
|
||||||
signal = np.fromfile(args.input, dtype='int16')
|
signal = np.fromfile(args.input, dtype='int16')
|
||||||
|
|
||||||
elif args.input.endswith('.wav'):
|
elif args.input.endswith('.wav'):
|
||||||
fs, signal = wavfile.read(args.input)
|
fs, signal = wavfile.read(args.input)
|
||||||
else:
|
else:
|
||||||
|
@ -107,7 +107,7 @@ else:
|
||||||
padded_signal_length = len(signal) + total_delay
|
padded_signal_length = len(signal) + total_delay
|
||||||
tail = padded_signal_length % frame_size
|
tail = padded_signal_length % frame_size
|
||||||
right_padding = (frame_size - tail) % frame_size
|
right_padding = (frame_size - tail) % frame_size
|
||||||
|
|
||||||
signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
|
signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
|
||||||
|
|
||||||
padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
|
padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
|
||||||
|
@ -228,7 +228,7 @@ if args.lossfile != None:
|
||||||
fec_out_full[:, :nb_used_features] = fec_out
|
fec_out_full[:, :nb_used_features] = fec_out
|
||||||
|
|
||||||
fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
|
fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
|
||||||
|
|
||||||
|
|
||||||
#create packets array like in the original version for debugging purposes
|
#create packets array like in the original version for debugging purposes
|
||||||
for i in range(offset, num_frames):
|
for i in range(offset, num_frames):
|
||||||
|
@ -254,4 +254,3 @@ if args.debug_output:
|
||||||
|
|
||||||
print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}")
|
print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}")
|
||||||
test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32')
|
test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32')
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,7 @@ int get_fec_frame(const char * const filename, float *features, int packet_index
long offset;

FILE *fid = fopen(filename, "rb");

/* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;

@ -88,7 +88,7 @@ int get_fec_rate(const char * const filename, int packet_index)
int16_t rate;

FILE *fid = fopen(filename, "rb");

/* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
@ -33,25 +33,25 @@ import numpy as np

def write_fec_packets(filename, packets, rates=None):
""" writes packets in binary format """

assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2

# derive some sizes
num_packets = len(packets)
subframes_per_packet = packets[0].shape[-2]
num_features = packets[0].shape[-1]

# size of float is 4
subframe_size = num_features * 4
packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate

version = 1
# header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features)
header_size = 14

with open(filename, 'wb') as f:

# header
f.write(np.int16(version).tobytes())
f.write(np.int16(header_size).tobytes())

@ -60,28 +60,28 @@ def write_fec_packets(filename, packets, rates=None):
f.write(np.int16(subframe_size).tobytes())
f.write(np.int16(subframes_per_packet).tobytes())
f.write(np.int16(num_features).tobytes())

# packets
for i, packet in enumerate(packets):
if type(rates) == type(None):
rate = 0
else:
rate = rates[i]

f.write(np.int16(rate).tobytes())

features = np.flip(packet, axis=-2)
f.write(features.astype(np.float32).tobytes())


def read_fec_packets(filename):
""" reads packets from binary format """

assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2

with open(filename, 'rb') as f:

# header
version = np.frombuffer(f.read(2), dtype=np.int16).item()
header_size = np.frombuffer(f.read(2), dtype=np.int16).item()

@ -90,19 +90,19 @@ def read_fec_packets(filename):
subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
num_features = np.frombuffer(f.read(2), dtype=np.int16).item()

dummy_features = np.zeros((1, subframes_per_packet, num_features), dtype=np.float32)

# packets
rates = []
packets = []
for i in range(num_packets):

rate = np.frombuffer(f.read(2), dtype=np.int16).item
rates.append(rate)

features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
packet = np.flip(features, axis=-2)
packets.append(packet)

return packets
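For orientation (an illustrative sketch, not code from this repository): write_fec_packets above produces a 14-byte header of seven int16 fields, in the order given by the header-size comment, followed by one int16 rate plus subframes_per_packet * num_features float32 values per packet. Assuming a little-endian host (the writer uses NumPy's native byte order), just the header can be parsed with the standard struct module:

```python
import struct

def read_fec_header(filename):
    """Parse the 14-byte header written by write_fec_packets: seven int16 fields."""
    keys = ('version', 'header_size', 'num_packets', 'packet_size',
            'subframe_size', 'subframes_per_packet', 'num_features')
    with open(filename, 'rb') as f:
        values = struct.unpack('<7h', f.read(14))  # '<' assumes little-endian storage
    return dict(zip(keys, values))
```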
@ -35,7 +35,7 @@ def interp_mulaw(gamma = 1):
alpha = e_gt - tf.math.floor(e_gt)
alpha = tf.tile(alpha,[1,1,256])
e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,254)
interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
loss_mod = sparse_cel + prob_compensation + gamma*regularization
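A note for readers (not part of the commit): interp_mulaw evaluates the cross-entropy against a distribution interpolated between the two integer mu-law bins surrounding the real-valued excitation target $e$, i.e. at index $\lfloor e\rfloor$ it effectively uses

$$p_{\text{interp}}(\lfloor e\rfloor) = (1-\alpha)\,p(\lfloor e\rfloor) + \alpha\,p(\lfloor e\rfloor + 1), \qquad \alpha = e - \lfloor e\rfloor,$$

and the clip to $[0, 254]$ keeps the $+1$ neighbour in range.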
@ -51,7 +51,7 @@ def metric_oginterploss(y_true,y_pred):
alpha = e_gt - tf.math.floor(e_gt)
alpha = tf.tile(alpha,[1,1,256])
e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,254)
interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
loss_mod = sparse_cel + prob_compensation

@ -78,7 +78,7 @@ def metric_cel(y_true, y_pred):
e_gt = tf_l2u(y_true - p)
e_gt = tf.round(e_gt)
e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,255)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out)
return sparse_cel

@ -97,4 +97,3 @@ def loss_matchlar():
loss_lar_diff = tf.square(loss_lar_diff)
return tf.reduce_mean(loss_lar_diff, axis=-1)
return loss
@ -186,7 +186,7 @@ class SparsifyGRUB(Callback):

w[0] = p
layer.set_weights(w)


class PCMInit(Initializer):
def __init__(self, gain=.1, seed=None):
@ -264,20 +264,20 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat)
else:
lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size)

real_preds = diff_pred(name = "real_lpc2preds")([pcm,lpcoeffs])
weighting = lpc_gamma ** np.arange(1, 17).astype('float32')
weighted_lpcoeffs = Lambda(lambda x: x[0]*x[1])([lpcoeffs, weighting])
tensor_preds = diff_pred(name = "lpc2preds")([pcm,weighted_lpcoeffs])
past_errors = error_calc([pcm,tensor_preds])

embed = diff_Embed(name='embed_sig',initializer = PCMInit())
cpcm = Concatenate()([tf_l2u(pcm),tf_l2u(tensor_preds),past_errors])
cpcm = GaussianNoise(.3)(cpcm)
cpcm = Reshape((-1, embed_size*3))(embed(cpcm))
cpcm_decoder = Reshape((-1, embed_size*3))(embed(dpcm))


rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))

quant = quant_regularizer if quantize else None
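A note for readers (not part of the commit): with LPC_GAMMA $= \gamma$, the weighting above applies the standard bandwidth-expansion substitution $A(z) \rightarrow A(z/\gamma)$, that is

$$a_i' = \gamma^{\,i} a_i, \qquad i = 1,\dots,16,$$

which pulls the poles of the synthesis filter $1/A(z)$ toward the origin and makes the prediction less peaky.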
@ -305,7 +305,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
rnn2.trainable=False
md.trainable=False
embed.Trainable=False

m_out = Concatenate(name='pdf')([tensor_preds,real_preds,ulaw_prob])
if not flag_e2e:
model = Model([pcm, feat, pitch, lpcoeffs], m_out)

@ -315,7 +315,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
model.rnn_units2 = rnn_units2
model.nb_used_features = nb_used_features
model.frame_size = frame_size

if not flag_e2e:
encoder = Model([feat, pitch], cfeat)
dec_rnn_in = Concatenate()([cpcm_decoder, dec_feat])

@ -330,7 +330,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
else:
decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])

# add parameters to model
set_parameter(model, 'lpc_gamma', lpc_gamma, dtype='float64')
set_parameter(model, 'flag_e2e', flag_e2e, dtype='bool')
@ -88,10 +88,10 @@ def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36
gru_out1, _ = rnn(cfeat)
gru_out1 = GaussianNoise(.005)(gru_out1)
gru_out2, _ = rnn2(gru_out1)

out_dense = Dense(nb_used_features, activation='linear', name='plc_out')
plc_out = out_dense(gru_out2)

model = Model([feat, lost], plc_out)
model.rnn_units = rnn_units
model.cond_size = cond_size
@ -6,7 +6,7 @@ import numpy as np
import math

class MDense(Layer):

def __init__(self, outputs,
channels=2,
activation=None,
@ -5,9 +5,9 @@ import tensorflow as tf

def set_parameter(model, parameter_name, parameter_value, dtype='float32'):
""" stores parameter_value as non-trainable weight with name parameter_name:0 """

weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")]

if len(weights) == 0:
model.add_weight(parameter_name, trainable=False, initializer=tf.keras.initializers.Constant(parameter_value), dtype=dtype)
elif len(weights) == 1:

@ -15,14 +15,14 @@ def set_parameter(model, parameter_name, parameter_value, dtype='float32'):
else:
raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")


def get_parameter(model, parameter_name, default=None):
""" returns parameter value if parameter is present in model and otherwise default """

weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")]

if len(weights) == 0:
return default
elif len(weights) > 1:
raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")
else:
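For orientation (an illustrative sketch with a hypothetical toy model, not code from this repository): the pair above stashes scalar hyper-parameters such as lpc_gamma as named non-trainable weights so they round-trip through model.save(). Weight naming and add_weight behaviour can vary across TF/Keras versions; the sketch simply mirrors the calls used in the diff.

```python
import tensorflow as tf

# Hypothetical toy model just to host the parameter.
model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])

# Store: same call pattern as set_parameter() above.
model.add_weight('lpc_gamma', trainable=False,
                 initializer=tf.keras.initializers.Constant(0.9), dtype='float32')

# Read back: same lookup as get_parameter() above.
lpc_gamma = [w for w in model.weights if w.name == 'lpc_gamma:0'][0].numpy()
```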
@ -56,7 +56,7 @@ class PLCLoader(Sequence):
lost_mask = np.tile(lost, (1,1,features.shape[2]))
in_features = features*lost_mask
in_features[:,:,:self.nb_burg_features] = in_features[:,:,:self.nb_burg_features]*burg_mask

#For the first frame after a loss, we don't have valid features, but the Burg estimate is valid.
#in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
out_lost = np.copy(lost)
@ -61,7 +61,7 @@ def soft_quantize(x):
#x = 4*x
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
return x

def noise_quantize(x):

@ -237,7 +237,7 @@ def new_rdovae_decoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, ba
bits_input = Input(shape=(None, nb_bits), batch_size=batch_size, name="dec_bits")
gru_state_input = Input(shape=(nb_state_dim,), batch_size=batch_size, name="dec_state")


gru = CuDNNGRU if training else GRU
dec_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense1')
dec_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense2')

@ -300,7 +300,7 @@ def tensor_concat(x):
y = []
for i in range(n-1):
offset = 2 * (n-1-i)
tmp = K.concatenate([x[i][:, offset:, :], x[-1][:, -offset:, :]], axis=-2)
y.append(tf.expand_dims(tmp, axis=0))
y.append(tf.expand_dims(x[-1], axis=0))
return Concatenate(axis=0)(y)

@ -335,7 +335,7 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
dze = dzone([ze,dead_zone])
ndze = noisequant(dze)
dze_quant = hardquant(dze)

div = Lambda(lambda x: x[0]/x[1])
dze_quant = div([dze_quant,quant_scale])
ndze_unquant = div([ndze,quant_scale])
@ -355,13 +355,13 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
combined_output.append(tmp)

tmp = split_decoder([ndze_select, state_select])
tmp = cat([tmp, lambda_up])
unquantized_output.append(tmp)

concat = Lambda(tensor_concat, name="output")
combined_output = concat(combined_output)
unquantized_output = concat(unquantized_output)

e2 = Concatenate(name="hard_bits")([dze, hard_distr_embed, lambda_val])
e = Concatenate(name="soft_bits")([dze, soft_distr_embed, lambda_val])

@ -370,4 +370,3 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
model.nb_used_features = nb_used_features

return model, encoder, decoder, qembedding
@ -113,7 +113,7 @@ if __name__ == "__main__":
# qembedding
print(f"writing layer {exchange_name['qembedding']}...")
dump_tf_weights(os.path.join(args.output, exchange_name['qembedding']), qembedding)

# decoder
decoder_dense_names = [
'state1',

@ -125,7 +125,7 @@ if __name__ == "__main__":
'dec_dense7',
'dec_dense8',
'dec_final'
]

decoder_gru_names = [
'dec_dense2',

@ -79,7 +79,7 @@ exchange_name = {
if __name__ == "__main__":

model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)

encoder_layers = [
'enc_dense1',
'enc_dense3',
@ -93,7 +93,7 @@ if __name__ == "__main__":
'enc_dense6',
'bits_dense'
]

decoder_layers = [
'state1',
'state2',

@ -108,16 +108,16 @@ if __name__ == "__main__":
'dec_dense4',
'dec_dense6'
]

for name in encoder_layers:
print(f"loading weight for layer {name}...")
load_tf_weights(os.path.join(args.input, exchange_name[name]), encoder.get_layer(name))

print(f"loading weight for layer qembedding...")
load_tf_weights(os.path.join(args.input, exchange_name['qembedding']), qembedding)

for name in decoder_layers:
print(f"loading weight for layer {name}...")
load_tf_weights(os.path.join(args.input, exchange_name[name]), decoder.get_layer(name))

model.save(args.weights)
@ -118,5 +118,3 @@ for c in range(0, nb_frames):
#print(mem)
np.array([np.round(mem)], dtype='int16').tofile(fout)
skip = 0
@ -36,12 +36,12 @@ class diff_pred(Layer):
rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1))
zpX = Lambda(lambda x: K.concatenate([0*x[:,0:lpcoeffs_N,:], x],axis = 1))
cX = Lambda(lambda x: K.concatenate([x[:,(lpcoeffs_N - i):(lpcoeffs_N - i + 2400),:] for i in range(lpcoeffs_N)],axis = 2))

pred = -Multiply()([rept(lpc),cX(zpX(xt))])

return K.sum(pred,axis = 2,keepdims = True)

# Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion
class diff_rc2lpc(Layer):
def call(self, inputs, lpcoeffs_N = 16):
def pred_lpc_recursive(input):
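A note for readers (not part of the commit): diff_pred builds zero-padded, delayed copies of the signal and multiplies them with the frame-rate-repeated LPC coefficients, so the layer computes the ordinary linear prediction

$$\hat{x}_t = -\sum_{i=1}^{N} a_i\, x_{t-i}, \qquad N = \texttt{lpcoeffs\_N} = 16,$$

using only differentiable Keras ops, which is what lets the end-to-end (flag_e2e) variants backpropagate through the LPC step.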
@ -134,7 +134,7 @@ strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

with strategy.scope():
model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size,
rnn_units2=args.grub_size,
batch_size=batch_size, training=True,
quantize=quantize,
flag_e2e=flag_e2e,
@ -200,14 +200,14 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
for (i=0;i<rows;i+=16)
{
float * restrict y = &out[i];

/* keep y[0..15] in registers for duration of inner loop */

float32x4_t y0_3 = vld1q_f32(&y[0]);
float32x4_t y4_7 = vld1q_f32(&y[4]);
float32x4_t y8_11 = vld1q_f32(&y[8]);
float32x4_t y12_15 = vld1q_f32(&y[12]);

for (j=0;j<cols;j++)
{
const float * restrict w;

@ -219,9 +219,9 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
wvec4_7 = vld1q_f32(&w[4]);
wvec8_11 = vld1q_f32(&w[8]);
wvec12_15 = vld1q_f32(&w[12]);

xj = vld1q_dup_f32(&x[j]);

y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);

@ -229,12 +229,12 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
}

/* save y[0..15] back to memory */

vst1q_f32(&y[0], y0_3);
vst1q_f32(&y[4], y4_7);
vst1q_f32(&y[8], y8_11);
vst1q_f32(&y[12], y12_15);

}
}

@ -249,32 +249,32 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
y = &out[i];

/* keep y[0..15] in registers for duration of inner loop */

float32x4_t y0_3 = vld1q_f32(&y[0]);
float32x4_t y4_7 = vld1q_f32(&y[4]);
float32x4_t y8_11 = vld1q_f32(&y[8]);
float32x4_t y12_15 = vld1q_f32(&y[12]);

for (j=0;j<cols;j++)
{
float32x4_t xj= vld1q_dup_f32(&x[*idx++]);
float32x4_t wvec;

wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_f32(y0_3, wvec, xj);
wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_f32(y4_7, wvec, xj);
wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_f32(y8_11, wvec, xj);
wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_f32(y12_15, wvec, xj);

w += 16;
}

/* save y[0..15] back to memory */

vst1q_f32(&y[0], y0_3);
vst1q_f32(&y[4], y4_7);
vst1q_f32(&y[8], y8_11);
vst1q_f32(&y[12], y12_15);

}
}
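For reference (an illustrative sketch, not part of the diff): both NEON kernels accumulate a matrix-vector product into out in blocks of 16 output rows, keeping the 16 partial sums in four float32x4_t registers; the sparse variant only visits the input columns listed in idx for each block and reads its 16-float weight runs back to back (w += 16). Ignoring the exact in-memory weight layout, which the elided lines define, the dense kernel computes:

```python
import numpy as np

def sgemv_accum16_ref(out, W, x):
    """Plain reference for what sgemv_accum16 accumulates:
    out[i:i+16] += W[i:i+16, :] @ x, over blocks of 16 rows."""
    rows, cols = W.shape  # rows assumed to be a multiple of 16
    for i in range(0, rows, 16):
        for j in range(cols):
            out[i:i + 16] += W[i:i + 16, j] * x[j]
    return out
```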