Remove trailing whitespace in dnn

This commit is contained in:
Marcus Asteborg 2023-06-22 05:27:54 -07:00
parent 26ab10d0c8
commit f36685fc97
No known key found for this signature in database
GPG key ID: F69798291D4CE42A
37 changed files with 231 additions and 246 deletions

View file

@ -115,7 +115,7 @@ This codebase is also meant for research and it is possible to train new models.
and move the generated nnet\_data.\* files to the src/ directory. and move the generated nnet\_data.\* files to the src/ directory.
Then you just need to rebuild the software and use lpcnet\_demo as explained above. Then you just need to rebuild the software and use lpcnet\_demo as explained above.
# Speech Material for Training # Speech Material for Training
Suitable training material can be obtained from [Open Speech and Language Resources](https://www.openslr.org/). See the datasets.txt file for details on suitable training data. Suitable training material can be obtained from [Open Speech and Language Resources](https://www.openslr.org/). See the datasets.txt file for details on suitable training data.
@ -123,5 +123,4 @@ Suitable training material can be obtained from [Open Speech and Language Resour
1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/) 1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/) 1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/)
1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/ 1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/

View file

@ -171,4 +171,3 @@ The corresponding citations for all these datasets are:
journal={arXiv preprint arXiv:2104.01497}, journal={arXiv preprint arXiv:2104.01497},
year={2021} year={2021}
} }

View file

@ -9,4 +9,3 @@ if not exist %model% (
tar -xvzf %model% tar -xvzf %model%
move .\src\*.c . move .\src\*.c .
move .\src\*.h . move .\src\*.h .

View file

@ -98,7 +98,7 @@ void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *f
/* Simulate error on excitation. */ /* Simulate error on excitation. */
e += noise[k*FRAME_SIZE+i]; e += noise[k*FRAME_SIZE+i];
e = IMIN(255, IMAX(0, e)); e = IMIN(255, IMAX(0, e));
RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1); RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
st->sig_mem[0] = p + ulaw2lin(e); st->sig_mem[0] = p + ulaw2lin(e);
st->exc_mem = e; st->exc_mem = e;
@ -241,7 +241,7 @@ int main(int argc, char **argv) {
if (fpcm) { if (fpcm) {
compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std); compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std);
} }
process_single_frame(st, ffeat); process_single_frame(st, ffeat);
if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1); if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);
st->pcount++; st->pcount++;
@ -260,4 +260,3 @@ int main(int argc, char **argv) {
lpcnet_encoder_destroy(st); lpcnet_encoder_destroy(st);
return 0; return 0;
} }

View file

@ -326,4 +326,3 @@ void apply_window(float *x) {
x[WINDOW_SIZE - 1 - i] *= half_window[i]; x[WINDOW_SIZE - 1 - i] *= half_window[i];
} }
} }

View file

@ -61,7 +61,7 @@ void rc2lpc(float *lpc, const float *rc)
float ntmp[LPC_ORDER] = {0.0}; float ntmp[LPC_ORDER] = {0.0};
RNN_COPY(tmp, rc, LPC_ORDER); RNN_COPY(tmp, rc, LPC_ORDER);
for(i = 0; i < LPC_ORDER ; i++) for(i = 0; i < LPC_ORDER ; i++)
{ {
for(j = 0; j <= i-1; j++) for(j = 0; j <= i-1; j++)
{ {
ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1]; ntmp[j] = tmp[j] + tmp[i]*tmp[i - j - 1];
@ -106,7 +106,7 @@ void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b
_lpcnet_compute_dense(&lpcnet->model.gru_b_dense_feature, gru_b_condition, condition); _lpcnet_compute_dense(&lpcnet->model.gru_b_dense_feature, gru_b_condition, condition);
#ifdef END2END #ifdef END2END
rc2lpc(lpc, rc); rc2lpc(lpc, rc);
#elif FEATURES_DELAY>0 #elif FEATURES_DELAY>0
memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0])); memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0])); memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
lpc_from_cepstrum(lpcnet->old_lpc[0], features); lpc_from_cepstrum(lpcnet->old_lpc[0], features);

View file

@ -170,7 +170,7 @@ int sample_mdense(const MDenseLayer *layer, const float *input, const float *sam
C = layer->nb_channels; C = layer->nb_channels;
celt_assert(N*C <= MAX_MDENSE_TMP); celt_assert(N*C <= MAX_MDENSE_TMP);
stride = M*C; stride = M*C;
celt_assert(N <= DUAL_FC_OUT_SIZE); celt_assert(N <= DUAL_FC_OUT_SIZE);
/* Computing all the random thresholds in advance. These thresholds are directly /* Computing all the random thresholds in advance. These thresholds are directly
@ -188,7 +188,7 @@ int sample_mdense(const MDenseLayer *layer, const float *input, const float *sam
int bit; int bit;
int i; int i;
float sum1, sum2; float sum1, sum2;
i = (1<<b) | val; i = (1<<b) | val;
sum1 = layer->bias[i]; sum1 = layer->bias[i];
@ -426,7 +426,7 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *in
#ifdef USE_SU_BIAS #ifdef USE_SU_BIAS
bias = &gru->subias[3*N]; bias = &gru->subias[3*N];
#else #else
bias = &gru->bias[3*N]; bias = &gru->bias[3*N];
#endif #endif
for (k=0;k<2;k++) for (k=0;k<2;k++)
{ {
@ -478,7 +478,7 @@ void compute_embedding(const EmbeddingLayer *layer, float *output, int input)
for (i=0;i<layer->dim;i++) for (i=0;i<layer->dim;i++)
{ {
output[i] = layer->embedding_weights[input*layer->dim + i]; output[i] = layer->embedding_weights[input*layer->dim + i];
} }
} }
void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3) { void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3) {
@ -499,5 +499,5 @@ void accum_embedding(const EmbeddingLayer *layer, float *output, int input)
for (i=0;i<layer->dim;i++) for (i=0;i<layer->dim;i++)
{ {
output[i] += layer->embedding_weights[input*layer->dim + i]; output[i] += layer->embedding_weights[input*layer->dim + i];
} }
} }

View file

@ -45,7 +45,7 @@ int parse_record(const unsigned char **data, int *len, WeightArray *array) {
array->type = h->type; array->type = h->type;
array->size = h->size; array->size = h->size;
array->data = (*data)+WEIGHT_BLOCK_SIZE; array->data = (*data)+WEIGHT_BLOCK_SIZE;
*data += h->block_size+WEIGHT_BLOCK_SIZE; *data += h->block_size+WEIGHT_BLOCK_SIZE;
*len -= h->block_size+WEIGHT_BLOCK_SIZE; *len -= h->block_size+WEIGHT_BLOCK_SIZE;
return array->size; return array->size;
@ -103,7 +103,7 @@ static const void *find_idx_check(const WeightArray *arrays, const char *name, i
if (remain < nb_blocks+1) return NULL; if (remain < nb_blocks+1) return NULL;
for (i=0;i<nb_blocks;i++) { for (i=0;i<nb_blocks;i++) {
int pos = *idx++; int pos = *idx++;
if (pos+3 >= nb_in || (pos&0x3)) return NULL; if (pos+3 >= nb_in || (pos&0x3)) return NULL;
} }
nb_out -= 8; nb_out -= 8;
remain -= nb_blocks+1; remain -= nb_blocks+1;

View file

@ -63,7 +63,7 @@ int test_sgemv_accum16() {
out[i] = 0; out[i] = 0;
out_fast[i] = 0; out_fast[i] = 0;
} }
for(i=0; i<COLS; i++) { for(i=0; i<COLS; i++) {
x[i] = i+1; x[i] = i+1;
} }
@ -101,7 +101,7 @@ int test_sparse_sgemv_accum16() {
out[i] = 0; out[i] = 0;
out_fast[i] = 0; out_fast[i] = 0;
} }
sparse_sgemv_accum16(out, w, rows, indx, x); sparse_sgemv_accum16(out, w, rows, indx, x);
sparse_sgemv_accum16_fast(out_fast, w, rows, indx, x); sparse_sgemv_accum16_fast(out_fast, w, rows, indx, x);
@ -126,5 +126,3 @@ int main() {
int test2 = test_sparse_sgemv_accum16(); int test2 = test_sparse_sgemv_accum16();
return test1 || test2; return test1 || test2;
} }

View file

@ -80,14 +80,14 @@ extern const opus_uint16 dred_p0_q15[{levels * N}];
def c_export(args, model): def c_export(args, model):
message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}" message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
enc_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_enc_data"), message=message) enc_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_enc_data"), message=message)
dec_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_dec_data"), message=message) dec_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_dec_data"), message=message)
stats_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_stats_data"), message=message) stats_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_stats_data"), message=message)
constants_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_constants"), message=message, header_only=True) constants_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_constants"), message=message, header_only=True)
# some custom includes # some custom includes
for writer in [enc_writer, dec_writer, stats_writer]: for writer in [enc_writer, dec_writer, stats_writer]:
writer.header.write( writer.header.write(
@ -99,10 +99,10 @@ f"""
#include "nnet.h" #include "nnet.h"
""" """
) )
# encoder # encoder
encoder_dense_layers = [ encoder_dense_layers = [
('core_encoder.module.dense_1' , 'enc_dense1', 'TANH'), ('core_encoder.module.dense_1' , 'enc_dense1', 'TANH'),
('core_encoder.module.dense_2' , 'enc_dense3', 'TANH'), ('core_encoder.module.dense_2' , 'enc_dense3', 'TANH'),
('core_encoder.module.dense_3' , 'enc_dense5', 'TANH'), ('core_encoder.module.dense_3' , 'enc_dense5', 'TANH'),
('core_encoder.module.dense_4' , 'enc_dense7', 'TANH'), ('core_encoder.module.dense_4' , 'enc_dense7', 'TANH'),
@ -110,31 +110,31 @@ f"""
('core_encoder.module.state_dense_1' , 'gdense1' , 'TANH'), ('core_encoder.module.state_dense_1' , 'gdense1' , 'TANH'),
('core_encoder.module.state_dense_2' , 'gdense2' , 'TANH') ('core_encoder.module.state_dense_2' , 'gdense2' , 'TANH')
] ]
for name, export_name, activation in encoder_dense_layers: for name, export_name, activation in encoder_dense_layers:
layer = model.get_submodule(name) layer = model.get_submodule(name)
dump_torch_weights(enc_writer, layer, name=export_name, activation=activation, verbose=True) dump_torch_weights(enc_writer, layer, name=export_name, activation=activation, verbose=True)
encoder_gru_layers = [ encoder_gru_layers = [
('core_encoder.module.gru_1' , 'enc_dense2', 'TANH'), ('core_encoder.module.gru_1' , 'enc_dense2', 'TANH'),
('core_encoder.module.gru_2' , 'enc_dense4', 'TANH'), ('core_encoder.module.gru_2' , 'enc_dense4', 'TANH'),
('core_encoder.module.gru_3' , 'enc_dense6', 'TANH') ('core_encoder.module.gru_3' , 'enc_dense6', 'TANH')
] ]
enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True) enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
for name, export_name, activation in encoder_gru_layers]) for name, export_name, activation in encoder_gru_layers])
encoder_conv_layers = [
('core_encoder.module.conv1' , 'bits_dense' , 'LINEAR')
]
enc_max_conv_inputs = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True) for name, export_name, activation in encoder_conv_layers])
encoder_conv_layers = [
('core_encoder.module.conv1' , 'bits_dense' , 'LINEAR')
]
enc_max_conv_inputs = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True) for name, export_name, activation in encoder_conv_layers])
del enc_writer del enc_writer
# decoder # decoder
decoder_dense_layers = [ decoder_dense_layers = [
('core_decoder.module.gru_1_init' , 'state1', 'TANH'), ('core_decoder.module.gru_1_init' , 'state1', 'TANH'),
@ -151,25 +151,25 @@ f"""
for name, export_name, activation in decoder_dense_layers: for name, export_name, activation in decoder_dense_layers:
layer = model.get_submodule(name) layer = model.get_submodule(name)
dump_torch_weights(dec_writer, layer, name=export_name, activation=activation, verbose=True) dump_torch_weights(dec_writer, layer, name=export_name, activation=activation, verbose=True)
decoder_gru_layers = [ decoder_gru_layers = [
('core_decoder.module.gru_1' , 'dec_dense2', 'TANH'), ('core_decoder.module.gru_1' , 'dec_dense2', 'TANH'),
('core_decoder.module.gru_2' , 'dec_dense4', 'TANH'), ('core_decoder.module.gru_2' , 'dec_dense4', 'TANH'),
('core_decoder.module.gru_3' , 'dec_dense6', 'TANH') ('core_decoder.module.gru_3' , 'dec_dense6', 'TANH')
] ]
dec_max_rnn_units = max([dump_torch_weights(dec_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True) dec_max_rnn_units = max([dump_torch_weights(dec_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
for name, export_name, activation in decoder_gru_layers]) for name, export_name, activation in decoder_gru_layers])
del dec_writer del dec_writer
# statistical model # statistical model
qembedding = model.statistical_model.quant_embedding qembedding = model.statistical_model.quant_embedding
dump_statistical_model(stats_writer, qembedding) dump_statistical_model(stats_writer, qembedding)
del stats_writer del stats_writer
# constants # constants
constants_writer.header.write( constants_writer.header.write(
f""" f"""
@ -193,12 +193,12 @@ f"""
""" """
) )
del constants_writer del constants_writer
def numpy_export(args, model): def numpy_export(args, model):
exchange_name_to_name = { exchange_name_to_name = {
'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1', 'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2', 'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',
@ -225,20 +225,20 @@ def numpy_export(args, model):
'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2', 'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3' 'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
} }
name_to_exchange_name = {value : key for key, value in exchange_name_to_name.items()} name_to_exchange_name = {value : key for key, value in exchange_name_to_name.items()}
for name, exchange_name in name_to_exchange_name.items(): for name, exchange_name in name_to_exchange_name.items():
print(f"printing layer {name}...") print(f"printing layer {name}...")
dump_torch_weights(os.path.join(args.output_dir, exchange_name), model.get_submodule(name)) dump_torch_weights(os.path.join(args.output_dir, exchange_name), model.get_submodule(name))
if __name__ == "__main__": if __name__ == "__main__":
os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.output_dir, exist_ok=True)
# load model from checkpoint # load model from checkpoint
checkpoint = torch.load(args.checkpoint, map_location='cpu') checkpoint = torch.load(args.checkpoint, map_location='cpu')
model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs']) model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
@ -249,7 +249,7 @@ if __name__ == "__main__":
if len(unmatched_keys) > 0: if len(unmatched_keys) > 0:
print(f"warning: the following keys were unmatched {unmatched_keys}") print(f"warning: the following keys were unmatched {unmatched_keys}")
if args.format == 'C': if args.format == 'C':
c_export(args, model) c_export(args, model)
elif args.format == 'numpy': elif args.format == 'numpy':

View file

@ -84,7 +84,7 @@ total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
# load signal # load signal
if args.input.endswith('.raw') or args.input.endswith('.pcm'): if args.input.endswith('.raw') or args.input.endswith('.pcm'):
signal = np.fromfile(args.input, dtype='int16') signal = np.fromfile(args.input, dtype='int16')
elif args.input.endswith('.wav'): elif args.input.endswith('.wav'):
fs, signal = wavfile.read(args.input) fs, signal = wavfile.read(args.input)
else: else:
@ -94,7 +94,7 @@ else:
padded_signal_length = len(signal) + total_delay padded_signal_length = len(signal) + total_delay
tail = padded_signal_length % frame_size tail = padded_signal_length % frame_size
right_padding = (frame_size - tail) % frame_size right_padding = (frame_size - tail) % frame_size
signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16))) signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw' padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
@ -152,7 +152,7 @@ with torch.no_grad():
zi = torch.clone(z[:, i - 2 * input_length + 2: i + 1 : 2, :]) zi = torch.clone(z[:, i - 2 * input_length + 2: i + 1 : 2, :])
zi, rates = model.quantize(zi, quant_ids) zi, rates = model.quantize(zi, quant_ids)
zi = model.unquantize(zi, quant_ids) zi = model.unquantize(zi, quant_ids)
features = model.decode(zi, states[:, i : i + 1, :]) features = model.decode(zi, states[:, i : i + 1, :])
packets.append(features.squeeze(0).numpy()) packets.append(features.squeeze(0).numpy())
packet_size = 8 * int((torch.sum(rates) + 7 + state_size) / 8) packet_size = 8 * int((torch.sum(rates) + 7 + state_size) / 8)
@ -176,7 +176,7 @@ if args.lossfile != None:
count = 2 count = 2
for i in range(num_packets): for i in range(num_packets):
if (loss[i] == 0) or (i == num_packets - 1): if (loss[i] == 0) or (i == num_packets - 1):
fec_out[ptr:ptr+count,:] = packets[i][foffset:, :] fec_out[ptr:ptr+count,:] = packets[i][foffset:, :]
ptr += count ptr += count
@ -190,14 +190,14 @@ if args.lossfile != None:
fec_out_full[:, : fec_out.shape[-1]] = fec_out fec_out_full[:, : fec_out.shape[-1]] = fec_out
fec_out_full.tofile(packet_file[:-4] + f'_fec.f32') fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
if args.debug_output: if args.debug_output:
import itertools import itertools
batches = [4] batches = [4]
offsets = [0, 2 * args.num_redundancy_frames - 4] offsets = [0, 2 * args.num_redundancy_frames - 4]
# sanity checks # sanity checks
# 1. concatenate features at offset 0 # 1. concatenate features at offset 0
for batch, offset in itertools.product(batches, offsets): for batch, offset in itertools.product(batches, offsets):
@ -210,4 +210,3 @@ if args.debug_output:
print(f"writing debug output {packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32'}") print(f"writing debug output {packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32'}")
test_features_full.tofile(packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32') test_features_full.tofile(packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32')

View file

@ -90,7 +90,7 @@ if __name__ == "__main__":
cond_size = args.cond_size cond_size = args.cond_size
cond_size2 = args.cond_size2 cond_size2 = args.cond_size2
state_dim = args.state_dim state_dim = args.state_dim
# model # model
checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2) checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)
@ -105,9 +105,9 @@ if __name__ == "__main__":
'encoder_stack_layer8_dense', 'encoder_stack_layer8_dense',
'encoder_state_layer1_dense', 'encoder_state_layer1_dense',
'encoder_state_layer2_dense', 'encoder_state_layer2_dense',
'decoder_state1_dense', 'decoder_state1_dense',
'decoder_state2_dense', 'decoder_state2_dense',
'decoder_state3_dense', 'decoder_state3_dense',
'decoder_stack_layer1_dense', 'decoder_stack_layer1_dense',
'decoder_stack_layer3_dense', 'decoder_stack_layer3_dense',
'decoder_stack_layer5_dense', 'decoder_stack_layer5_dense',
@ -122,7 +122,7 @@ if __name__ == "__main__":
'encoder_stack_layer6_gru', 'encoder_stack_layer6_gru',
'decoder_stack_layer2_gru', 'decoder_stack_layer2_gru',
'decoder_stack_layer4_gru', 'decoder_stack_layer4_gru',
'decoder_stack_layer6_gru' 'decoder_stack_layer6_gru'
] ]
conv1d_layer_names = [ conv1d_layer_names = [

View file

@ -43,7 +43,7 @@ int get_fec_frame(const char * const filename, float *features, int packet_index
long offset; long offset;
FILE *fid = fopen(filename, "rb"); FILE *fid = fopen(filename, "rb");
/* read header */ /* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error; if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error; if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
@ -88,7 +88,7 @@ int get_fec_rate(const char * const filename, int packet_index)
int16_t rate; int16_t rate;
FILE *fid = fopen(filename, "rb"); FILE *fid = fopen(filename, "rb");
/* read header */ /* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error; if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error; if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;

View file

@ -33,25 +33,25 @@ import numpy as np
def write_fec_packets(filename, packets, rates=None): def write_fec_packets(filename, packets, rates=None):
""" writes packets in binary format """ """ writes packets in binary format """
assert np.dtype(np.float32).itemsize == 4 assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2 assert np.dtype(np.int16).itemsize == 2
# derive some sizes # derive some sizes
num_packets = len(packets) num_packets = len(packets)
subframes_per_packet = packets[0].shape[-2] subframes_per_packet = packets[0].shape[-2]
num_features = packets[0].shape[-1] num_features = packets[0].shape[-1]
# size of float is 4 # size of float is 4
subframe_size = num_features * 4 subframe_size = num_features * 4
packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
version = 1 version = 1
# header size (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features) # header size (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features)
header_size = 14 header_size = 14
with open(filename, 'wb') as f: with open(filename, 'wb') as f:
# header # header
f.write(np.int16(version).tobytes()) f.write(np.int16(version).tobytes())
f.write(np.int16(header_size).tobytes()) f.write(np.int16(header_size).tobytes())
@ -60,28 +60,28 @@ def write_fec_packets(filename, packets, rates=None):
f.write(np.int16(subframe_size).tobytes()) f.write(np.int16(subframe_size).tobytes())
f.write(np.int16(subframes_per_packet).tobytes()) f.write(np.int16(subframes_per_packet).tobytes())
f.write(np.int16(num_features).tobytes()) f.write(np.int16(num_features).tobytes())
# packets # packets
for i, packet in enumerate(packets): for i, packet in enumerate(packets):
if type(rates) == type(None): if type(rates) == type(None):
rate = 0 rate = 0
else: else:
rate = rates[i] rate = rates[i]
f.write(np.int16(rate).tobytes()) f.write(np.int16(rate).tobytes())
features = np.flip(packet, axis=-2) features = np.flip(packet, axis=-2)
f.write(features.astype(np.float32).tobytes()) f.write(features.astype(np.float32).tobytes())
def read_fec_packets(filename): def read_fec_packets(filename):
""" reads packets from binary format """ """ reads packets from binary format """
assert np.dtype(np.float32).itemsize == 4 assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2 assert np.dtype(np.int16).itemsize == 2
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
# header # header
version = np.frombuffer(f.read(2), dtype=np.int16).item() version = np.frombuffer(f.read(2), dtype=np.int16).item()
header_size = np.frombuffer(f.read(2), dtype=np.int16).item() header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
@ -90,19 +90,19 @@ def read_fec_packets(filename):
subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item() subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item() subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
num_features = np.frombuffer(f.read(2), dtype=np.int16).item() num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
dummy_features = np.zeros((subframes_per_packet, num_features), dtype=np.float32) dummy_features = np.zeros((subframes_per_packet, num_features), dtype=np.float32)
# packets # packets
rates = [] rates = []
packets = [] packets = []
for i in range(num_packets): for i in range(num_packets):
rate = np.frombuffer(f.read(2), dtype=np.int16).item rate = np.frombuffer(f.read(2), dtype=np.int16).item
rates.append(rate) rates.append(rate)
features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape) features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
packet = np.flip(features, axis=-2) packet = np.flip(features, axis=-2)
packets.append(packet) packets.append(packet)
return packets return packets

View file

@ -40,7 +40,7 @@ class RDOVAEDataset(torch.utils.data.Dataset):
lambda_max=0.0135, lambda_max=0.0135,
quant_levels=16, quant_levels=16,
enc_stride=2): enc_stride=2):
self.sequence_length = sequence_length self.sequence_length = sequence_length
self.lambda_min = lambda_min self.lambda_min = lambda_min
self.lambda_max = lambda_max self.lambda_max = lambda_max
@ -50,7 +50,7 @@ class RDOVAEDataset(torch.utils.data.Dataset):
if sequence_length % enc_stride: if sequence_length % enc_stride:
raise ValueError(f"RDOVAEDataset.__init__: enc_stride {enc_stride} does not divide sequence length {sequence_length}") raise ValueError(f"RDOVAEDataset.__init__: enc_stride {enc_stride} does not divide sequence length {sequence_length}")
self.features = np.reshape(np.fromfile(feature_file, dtype=np.float32), (-1, num_features)) self.features = np.reshape(np.fromfile(feature_file, dtype=np.float32), (-1, num_features))
self.features = self.features[:, :num_used_features] self.features = self.features[:, :num_used_features]
self.num_sequences = self.features.shape[0] // sequence_length self.num_sequences = self.features.shape[0] // sequence_length
@ -65,4 +65,3 @@ class RDOVAEDataset(torch.utils.data.Dataset):
rate_lambda = self.lambda_min * np.exp(q_ids.astype(np.float32) / self.denominator).astype(np.float32) rate_lambda = self.lambda_min * np.exp(q_ids.astype(np.float32) / self.denominator).astype(np.float32)
return features, rate_lambda, q_ids return features, rate_lambda, q_ids

View file

@ -42,7 +42,7 @@ def soft_pvq(x, k):
# L2 normalization # L2 normalization
x_norm2 = x / (1e-15 + torch.norm(x, dim=-1, keepdim=True)) x_norm2 = x / (1e-15 + torch.norm(x, dim=-1, keepdim=True))
with torch.no_grad(): with torch.no_grad():
# quantization loop, no need to track gradients here # quantization loop, no need to track gradients here
@ -84,19 +84,19 @@ def cache_parameters(func):
return cache[args] return cache[args]
else: else:
cache[args] = func(*args) cache[args] = func(*args)
return cache[args] return cache[args]
return cached_func return cached_func
@cache_parameters @cache_parameters
def pvq_codebook_size(n, k): def pvq_codebook_size(n, k):
if k == 0: if k == 0:
return 1 return 1
if n == 0: if n == 0:
return 0 return 0
return pvq_codebook_size(n - 1, k) + pvq_codebook_size(n, k - 1) + pvq_codebook_size(n - 1, k - 1) return pvq_codebook_size(n - 1, k) + pvq_codebook_size(n, k - 1) + pvq_codebook_size(n - 1, k - 1)
@ -121,7 +121,7 @@ def hard_rate_estimate(z, r, theta, reduce=True):
p0 = 1 - r ** (0.5 + 0.5 * theta) p0 = 1 - r ** (0.5 + 0.5 * theta)
alpha = torch.relu(1 - torch.abs(z_q)) ** 2 alpha = torch.relu(1 - torch.abs(z_q)) ** 2
rate = - torch.sum( rate = - torch.sum(
(alpha * torch.log2(p0 * r ** torch.abs(z_q) + 1e-6) (alpha * torch.log2(p0 * r ** torch.abs(z_q) + 1e-6)
+ (1 - alpha) * torch.log2(0.5 * (1 - p0) * (1 - r) * r ** (torch.abs(z_q) - 1) + 1e-6)), + (1 - alpha) * torch.log2(0.5 * (1 - p0) * (1 - r) * r ** (torch.abs(z_q) - 1) + 1e-6)),
dim=-1 dim=-1
) )
@ -154,7 +154,7 @@ def noise_quantize(x):
def distortion_loss(y_true, y_pred, rate_lambda=None): def distortion_loss(y_true, y_pred, rate_lambda=None):
""" custom distortion loss for LPCNet features """ """ custom distortion loss for LPCNet features """
if y_true.size(-1) != 20: if y_true.size(-1) != 20:
raise ValueError('distortion loss is designed to work with 20 features') raise ValueError('distortion loss is designed to work with 20 features')
@ -169,7 +169,7 @@ def distortion_loss(y_true, y_pred, rate_lambda=None):
loss = loss / torch.sqrt(rate_lambda) loss = loss / torch.sqrt(rate_lambda)
loss = torch.mean(loss) loss = torch.mean(loss)
return loss return loss
@ -181,23 +181,23 @@ import random
def random_split(start, stop, num_splits=3, min_len=3): def random_split(start, stop, num_splits=3, min_len=3):
get_min_len = lambda x : min([x[i+1] - x[i] for i in range(len(x) - 1)]) get_min_len = lambda x : min([x[i+1] - x[i] for i in range(len(x) - 1)])
candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop] candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
while get_min_len(candidate) < min_len: while get_min_len(candidate) < min_len:
candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop] candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
return candidate return candidate
# weight initialization and clipping # weight initialization and clipping
def init_weights(module): def init_weights(module):
if isinstance(module, nn.GRU): if isinstance(module, nn.GRU):
for p in module.named_parameters(): for p in module.named_parameters():
if p[0].startswith('weight_hh_'): if p[0].startswith('weight_hh_'):
nn.init.orthogonal_(p[1]) nn.init.orthogonal_(p[1])
def weight_clip_factory(max_value): def weight_clip_factory(max_value):
""" weight clipping function concerning sum of abs values of adjecent weights """ """ weight clipping function concerning sum of abs values of adjecent weights """
def clip_weight_(w): def clip_weight_(w):
@ -213,13 +213,13 @@ def weight_clip_factory(max_value):
1)) 1))
with torch.no_grad(): with torch.no_grad():
w[:, :stop] *= factor w[:, :stop] *= factor
def clip_weights(module): def clip_weights(module):
if isinstance(module, nn.GRU) or isinstance(module, nn.Linear): if isinstance(module, nn.GRU) or isinstance(module, nn.Linear):
for name, w in module.named_parameters(): for name, w in module.named_parameters():
if name.startswith('weight'): if name.startswith('weight'):
clip_weight_(w) clip_weight_(w)
return clip_weights return clip_weights
# RDOVAE module and submodules # RDOVAE module and submodules
@ -229,12 +229,12 @@ class CoreEncoder(nn.Module):
STATE_HIDDEN = 128 STATE_HIDDEN = 128
FRAMES_PER_STEP = 2 FRAMES_PER_STEP = 2
CONV_KERNEL_SIZE = 4 CONV_KERNEL_SIZE = 4
def __init__(self, feature_dim, output_dim, cond_size, cond_size2, state_size=24): def __init__(self, feature_dim, output_dim, cond_size, cond_size2, state_size=24):
""" core encoder for RDOVAE """ core encoder for RDOVAE
Computes latents, initial states, and rate estimates from features and lambda parameter Computes latents, initial states, and rate estimates from features and lambda parameter
""" """
super(CoreEncoder, self).__init__() super(CoreEncoder, self).__init__()
@ -289,7 +289,7 @@ class CoreEncoder(nn.Module):
# concatenation of all hidden layer outputs # concatenation of all hidden layer outputs
x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1) x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)
# init state for decoder # init state for decoder
states = torch.tanh(self.state_dense_1(x9)) states = torch.tanh(self.state_dense_1(x9))
states = torch.tanh(self.state_dense_2(states)) states = torch.tanh(self.state_dense_2(states))
@ -309,9 +309,9 @@ class CoreDecoder(nn.Module):
def __init__(self, input_dim, output_dim, cond_size, cond_size2, state_size=24): def __init__(self, input_dim, output_dim, cond_size, cond_size2, state_size=24):
""" core decoder for RDOVAE """ core decoder for RDOVAE
Computes features from latents, initial state, and quantization index Computes features from latents, initial state, and quantization index
""" """
super(CoreDecoder, self).__init__() super(CoreDecoder, self).__init__()
@ -324,7 +324,7 @@ class CoreDecoder(nn.Module):
self.state_size = state_size self.state_size = state_size
self.input_size = self.input_dim self.input_size = self.input_dim
self.concat_size = 4 * self.cond_size + 4 * self.cond_size2 self.concat_size = 4 * self.cond_size + 4 * self.cond_size2
# layers # layers
@ -348,7 +348,7 @@ class CoreDecoder(nn.Module):
self.apply(init_weights) self.apply(init_weights)
def forward(self, z, initial_state): def forward(self, z, initial_state):
gru_1_state = torch.tanh(self.gru_1_init(initial_state).permute(1, 0, 2)) gru_1_state = torch.tanh(self.gru_1_init(initial_state).permute(1, 0, 2))
gru_2_state = torch.tanh(self.gru_2_init(initial_state).permute(1, 0, 2)) gru_2_state = torch.tanh(self.gru_2_init(initial_state).permute(1, 0, 2))
gru_3_state = torch.tanh(self.gru_3_init(initial_state).permute(1, 0, 2)) gru_3_state = torch.tanh(self.gru_3_init(initial_state).permute(1, 0, 2))
@ -374,9 +374,9 @@ class CoreDecoder(nn.Module):
class StatisticalModel(nn.Module): class StatisticalModel(nn.Module):
def __init__(self, quant_levels, latent_dim): def __init__(self, quant_levels, latent_dim):
""" Statistical model for latent space """ Statistical model for latent space
Computes scaling, deadzone, r, and theta Computes scaling, deadzone, r, and theta
""" """
super(StatisticalModel, self).__init__() super(StatisticalModel, self).__init__()
@ -388,7 +388,7 @@ class StatisticalModel(nn.Module):
# quantization embedding # quantization embedding
self.quant_embedding = nn.Embedding(quant_levels, self.embedding_dim) self.quant_embedding = nn.Embedding(quant_levels, self.embedding_dim)
# initialize embedding to 0 # initialize embedding to 0
with torch.no_grad(): with torch.no_grad():
self.quant_embedding.weight[:] = 0 self.quant_embedding.weight[:] = 0
@ -406,7 +406,7 @@ class StatisticalModel(nn.Module):
r_soft = torch.sigmoid(x[..., 3 * self.latent_dim : 4 * self.latent_dim]) r_soft = torch.sigmoid(x[..., 3 * self.latent_dim : 4 * self.latent_dim])
theta_hard = torch.sigmoid(x[..., 4 * self.latent_dim : 5 * self.latent_dim]) theta_hard = torch.sigmoid(x[..., 4 * self.latent_dim : 5 * self.latent_dim])
r_hard = torch.sigmoid(x[..., 5 * self.latent_dim : 6 * self.latent_dim]) r_hard = torch.sigmoid(x[..., 5 * self.latent_dim : 6 * self.latent_dim])
return { return {
'quant_embedding' : x, 'quant_embedding' : x,
@ -443,34 +443,34 @@ class RDOVAE(nn.Module):
self.state_dim = state_dim self.state_dim = state_dim
self.pvq_num_pulses = pvq_num_pulses self.pvq_num_pulses = pvq_num_pulses
self.state_dropout_rate = state_dropout_rate self.state_dropout_rate = state_dropout_rate
# submodules encoder and decoder share the statistical model # submodules encoder and decoder share the statistical model
self.statistical_model = StatisticalModel(quant_levels, latent_dim) self.statistical_model = StatisticalModel(quant_levels, latent_dim)
self.core_encoder = nn.DataParallel(CoreEncoder(feature_dim, latent_dim, cond_size, cond_size2, state_size=state_dim)) self.core_encoder = nn.DataParallel(CoreEncoder(feature_dim, latent_dim, cond_size, cond_size2, state_size=state_dim))
self.core_decoder = nn.DataParallel(CoreDecoder(latent_dim, feature_dim, cond_size, cond_size2, state_size=state_dim)) self.core_decoder = nn.DataParallel(CoreDecoder(latent_dim, feature_dim, cond_size, cond_size2, state_size=state_dim))
self.enc_stride = CoreEncoder.FRAMES_PER_STEP self.enc_stride = CoreEncoder.FRAMES_PER_STEP
self.dec_stride = CoreDecoder.FRAMES_PER_STEP self.dec_stride = CoreDecoder.FRAMES_PER_STEP
if clip_weights: if clip_weights:
self.weight_clip_fn = weight_clip_factory(0.496) self.weight_clip_fn = weight_clip_factory(0.496)
else: else:
self.weight_clip_fn = None self.weight_clip_fn = None
if self.dec_stride % self.enc_stride != 0: if self.dec_stride % self.enc_stride != 0:
raise ValueError(f"get_decoder_chunks_generic: encoder stride does not divide decoder stride") raise ValueError(f"get_decoder_chunks_generic: encoder stride does not divide decoder stride")
def clip_weights(self): def clip_weights(self):
if not type(self.weight_clip_fn) == type(None): if not type(self.weight_clip_fn) == type(None):
self.apply(self.weight_clip_fn) self.apply(self.weight_clip_fn)
def get_decoder_chunks(self, z_frames, mode='split', chunks_per_offset = 4): def get_decoder_chunks(self, z_frames, mode='split', chunks_per_offset = 4):
enc_stride = self.enc_stride enc_stride = self.enc_stride
dec_stride = self.dec_stride dec_stride = self.dec_stride
stride = dec_stride // enc_stride stride = dec_stride // enc_stride
chunks = [] chunks = []
for offset in range(stride): for offset in range(stride):
@ -529,7 +529,7 @@ class RDOVAE(nn.Module):
z_q = hard_quantize(z) / statistical_model['quant_scale'] z_q = hard_quantize(z) / statistical_model['quant_scale']
z_n = noise_quantize(z) / statistical_model['quant_scale'] z_n = noise_quantize(z) / statistical_model['quant_scale']
states_q = soft_pvq(states, self.pvq_num_pulses) states_q = soft_pvq(states, self.pvq_num_pulses)
if self.state_dropout_rate > 0: if self.state_dropout_rate > 0:
drop = torch.rand(states_q.size(0)) < self.state_dropout_rate drop = torch.rand(states_q.size(0)) < self.state_dropout_rate
mask = torch.ones_like(states_q) mask = torch.ones_like(states_q)
@ -552,7 +552,7 @@ class RDOVAE(nn.Module):
# decoder with soft quantized input # decoder with soft quantized input
z_dec_reverse = torch.flip(z_n[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1]) z_dec_reverse = torch.flip(z_n[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state) features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
outputs_sq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop'])) outputs_sq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))
return { return {
'outputs_hard_quant' : outputs_hq, 'outputs_hard_quant' : outputs_hq,
@ -563,24 +563,24 @@ class RDOVAE(nn.Module):
def encode(self, features): def encode(self, features):
""" encoder with quantization and rate estimation """ """ encoder with quantization and rate estimation """
z, states = self.core_encoder(features) z, states = self.core_encoder(features)
# quantization of initial states # quantization of initial states
states = soft_pvq(states, self.pvq_num_pulses) states = soft_pvq(states, self.pvq_num_pulses)
state_size = m.log2(pvq_codebook_size(self.state_dim, self.pvq_num_pulses)) state_size = m.log2(pvq_codebook_size(self.state_dim, self.pvq_num_pulses))
return z, states, state_size return z, states, state_size
def decode(self, z, initial_state): def decode(self, z, initial_state):
""" decoder (flips sequences by itself) """ """ decoder (flips sequences by itself) """
z_reverse = torch.flip(z, [1]) z_reverse = torch.flip(z, [1])
features_reverse = self.core_decoder(z_reverse, initial_state) features_reverse = self.core_decoder(z_reverse, initial_state)
features = torch.flip(features_reverse, [1]) features = torch.flip(features_reverse, [1])
return features return features
def quantize(self, z, q_ids): def quantize(self, z, q_ids):
""" quantization of latent vectors """ """ quantization of latent vectors """
@ -602,13 +602,12 @@ class RDOVAE(nn.Module):
z = zq / stats['quant_scale'] z = zq / stats['quant_scale']
return z return z
def freeze_model(self): def freeze_model(self):
# freeze all parameters # freeze all parameters
for p in self.parameters(): for p in self.parameters():
p.requires_grad = False p.requires_grad = False
for p in self.statistical_model.parameters(): for p in self.statistical_model.parameters():
p.requires_grad = True p.requires_grad = True

View file

@ -89,7 +89,7 @@ adam_eps = 1e-8
checkpoint['batch_size'] = batch_size checkpoint['batch_size'] = batch_size
checkpoint['lr'] = lr checkpoint['lr'] = lr
checkpoint['lr_decay_factor'] = lr_decay_factor checkpoint['lr_decay_factor'] = lr_decay_factor
checkpoint['split_mode'] = split_mode checkpoint['split_mode'] = split_mode
checkpoint['epochs'] = epochs checkpoint['epochs'] = epochs
checkpoint['sequence_length'] = sequence_length checkpoint['sequence_length'] = sequence_length
@ -130,10 +130,10 @@ checkpoint['state_dict'] = model.state_dict()
if args.train_decoder_only: if args.train_decoder_only:
if args.initial_checkpoint is None: if args.initial_checkpoint is None:
print("warning: training decoder only without providing initial checkpoint") print("warning: training decoder only without providing initial checkpoint")
for p in model.core_encoder.module.parameters(): for p in model.core_encoder.module.parameters():
p.requires_grad = False p.requires_grad = False
for p in model.statistical_model.parameters(): for p in model.statistical_model.parameters():
p.requires_grad = False p.requires_grad = False
@ -180,15 +180,15 @@ if __name__ == '__main__':
# zero out gradients # zero out gradients
optimizer.zero_grad() optimizer.zero_grad()
# push inputs to device # push inputs to device
features = features.to(device) features = features.to(device)
q_ids = q_ids.to(device) q_ids = q_ids.to(device)
rate_lambda = rate_lambda.to(device) rate_lambda = rate_lambda.to(device)
rate_lambda_upsamp = torch.repeat_interleave(rate_lambda, 2, 1) rate_lambda_upsamp = torch.repeat_interleave(rate_lambda, 2, 1)
# run model # run model
model_output = model(features, q_ids) model_output = model(features, q_ids)
@ -224,17 +224,17 @@ if __name__ == '__main__':
# total loss # total loss
total_loss = rate_loss + (distortion_loss_hard_quant + distortion_loss_soft_quant) / 2 total_loss = rate_loss + (distortion_loss_hard_quant + distortion_loss_soft_quant) / 2
if args.enable_first_frame_loss: if args.enable_first_frame_loss:
total_loss = total_loss + 0.5 * torch.relu(first_frame_loss - distortion_loss_hard_quant) total_loss = total_loss + 0.5 * torch.relu(first_frame_loss - distortion_loss_hard_quant)
total_loss.backward() total_loss.backward()
optimizer.step() optimizer.step()
model.clip_weights() model.clip_weights()
scheduler.step() scheduler.step()
# collect running stats # collect running stats

View file

@ -3,7 +3,7 @@ Modification of Tensorflow's Embedding Layer:
1. Not restricted to be the first layer of a model 1. Not restricted to be the first layer of a model
2. Differentiable (allows non-integer lookups) 2. Differentiable (allows non-integer lookups)
- For non integer lookup, this layer linearly interpolates between the adjacent embeddings in the following way to preserver gradient flow - For non integer lookup, this layer linearly interpolates between the adjacent embeddings in the following way to preserver gradient flow
- E = (1 - frac(x))*embed(floor(x)) + frac(x)*embed(ceil(x)) - E = (1 - frac(x))*embed(floor(x)) + frac(x)*embed(ceil(x))
""" """
import tensorflow as tf import tensorflow as tf
@ -26,13 +26,13 @@ class diff_Embed(Layer):
self.pcm_init = pcm_init self.pcm_init = pcm_init
self.initializer = initializer self.initializer = initializer
def build(self, input_shape): def build(self, input_shape):
w_init = tf.random_normal_initializer() w_init = tf.random_normal_initializer()
if self.pcm_init: if self.pcm_init:
w_init = self.initializer w_init = self.initializer
self.w = tf.Variable(initial_value=w_init(shape=(self.dict_size, self.units),dtype='float32'),trainable=True) self.w = tf.Variable(initial_value=w_init(shape=(self.dict_size, self.units),dtype='float32'),trainable=True)
def call(self, inputs): def call(self, inputs):
alpha = inputs - tf.math.floor(inputs) alpha = inputs - tf.math.floor(inputs)
alpha = tf.expand_dims(alpha,axis = -1) alpha = tf.expand_dims(alpha,axis = -1)
alpha = tf.tile(alpha,[1,1,1,self.units]) alpha = tf.tile(alpha,[1,1,1,self.units])

View file

@ -309,13 +309,13 @@ if __name__ == "__main__":
else: else:
hf.write('/* This is *not* an end-to-end model */\n') hf.write('/* This is *not* an end-to-end model */\n')
hf.write('/* #define END2END */\n\n') hf.write('/* #define END2END */\n\n')
# LPC weighting factor # LPC weighting factor
if type(args.lpc_gamma) == type(None): if type(args.lpc_gamma) == type(None):
lpc_gamma = get_parameter(model, 'lpc_gamma', 1) lpc_gamma = get_parameter(model, 'lpc_gamma', 1)
else: else:
lpc_gamma = args.lpc_gamma lpc_gamma = args.lpc_gamma
hf.write('/* LPC weighting factor */\n') hf.write('/* LPC weighting factor */\n')
hf.write('#define LPC_GAMMA ' + str(lpc_gamma) +'f\n\n') hf.write('#define LPC_GAMMA ' + str(lpc_gamma) +'f\n\n')
@ -376,7 +376,7 @@ if __name__ == "__main__":
hf.write('typedef struct {\n') hf.write('typedef struct {\n')
for i, name in enumerate(layer_list): for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper())) hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
hf.write('} NNetState;\n\n') hf.write('} NNetState;\n\n')
model_struct.write('} LPCNetModel;\n\n') model_struct.write('} LPCNetModel;\n\n')

View file

@ -283,7 +283,7 @@ hf.write('#define PLC_MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
hf.write('typedef struct {\n') hf.write('typedef struct {\n')
for i, name in enumerate(layer_list): for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper())) hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
hf.write('} PLCNetState;\n\n') hf.write('} PLCNetState;\n\n')
model_struct.write('} PLCModel;\n\n') model_struct.write('} PLCModel;\n\n')

View file

@ -173,7 +173,7 @@ f"""
[ [
dump_conv1d_layer(encoder.get_layer(name), source_fid, header_fid) dump_conv1d_layer(encoder.get_layer(name), source_fid, header_fid)
for name in encoder_conv1d_names for name in encoder_conv1d_names
] ]
) )
# dump Dense layers # dump Dense layers
@ -232,13 +232,13 @@ f"""
'dec_dense7', 'dec_dense7',
'dec_dense8', 'dec_dense8',
'dec_final' 'dec_final'
] ]
decoder_gru_names = [ decoder_gru_names = [
'dec_dense2', 'dec_dense2',
'dec_dense4', 'dec_dense4',
'dec_dense6' 'dec_dense6'
] ]
source_fid = open("dred_rdovae_dec_data.c", 'w') source_fid = open("dred_rdovae_dec_data.c", 'w')
header_fid = open("dred_rdovae_dec_data.h", 'w') header_fid = open("dred_rdovae_dec_data.h", 'w')

View file

@ -97,7 +97,7 @@ total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
# load signal # load signal
if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'): if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'):
signal = np.fromfile(args.input, dtype='int16') signal = np.fromfile(args.input, dtype='int16')
elif args.input.endswith('.wav'): elif args.input.endswith('.wav'):
fs, signal = wavfile.read(args.input) fs, signal = wavfile.read(args.input)
else: else:
@ -107,7 +107,7 @@ else:
padded_signal_length = len(signal) + total_delay padded_signal_length = len(signal) + total_delay
tail = padded_signal_length % frame_size tail = padded_signal_length % frame_size
right_padding = (frame_size - tail) % frame_size right_padding = (frame_size - tail) % frame_size
signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16))) signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw' padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
@ -228,7 +228,7 @@ if args.lossfile != None:
fec_out_full[:, :nb_used_features] = fec_out fec_out_full[:, :nb_used_features] = fec_out
fec_out_full.tofile(packet_file[:-4] + f'_fec.f32') fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
#create packets array like in the original version for debugging purposes #create packets array like in the original version for debugging purposes
for i in range(offset, num_frames): for i in range(offset, num_frames):
@ -254,4 +254,3 @@ if args.debug_output:
print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}") print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}")
test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32') test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32')

View file

@ -43,7 +43,7 @@ int get_fec_frame(const char * const filename, float *features, int packet_index
long offset; long offset;
FILE *fid = fopen(filename, "rb"); FILE *fid = fopen(filename, "rb");
/* read header */ /* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error; if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error; if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
@ -88,7 +88,7 @@ int get_fec_rate(const char * const filename, int packet_index)
int16_t rate; int16_t rate;
FILE *fid = fopen(filename, "rb"); FILE *fid = fopen(filename, "rb");
/* read header */ /* read header */
if (fread(&version, sizeof(version), 1, fid) != 1) goto error; if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error; if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;

View file

@ -33,25 +33,25 @@ import numpy as np
def write_fec_packets(filename, packets, rates=None): def write_fec_packets(filename, packets, rates=None):
""" writes packets in binary format """ """ writes packets in binary format """
assert np.dtype(np.float32).itemsize == 4 assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2 assert np.dtype(np.int16).itemsize == 2
# derive some sizes # derive some sizes
num_packets = len(packets) num_packets = len(packets)
subframes_per_packet = packets[0].shape[-2] subframes_per_packet = packets[0].shape[-2]
num_features = packets[0].shape[-1] num_features = packets[0].shape[-1]
# size of float is 4 # size of float is 4
subframe_size = num_features * 4 subframe_size = num_features * 4
packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
version = 1 version = 1
# header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features) # header size (version, header_size, num_packets, packet_size, subframe_size, subrames_per_packet, num_features)
header_size = 14 header_size = 14
with open(filename, 'wb') as f: with open(filename, 'wb') as f:
# header # header
f.write(np.int16(version).tobytes()) f.write(np.int16(version).tobytes())
f.write(np.int16(header_size).tobytes()) f.write(np.int16(header_size).tobytes())
@ -60,28 +60,28 @@ def write_fec_packets(filename, packets, rates=None):
f.write(np.int16(subframe_size).tobytes()) f.write(np.int16(subframe_size).tobytes())
f.write(np.int16(subframes_per_packet).tobytes()) f.write(np.int16(subframes_per_packet).tobytes())
f.write(np.int16(num_features).tobytes()) f.write(np.int16(num_features).tobytes())
# packets # packets
for i, packet in enumerate(packets): for i, packet in enumerate(packets):
if type(rates) == type(None): if type(rates) == type(None):
rate = 0 rate = 0
else: else:
rate = rates[i] rate = rates[i]
f.write(np.int16(rate).tobytes()) f.write(np.int16(rate).tobytes())
features = np.flip(packet, axis=-2) features = np.flip(packet, axis=-2)
f.write(features.astype(np.float32).tobytes()) f.write(features.astype(np.float32).tobytes())
def read_fec_packets(filename): def read_fec_packets(filename):
""" reads packets from binary format """ """ reads packets from binary format """
assert np.dtype(np.float32).itemsize == 4 assert np.dtype(np.float32).itemsize == 4
assert np.dtype(np.int16).itemsize == 2 assert np.dtype(np.int16).itemsize == 2
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
# header # header
version = np.frombuffer(f.read(2), dtype=np.int16).item() version = np.frombuffer(f.read(2), dtype=np.int16).item()
header_size = np.frombuffer(f.read(2), dtype=np.int16).item() header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
@ -90,19 +90,19 @@ def read_fec_packets(filename):
subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item() subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item() subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
num_features = np.frombuffer(f.read(2), dtype=np.int16).item() num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
dummy_features = np.zeros((1, subframes_per_packet, num_features), dtype=np.float32) dummy_features = np.zeros((1, subframes_per_packet, num_features), dtype=np.float32)
# packets # packets
rates = [] rates = []
packets = [] packets = []
for i in range(num_packets): for i in range(num_packets):
rate = np.frombuffer(f.read(2), dtype=np.int16).item rate = np.frombuffer(f.read(2), dtype=np.int16).item
rates.append(rate) rates.append(rate)
features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape) features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
packet = np.flip(features, axis=-2) packet = np.flip(features, axis=-2)
packets.append(packet) packets.append(packet)
return packets return packets

View file

@ -35,7 +35,7 @@ def interp_mulaw(gamma = 1):
alpha = e_gt - tf.math.floor(e_gt) alpha = e_gt - tf.math.floor(e_gt)
alpha = tf.tile(alpha,[1,1,256]) alpha = tf.tile(alpha,[1,1,256])
e_gt = tf.cast(e_gt,'int32') e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,254) e_gt = tf.clip_by_value(e_gt,0,254)
interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1) interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab) sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
loss_mod = sparse_cel + prob_compensation + gamma*regularization loss_mod = sparse_cel + prob_compensation + gamma*regularization
@ -51,7 +51,7 @@ def metric_oginterploss(y_true,y_pred):
alpha = e_gt - tf.math.floor(e_gt) alpha = e_gt - tf.math.floor(e_gt)
alpha = tf.tile(alpha,[1,1,256]) alpha = tf.tile(alpha,[1,1,256])
e_gt = tf.cast(e_gt,'int32') e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,254) e_gt = tf.clip_by_value(e_gt,0,254)
interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1) interp_probab = (1 - alpha)*model_out + alpha*tf.roll(model_out,shift = -1,axis = -1)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab) sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,interp_probab)
loss_mod = sparse_cel + prob_compensation loss_mod = sparse_cel + prob_compensation
@ -78,7 +78,7 @@ def metric_cel(y_true, y_pred):
e_gt = tf_l2u(y_true - p) e_gt = tf_l2u(y_true - p)
e_gt = tf.round(e_gt) e_gt = tf.round(e_gt)
e_gt = tf.cast(e_gt,'int32') e_gt = tf.cast(e_gt,'int32')
e_gt = tf.clip_by_value(e_gt,0,255) e_gt = tf.clip_by_value(e_gt,0,255)
sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out) sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out)
return sparse_cel return sparse_cel
@ -97,4 +97,3 @@ def loss_matchlar():
loss_lar_diff = tf.square(loss_lar_diff) loss_lar_diff = tf.square(loss_lar_diff)
return tf.reduce_mean(loss_lar_diff, axis=-1) return tf.reduce_mean(loss_lar_diff, axis=-1)
return loss return loss

View file

@ -186,7 +186,7 @@ class SparsifyGRUB(Callback):
w[0] = p w[0] = p
layer.set_weights(w) layer.set_weights(w)
class PCMInit(Initializer): class PCMInit(Initializer):
def __init__(self, gain=.1, seed=None): def __init__(self, gain=.1, seed=None):
@ -264,20 +264,20 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat) lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat)
else: else:
lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size) lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size)
real_preds = diff_pred(name = "real_lpc2preds")([pcm,lpcoeffs]) real_preds = diff_pred(name = "real_lpc2preds")([pcm,lpcoeffs])
weighting = lpc_gamma ** np.arange(1, 17).astype('float32') weighting = lpc_gamma ** np.arange(1, 17).astype('float32')
weighted_lpcoeffs = Lambda(lambda x: x[0]*x[1])([lpcoeffs, weighting]) weighted_lpcoeffs = Lambda(lambda x: x[0]*x[1])([lpcoeffs, weighting])
tensor_preds = diff_pred(name = "lpc2preds")([pcm,weighted_lpcoeffs]) tensor_preds = diff_pred(name = "lpc2preds")([pcm,weighted_lpcoeffs])
past_errors = error_calc([pcm,tensor_preds]) past_errors = error_calc([pcm,tensor_preds])
embed = diff_Embed(name='embed_sig',initializer = PCMInit()) embed = diff_Embed(name='embed_sig',initializer = PCMInit())
cpcm = Concatenate()([tf_l2u(pcm),tf_l2u(tensor_preds),past_errors]) cpcm = Concatenate()([tf_l2u(pcm),tf_l2u(tensor_preds),past_errors])
cpcm = GaussianNoise(.3)(cpcm) cpcm = GaussianNoise(.3)(cpcm)
cpcm = Reshape((-1, embed_size*3))(embed(cpcm)) cpcm = Reshape((-1, embed_size*3))(embed(cpcm))
cpcm_decoder = Reshape((-1, embed_size*3))(embed(dpcm)) cpcm_decoder = Reshape((-1, embed_size*3))(embed(dpcm))
rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1)) rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))
quant = quant_regularizer if quantize else None quant = quant_regularizer if quantize else None
@ -305,7 +305,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
rnn2.trainable=False rnn2.trainable=False
md.trainable=False md.trainable=False
embed.Trainable=False embed.Trainable=False
m_out = Concatenate(name='pdf')([tensor_preds,real_preds,ulaw_prob]) m_out = Concatenate(name='pdf')([tensor_preds,real_preds,ulaw_prob])
if not flag_e2e: if not flag_e2e:
model = Model([pcm, feat, pitch, lpcoeffs], m_out) model = Model([pcm, feat, pitch, lpcoeffs], m_out)
@ -315,7 +315,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
model.rnn_units2 = rnn_units2 model.rnn_units2 = rnn_units2
model.nb_used_features = nb_used_features model.nb_used_features = nb_used_features
model.frame_size = frame_size model.frame_size = frame_size
if not flag_e2e: if not flag_e2e:
encoder = Model([feat, pitch], cfeat) encoder = Model([feat, pitch], cfeat)
dec_rnn_in = Concatenate()([cpcm_decoder, dec_feat]) dec_rnn_in = Concatenate()([cpcm_decoder, dec_feat])
@ -330,7 +330,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s
decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2]) decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
else: else:
decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2]) decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
# add parameters to model # add parameters to model
set_parameter(model, 'lpc_gamma', lpc_gamma, dtype='float64') set_parameter(model, 'lpc_gamma', lpc_gamma, dtype='float64')
set_parameter(model, 'flag_e2e', flag_e2e, dtype='bool') set_parameter(model, 'flag_e2e', flag_e2e, dtype='bool')

View file

@ -88,10 +88,10 @@ def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36
gru_out1, _ = rnn(cfeat) gru_out1, _ = rnn(cfeat)
gru_out1 = GaussianNoise(.005)(gru_out1) gru_out1 = GaussianNoise(.005)(gru_out1)
gru_out2, _ = rnn2(gru_out1) gru_out2, _ = rnn2(gru_out1)
out_dense = Dense(nb_used_features, activation='linear', name='plc_out') out_dense = Dense(nb_used_features, activation='linear', name='plc_out')
plc_out = out_dense(gru_out2) plc_out = out_dense(gru_out2)
model = Model([feat, lost], plc_out) model = Model([feat, lost], plc_out)
model.rnn_units = rnn_units model.rnn_units = rnn_units
model.cond_size = cond_size model.cond_size = cond_size

View file

@ -6,7 +6,7 @@ import numpy as np
import math import math
class MDense(Layer): class MDense(Layer):
def __init__(self, outputs, def __init__(self, outputs,
channels=2, channels=2,
activation=None, activation=None,

View file

@ -5,9 +5,9 @@ import tensorflow as tf
def set_parameter(model, parameter_name, parameter_value, dtype='float32'): def set_parameter(model, parameter_name, parameter_value, dtype='float32'):
""" stores parameter_value as non-trainable weight with name parameter_name:0 """ """ stores parameter_value as non-trainable weight with name parameter_name:0 """
weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")] weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")]
if len(weights) == 0: if len(weights) == 0:
model.add_weight(parameter_name, trainable=False, initializer=tf.keras.initializers.Constant(parameter_value), dtype=dtype) model.add_weight(parameter_name, trainable=False, initializer=tf.keras.initializers.Constant(parameter_value), dtype=dtype)
elif len(weights) == 1: elif len(weights) == 1:
@ -15,14 +15,14 @@ def set_parameter(model, parameter_name, parameter_value, dtype='float32'):
else: else:
raise ValueError(f"more than one weight starting with {parameter_name}:0 in model") raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")
def get_parameter(model, parameter_name, default=None): def get_parameter(model, parameter_name, default=None):
""" returns parameter value if parameter is present in model and otherwise default """ """ returns parameter value if parameter is present in model and otherwise default """
weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")] weights = [weight for weight in model.weights if weight.name == (parameter_name + ":0")]
if len(weights) == 0: if len(weights) == 0:
return default return default
elif len(weights) > 1: elif len(weights) > 1:
raise ValueError(f"more than one weight starting with {parameter_name}:0 in model") raise ValueError(f"more than one weight starting with {parameter_name}:0 in model")
else: else:

View file

@ -56,7 +56,7 @@ class PLCLoader(Sequence):
lost_mask = np.tile(lost, (1,1,features.shape[2])) lost_mask = np.tile(lost, (1,1,features.shape[2]))
in_features = features*lost_mask in_features = features*lost_mask
in_features[:,:,:self.nb_burg_features] = in_features[:,:,:self.nb_burg_features]*burg_mask in_features[:,:,:self.nb_burg_features] = in_features[:,:,:self.nb_burg_features]*burg_mask
#For the first frame after a loss, we don't have valid features, but the Burg estimate is valid. #For the first frame after a loss, we don't have valid features, but the Burg estimate is valid.
#in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:] #in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
out_lost = np.copy(lost) out_lost = np.copy(lost)

View file

@ -61,7 +61,7 @@ def soft_quantize(x):
#x = 4*x #x = 4*x
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x) #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x) #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
#x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x) #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
return x return x
def noise_quantize(x): def noise_quantize(x):
@ -237,7 +237,7 @@ def new_rdovae_decoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, ba
bits_input = Input(shape=(None, nb_bits), batch_size=batch_size, name="dec_bits") bits_input = Input(shape=(None, nb_bits), batch_size=batch_size, name="dec_bits")
gru_state_input = Input(shape=(nb_state_dim,), batch_size=batch_size, name="dec_state") gru_state_input = Input(shape=(nb_state_dim,), batch_size=batch_size, name="dec_state")
gru = CuDNNGRU if training else GRU gru = CuDNNGRU if training else GRU
dec_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense1') dec_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense1')
dec_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense2') dec_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense2')
@ -300,7 +300,7 @@ def tensor_concat(x):
y = [] y = []
for i in range(n-1): for i in range(n-1):
offset = 2 * (n-1-i) offset = 2 * (n-1-i)
tmp = K.concatenate([x[i][:, offset:, :], x[-1][:, -offset:, :]], axis=-2) tmp = K.concatenate([x[i][:, offset:, :], x[-1][:, -offset:, :]], axis=-2)
y.append(tf.expand_dims(tmp, axis=0)) y.append(tf.expand_dims(tmp, axis=0))
y.append(tf.expand_dims(x[-1], axis=0)) y.append(tf.expand_dims(x[-1], axis=0))
return Concatenate(axis=0)(y) return Concatenate(axis=0)(y)
@ -335,7 +335,7 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
dze = dzone([ze,dead_zone]) dze = dzone([ze,dead_zone])
ndze = noisequant(dze) ndze = noisequant(dze)
dze_quant = hardquant(dze) dze_quant = hardquant(dze)
div = Lambda(lambda x: x[0]/x[1]) div = Lambda(lambda x: x[0]/x[1])
dze_quant = div([dze_quant,quant_scale]) dze_quant = div([dze_quant,quant_scale])
ndze_unquant = div([ndze,quant_scale]) ndze_unquant = div([ndze,quant_scale])
@ -355,13 +355,13 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
combined_output.append(tmp) combined_output.append(tmp)
tmp = split_decoder([ndze_select, state_select]) tmp = split_decoder([ndze_select, state_select])
tmp = cat([tmp, lambda_up]) tmp = cat([tmp, lambda_up])
unquantized_output.append(tmp) unquantized_output.append(tmp)
concat = Lambda(tensor_concat, name="output") concat = Lambda(tensor_concat, name="output")
combined_output = concat(combined_output) combined_output = concat(combined_output)
unquantized_output = concat(unquantized_output) unquantized_output = concat(unquantized_output)
e2 = Concatenate(name="hard_bits")([dze, hard_distr_embed, lambda_val]) e2 = Concatenate(name="hard_bits")([dze, hard_distr_embed, lambda_val])
e = Concatenate(name="soft_bits")([dze, soft_distr_embed, lambda_val]) e = Concatenate(name="soft_bits")([dze, soft_distr_embed, lambda_val])
@ -370,4 +370,3 @@ def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batc
model.nb_used_features = nb_used_features model.nb_used_features = nb_used_features
return model, encoder, decoder, qembedding return model, encoder, decoder, qembedding

View file

@ -113,7 +113,7 @@ if __name__ == "__main__":
# qembedding # qembedding
print(f"writing layer {exchange_name['qembedding']}...") print(f"writing layer {exchange_name['qembedding']}...")
dump_tf_weights(os.path.join(args.output, exchange_name['qembedding']), qembedding) dump_tf_weights(os.path.join(args.output, exchange_name['qembedding']), qembedding)
# decoder # decoder
decoder_dense_names = [ decoder_dense_names = [
'state1', 'state1',
@ -125,7 +125,7 @@ if __name__ == "__main__":
'dec_dense7', 'dec_dense7',
'dec_dense8', 'dec_dense8',
'dec_final' 'dec_final'
] ]
decoder_gru_names = [ decoder_gru_names = [
'dec_dense2', 'dec_dense2',

View file

@ -79,7 +79,7 @@ exchange_name = {
if __name__ == "__main__": if __name__ == "__main__":
model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels) model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
encoder_layers = [ encoder_layers = [
'enc_dense1', 'enc_dense1',
'enc_dense3', 'enc_dense3',
@ -93,7 +93,7 @@ if __name__ == "__main__":
'enc_dense6', 'enc_dense6',
'bits_dense' 'bits_dense'
] ]
decoder_layers = [ decoder_layers = [
'state1', 'state1',
'state2', 'state2',
@ -108,16 +108,16 @@ if __name__ == "__main__":
'dec_dense4', 'dec_dense4',
'dec_dense6' 'dec_dense6'
] ]
for name in encoder_layers: for name in encoder_layers:
print(f"loading weight for layer {name}...") print(f"loading weight for layer {name}...")
load_tf_weights(os.path.join(args.input, exchange_name[name]), encoder.get_layer(name)) load_tf_weights(os.path.join(args.input, exchange_name[name]), encoder.get_layer(name))
print(f"loading weight for layer qembedding...") print(f"loading weight for layer qembedding...")
load_tf_weights(os.path.join(args.input, exchange_name['qembedding']), qembedding) load_tf_weights(os.path.join(args.input, exchange_name['qembedding']), qembedding)
for name in decoder_layers: for name in decoder_layers:
print(f"loading weight for layer {name}...") print(f"loading weight for layer {name}...")
load_tf_weights(os.path.join(args.input, exchange_name[name]), decoder.get_layer(name)) load_tf_weights(os.path.join(args.input, exchange_name[name]), decoder.get_layer(name))
model.save(args.weights) model.save(args.weights)

View file

@ -118,5 +118,3 @@ for c in range(0, nb_frames):
#print(mem) #print(mem)
np.array([np.round(mem)], dtype='int16').tofile(fout) np.array([np.round(mem)], dtype='int16').tofile(fout)
skip = 0 skip = 0

View file

@ -36,12 +36,12 @@ class diff_pred(Layer):
rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1)) rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1))
zpX = Lambda(lambda x: K.concatenate([0*x[:,0:lpcoeffs_N,:], x],axis = 1)) zpX = Lambda(lambda x: K.concatenate([0*x[:,0:lpcoeffs_N,:], x],axis = 1))
cX = Lambda(lambda x: K.concatenate([x[:,(lpcoeffs_N - i):(lpcoeffs_N - i + 2400),:] for i in range(lpcoeffs_N)],axis = 2)) cX = Lambda(lambda x: K.concatenate([x[:,(lpcoeffs_N - i):(lpcoeffs_N - i + 2400),:] for i in range(lpcoeffs_N)],axis = 2))
pred = -Multiply()([rept(lpc),cX(zpX(xt))]) pred = -Multiply()([rept(lpc),cX(zpX(xt))])
return K.sum(pred,axis = 2,keepdims = True) return K.sum(pred,axis = 2,keepdims = True)
# Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion # Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion
class diff_rc2lpc(Layer): class diff_rc2lpc(Layer):
def call(self, inputs, lpcoeffs_N = 16): def call(self, inputs, lpcoeffs_N = 16):
def pred_lpc_recursive(input): def pred_lpc_recursive(input):

View file

@ -134,7 +134,7 @@ strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
with strategy.scope(): with strategy.scope():
model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size, model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size,
rnn_units2=args.grub_size, rnn_units2=args.grub_size,
batch_size=batch_size, training=True, batch_size=batch_size, training=True,
quantize=quantize, quantize=quantize,
flag_e2e=flag_e2e, flag_e2e=flag_e2e,

View file

@ -200,14 +200,14 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
for (i=0;i<rows;i+=16) for (i=0;i<rows;i+=16)
{ {
float * restrict y = &out[i]; float * restrict y = &out[i];
/* keep y[0..15] in registers for duration of inner loop */ /* keep y[0..15] in registers for duration of inner loop */
float32x4_t y0_3 = vld1q_f32(&y[0]); float32x4_t y0_3 = vld1q_f32(&y[0]);
float32x4_t y4_7 = vld1q_f32(&y[4]); float32x4_t y4_7 = vld1q_f32(&y[4]);
float32x4_t y8_11 = vld1q_f32(&y[8]); float32x4_t y8_11 = vld1q_f32(&y[8]);
float32x4_t y12_15 = vld1q_f32(&y[12]); float32x4_t y12_15 = vld1q_f32(&y[12]);
for (j=0;j<cols;j++) for (j=0;j<cols;j++)
{ {
const float * restrict w; const float * restrict w;
@ -219,9 +219,9 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
wvec4_7 = vld1q_f32(&w[4]); wvec4_7 = vld1q_f32(&w[4]);
wvec8_11 = vld1q_f32(&w[8]); wvec8_11 = vld1q_f32(&w[8]);
wvec12_15 = vld1q_f32(&w[12]); wvec12_15 = vld1q_f32(&w[12]);
xj = vld1q_dup_f32(&x[j]); xj = vld1q_dup_f32(&x[j]);
y0_3 = vmlaq_f32(y0_3, wvec0_3, xj); y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
y4_7 = vmlaq_f32(y4_7, wvec4_7, xj); y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
y8_11 = vmlaq_f32(y8_11, wvec8_11, xj); y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
@ -229,12 +229,12 @@ static inline void sgemv_accum16(float *out, const float *weights, int rows, int
} }
/* save y[0..15] back to memory */ /* save y[0..15] back to memory */
vst1q_f32(&y[0], y0_3); vst1q_f32(&y[0], y0_3);
vst1q_f32(&y[4], y4_7); vst1q_f32(&y[4], y4_7);
vst1q_f32(&y[8], y8_11); vst1q_f32(&y[8], y8_11);
vst1q_f32(&y[12], y12_15); vst1q_f32(&y[12], y12_15);
} }
} }
@ -249,32 +249,32 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
y = &out[i]; y = &out[i];
/* keep y[0..15] in registers for duration of inner loop */ /* keep y[0..15] in registers for duration of inner loop */
float32x4_t y0_3 = vld1q_f32(&y[0]); float32x4_t y0_3 = vld1q_f32(&y[0]);
float32x4_t y4_7 = vld1q_f32(&y[4]); float32x4_t y4_7 = vld1q_f32(&y[4]);
float32x4_t y8_11 = vld1q_f32(&y[8]); float32x4_t y8_11 = vld1q_f32(&y[8]);
float32x4_t y12_15 = vld1q_f32(&y[12]); float32x4_t y12_15 = vld1q_f32(&y[12]);
for (j=0;j<cols;j++) for (j=0;j<cols;j++)
{ {
float32x4_t xj= vld1q_dup_f32(&x[*idx++]); float32x4_t xj= vld1q_dup_f32(&x[*idx++]);
float32x4_t wvec; float32x4_t wvec;
wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_f32(y0_3, wvec, xj); wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_f32(y0_3, wvec, xj);
wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_f32(y4_7, wvec, xj); wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_f32(y4_7, wvec, xj);
wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_f32(y8_11, wvec, xj); wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_f32(y8_11, wvec, xj);
wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_f32(y12_15, wvec, xj); wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_f32(y12_15, wvec, xj);
w += 16; w += 16;
} }
/* save y[0..15] back to memory */ /* save y[0..15] back to memory */
vst1q_f32(&y[0], y0_3); vst1q_f32(&y[0], y0_3);
vst1q_f32(&y[4], y4_7); vst1q_f32(&y[4], y4_7);
vst1q_f32(&y[8], y8_11); vst1q_f32(&y[8], y8_11);
vst1q_f32(&y[12], y12_15); vst1q_f32(&y[12], y12_15);
} }
} }