From 144b7311bc84ff4985b692642a633fb29f990b81 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Wed, 13 Oct 2021 23:18:57 -0400 Subject: [PATCH] Dumping 16-bit linear training data --- dnn/dump_data.c | 30 +++++++----------------- dnn/training_tf2/dataloader.py | 18 +++++++++------ dnn/training_tf2/dump_lpcnet.py | 2 +- dnn/training_tf2/lossfuncs.py | 14 +++++++----- dnn/training_tf2/lpcnet.py | 39 +++++++++++++++++--------------- dnn/training_tf2/tf_funcs.py | 4 ++-- dnn/training_tf2/train_lpcnet.py | 16 ++++++------- 7 files changed, 58 insertions(+), 65 deletions(-) diff --git a/dnn/dump_data.c b/dnn/dump_data.c index a97cec36..99e040cc 100644 --- a/dnn/dump_data.c +++ b/dnn/dump_data.c @@ -75,28 +75,20 @@ void compute_noise(int *noise, float noise_std) { } -void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes, int e2e) { +void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes) { int i, k; for (k=0;kfeatures[k][NB_BANDS+2+j]*st->sig_mem[j]; e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p); - /* Signal. */ - data[4*i] = lin2ulaw(st->sig_mem[0]); - /* Prediction. */ - data[4*i+1] = lin2ulaw(p); - /* Excitation in. */ - data[4*i+2] = st->exc_mem; - /* Excitation out. */ - if (e2e) { - data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]); - } else { - data[4*i+3] = e; - } + /* Signal in. */ + data[2*i] = st->sig_mem[0]; + /* Signal out. */ + data[2*i+1] = pcm[k*FRAME_SIZE+i]; /* Simulate error on excitation. */ e += noise[k*FRAME_SIZE+i]; e = IMIN(255, IMAX(0, e)); @@ -119,7 +111,6 @@ static short float2short(float x) int main(int argc, char **argv) { int i; char *argv0; - int e2e=0; int count=0; static const float a_hp[2] = {-1.99599, 0.99600}; static const float b_hp[2] = {-2, 1}; @@ -151,11 +142,6 @@ int main(int argc, char **argv) { srand(getpid()); st = lpcnet_encoder_create(); argv0=argv[0]; - if (argc > 2 && strcmp(argv[1], "-end2end")==0) { - e2e = 1; - argv++; - argc--; - } if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1; if (argc == 5 && strcmp(argv[1], "-qtrain")==0) { training = 1; @@ -281,7 +267,7 @@ int main(int argc, char **argv) { if (!quantize) { process_single_frame(st, ffeat); - if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1, e2e); + if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1); } st->pcount++; /* Running on groups of 4 frames. */ @@ -289,7 +275,7 @@ int main(int argc, char **argv) { if (quantize) { unsigned char buf[8]; process_superframe(st, buf, ffeat, encode, quantize); - if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4, e2e); + if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4); } st->pcount = 0; } diff --git a/dnn/training_tf2/dataloader.py b/dnn/training_tf2/dataloader.py index 93c38255..79360a26 100644 --- a/dnn/training_tf2/dataloader.py +++ b/dnn/training_tf2/dataloader.py @@ -1,5 +1,6 @@ import numpy as np from tensorflow.keras.utils import Sequence +from ulaw import lin2ulaw def lpc2rc(lpc): #print("shape is = ", lpc.shape) @@ -12,13 +13,13 @@ def lpc2rc(lpc): return rc class LPCNetLoader(Sequence): - def __init__(self, data, features, periods, batch_size, lpc_out=False): + def __init__(self, data, features, periods, batch_size, e2e=False): self.batch_size = batch_size self.nb_batches = np.minimum(np.minimum(data.shape[0], features.shape[0]), periods.shape[0])//self.batch_size self.data = data[:self.nb_batches*self.batch_size, :] self.features = features[:self.nb_batches*self.batch_size, :] self.periods = periods[:self.nb_batches*self.batch_size, :] - self.lpc_out = lpc_out + self.e2e = e2e self.on_epoch_end() def on_epoch_end(self): @@ -27,15 +28,18 @@ class LPCNetLoader(Sequence): def __getitem__(self, index): data = self.data[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :] - in_data = data[: , :, :3] - out_data = data[: , :, 3:4] + in_data = data[: , :, :1] + out_data = data[: , :, 1:] features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :-16] periods = self.periods[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :] outputs = [out_data] - if self.lpc_out: - lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 2:-2, -16:] + inputs = [in_data, features, periods] + lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 2:-2, -16:] + if self.e2e: outputs.append(lpc2rc(lpc)) - return ([in_data, features, periods], outputs) + else: + inputs.append(lpc) + return (inputs, outputs) def __len__(self): return self.nb_batches diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py index 768e41b3..d2fd096f 100755 --- a/dnn/training_tf2/dump_lpcnet.py +++ b/dnn/training_tf2/dump_lpcnet.py @@ -252,7 +252,7 @@ with h5py.File(filename, "r") as f: cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape) e2e = 'rc2lpc' in f['model_weights'] -model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = flag_e2e, cond_size=cond_size) +model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = e2e, cond_size=cond_size) model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy']) #model.summary() diff --git a/dnn/training_tf2/lossfuncs.py b/dnn/training_tf2/lossfuncs.py index f858f354..fae285b2 100644 --- a/dnn/training_tf2/lossfuncs.py +++ b/dnn/training_tf2/lossfuncs.py @@ -12,7 +12,7 @@ def res_from_sigloss(): def loss(y_true,y_pred): p = y_pred[:,:,0:1] model_out = y_pred[:,:,1:] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) e_gt = tf.round(e_gt) e_gt = tf.cast(e_gt,'int32') sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out) @@ -24,9 +24,10 @@ def res_from_sigloss(): # Also adds a probability compensation (to account for matching cross entropy in the linear domain), weighted by gamma def interp_mulaw(gamma = 1): def loss(y_true,y_pred): + y_true = tf.cast(y_true, 'float32') p = y_pred[:,:,0:1] model_out = y_pred[:,:,1:] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0)) alpha = e_gt - tf.math.floor(e_gt) alpha = tf.tile(alpha,[1,1,256]) @@ -42,7 +43,7 @@ def interp_mulaw(gamma = 1): def metric_oginterploss(y_true,y_pred): p = y_pred[:,:,0:1] model_out = y_pred[:,:,1:] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0)) alpha = e_gt - tf.math.floor(e_gt) alpha = tf.tile(alpha,[1,1,256]) @@ -57,7 +58,7 @@ def metric_oginterploss(y_true,y_pred): def metric_icel(y_true, y_pred): p = y_pred[:,:,0:1] model_out = y_pred[:,:,1:] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) alpha = e_gt - tf.math.floor(e_gt) alpha = tf.tile(alpha,[1,1,256]) e_gt = tf.cast(e_gt,'int32') @@ -68,9 +69,10 @@ def metric_icel(y_true, y_pred): # Non-interpolated (rounded) cross entropy loss metric def metric_cel(y_true, y_pred): + y_true = tf.cast(y_true, 'float32') p = y_pred[:,:,0:1] model_out = y_pred[:,:,1:] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) e_gt = tf.round(e_gt) e_gt = tf.cast(e_gt,'int32') e_gt = tf.clip_by_value(e_gt,0,255) @@ -80,7 +82,7 @@ def metric_cel(y_true, y_pred): # Variance metric of the output excitation def metric_exc_sd(y_true,y_pred): p = y_pred[:,:,0:1] - e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p)) + e_gt = tf_l2u(y_true - p) sd_egt = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)(e_gt,128) return sd_egt diff --git a/dnn/training_tf2/lpcnet.py b/dnn/training_tf2/lpcnet.py index 3a62e4c1..8e83b42c 100644 --- a/dnn/training_tf2/lpcnet.py +++ b/dnn/training_tf2/lpcnet.py @@ -230,8 +230,9 @@ class WeightClip(Constraint): constraint = WeightClip(0.992) -def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, flag_e2e = False, cond_size=128): - pcm = Input(shape=(None, 3), batch_size=batch_size) +def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, flag_e2e = False, cond_size=128, lpc_order=16): + pcm = Input(shape=(None, 1), batch_size=batch_size) + dpcm = Input(shape=(None, 3), batch_size=batch_size) feat = Input(shape=(None, nb_used_features), batch_size=batch_size) pitch = Input(shape=(None, 1), batch_size=batch_size) dec_feat = Input(shape=(None, cond_size)) @@ -257,20 +258,19 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s cfeat = fdense2(fdense1(cfeat)) - if not flag_e2e: - embed = Embedding(256, embed_size, embeddings_initializer=PCMInit(), name='embed_sig') - cpcm = Reshape((-1, embed_size*3))(embed(pcm)) - else: - Input_extractor = Lambda(lambda x: K.expand_dims(x[0][:,:,x[1]],axis = -1)) - error_calc = Lambda(lambda x: tf_l2u(tf_u2l(x[0]) - tf.roll(tf_u2l(x[1]),1,axis = 1))) + Input_extractor = Lambda(lambda x: K.expand_dims(x[0][:,:,x[1]],axis = -1)) + error_calc = Lambda(lambda x: tf_l2u(x[0] - tf.roll(x[1],1,axis = 1))) + if flag_e2e: lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat) - tensor_preds = diff_pred(name = "lpc2preds")([Input_extractor([pcm,0]),lpcoeffs]) - past_errors = error_calc([Input_extractor([pcm,0]),tensor_preds]) - embed = diff_Embed(name='embed_sig',initializer = PCMInit()) - cpcm = Concatenate()([Input_extractor([pcm,0]),tensor_preds,past_errors]) - cpcm = Reshape((-1, embed_size*3))(embed(cpcm)) - cpcm_decoder = Concatenate()([Input_extractor([pcm,0]),Input_extractor([pcm,1]),Input_extractor([pcm,2])]) - cpcm_decoder = Reshape((-1, embed_size*3))(embed(cpcm_decoder)) + else: + lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size) + tensor_preds = diff_pred(name = "lpc2preds")([Input_extractor([pcm,0]),lpcoeffs]) + past_errors = error_calc([Input_extractor([pcm,0]),tensor_preds]) + embed = diff_Embed(name='embed_sig',initializer = PCMInit()) + cpcm = Concatenate()([tf_l2u(Input_extractor([pcm,0])),tf_l2u(tensor_preds),past_errors]) + cpcm = Reshape((-1, embed_size*3))(embed(cpcm)) + cpcm_decoder = Concatenate()([Input_extractor([dpcm,0]),Input_extractor([dpcm,1]),Input_extractor([dpcm,2])]) + cpcm_decoder = Reshape((-1, embed_size*3))(embed(cpcm_decoder)) rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1)) @@ -301,10 +301,10 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s md.trainable=False embed.Trainable=False + m_out = Concatenate(name='pdf')([tensor_preds,ulaw_prob]) if not flag_e2e: - model = Model([pcm, feat, pitch], ulaw_prob) + model = Model([pcm, feat, pitch, lpcoeffs], m_out) else: - m_out = Concatenate(name='pdf')([tensor_preds,ulaw_prob]) model = Model([pcm, feat, pitch], [m_out, cfeat]) model.rnn_units1 = rnn_units1 model.rnn_units2 = rnn_units2 @@ -321,5 +321,8 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_s dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2) dec_ulaw_prob = Lambda(tree_to_pdf_infer)(md(dec_gru_out2)) - decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2]) + if flag_e2e: + decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2]) + else: + decoder = Model([pcm, dec_feat, dec_state1, dec_state2, lpcoeffs], [dec_ulaw_prob, state1, state2]) return model, encoder, decoder diff --git a/dnn/training_tf2/tf_funcs.py b/dnn/training_tf2/tf_funcs.py index 467c4138..5e065012 100644 --- a/dnn/training_tf2/tf_funcs.py +++ b/dnn/training_tf2/tf_funcs.py @@ -30,7 +30,7 @@ def tf_u2l(u): # The inputs xt and lpc conform with the shapes in lpcnet.py (the '2400' is coded keeping this in mind) class diff_pred(Layer): def call(self, inputs, lpcoeffs_N = 16, frame_size = 160): - xt = tf_u2l(inputs[0]) + xt = inputs[0] lpc = inputs[1] rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1)) @@ -39,7 +39,7 @@ class diff_pred(Layer): pred = -Multiply()([rept(lpc),cX(zpX(xt))]) - return tf_l2u(K.sum(pred,axis = 2,keepdims = True)) + return K.sum(pred,axis = 2,keepdims = True) # Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion class diff_rc2lpc(Layer): diff --git a/dnn/training_tf2/train_lpcnet.py b/dnn/training_tf2/train_lpcnet.py index bd0486be..9b827be1 100755 --- a/dnn/training_tf2/train_lpcnet.py +++ b/dnn/training_tf2/train_lpcnet.py @@ -125,7 +125,7 @@ strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() with strategy.scope(): model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size, rnn_units2=args.grub_size, batch_size=batch_size, training=True, quantize=quantize, flag_e2e = flag_e2e, cond_size=args.cond_size) if not flag_e2e: - model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics='sparse_categorical_crossentropy') + model.compile(optimizer=opt, loss=metric_cel, metrics=metric_cel) else: model.compile(optimizer=opt, loss = [interp_mulaw(gamma=gamma), loss_matchlar()], loss_weights = [1.0, 2.0], metrics={'pdf':[metric_cel,metric_icel,metric_exc_sd,metric_oginterploss]}) model.summary() @@ -140,19 +140,17 @@ pcm_chunk_size = frame_size*feature_chunk_size # u for unquantised, load 16 bit PCM samples and convert to mu-law -data = np.memmap(pcm_file, dtype='uint8', mode='r') -nb_frames = (len(data)//(4*pcm_chunk_size)-1)//batch_size*batch_size +data = np.memmap(pcm_file, dtype='int16', mode='r') +nb_frames = (len(data)//(2*pcm_chunk_size)-1)//batch_size*batch_size features = np.memmap(feature_file, dtype='float32', mode='r') # limit to discrete number of frames -data = data[4*2*frame_size:] -data = data[:nb_frames*4*pcm_chunk_size] +data = data[2*2*frame_size:] +data = data[:nb_frames*2*pcm_chunk_size] -data = np.reshape(data, (nb_frames, pcm_chunk_size, 4)) -#in_data = data[:,:,:3] -#out_exc = data[:,:,3:4] +data = np.reshape(data, (nb_frames, pcm_chunk_size, 2)) #print("ulaw std = ", np.std(out_exc)) @@ -187,7 +185,7 @@ else: model.save_weights('{}_{}_initial.h5'.format(args.output, args.grua_size)) -loader = LPCNetLoader(data, features, periods, batch_size, lpc_out=flag_e2e) +loader = LPCNetLoader(data, features, periods, batch_size, e2e=flag_e2e) callbacks = [checkpoint, sparsify, grub_sparsify] if args.logdir is not None: