diff --git a/dnn/lpcnet.py b/dnn/lpcnet.py
index b07292dc..0ead18e5 100644
--- a/dnn/lpcnet.py
+++ b/dnn/lpcnet.py
@@ -10,7 +10,8 @@ import numpy as np
 import h5py
 import sys
 
-rnn_units=128
+rnn_units1=128
+rnn_units2=32
 pcm_bits = 8
 embed_size = 128
 pcm_levels = 2**pcm_bits
@@ -47,7 +48,8 @@ def new_wavernn_model():
     feat = Input(shape=(None, nb_used_features))
     pitch = Input(shape=(None, 1))
     dec_feat = Input(shape=(None, 128))
-    dec_state = Input(shape=(rnn_units,))
+    dec_state1 = Input(shape=(rnn_units1,))
+    dec_state2 = Input(shape=(rnn_units2,))
 
     fconv1 = Conv1D(128, 3, padding='same', activation='tanh')
     fconv2 = Conv1D(102, 3, padding='same', activation='tanh')
@@ -70,18 +72,21 @@ def new_wavernn_model():
 
     rep = Lambda(lambda x: K.repeat_elements(x, 160, 1))
 
-    rnn = CuDNNGRU(rnn_units, return_sequences=True, return_state=True)
+    rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True)
+    rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True)
 
     rnn_in = Concatenate()([cpcm, cexc, rep(cfeat)])
     md = MDense(pcm_levels, activation='softmax')
-    gru_out, state = rnn(rnn_in)
-    ulaw_prob = md(gru_out)
+    gru_out1, _ = rnn(rnn_in)
+    gru_out2, _ = rnn2(gru_out1)
+    ulaw_prob = md(gru_out2)
 
     model = Model([pcm, exc, feat, pitch], ulaw_prob)
     encoder = Model([feat, pitch], cfeat)
 
     dec_rnn_in = Concatenate()([cpcm, cexc, dec_feat])
-    dec_gru_out, state = rnn(dec_rnn_in, initial_state=dec_state)
-    dec_ulaw_prob = md(dec_gru_out)
+    dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)
+    dec_gru_out2, state2 = rnn2(dec_gru_out1, initial_state=dec_state2)
+    dec_ulaw_prob = md(dec_gru_out2)
-    decoder = Model([pcm, exc, dec_feat, dec_state], [dec_ulaw_prob, state])
+    decoder = Model([pcm, exc, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
     return model, encoder, decoder
diff --git a/dnn/test_wavenet_audio.py b/dnn/test_wavenet_audio.py
index 68d788cc..a8817d8a 100755
--- a/dnn/test_wavenet_audio.py
+++ b/dnn/test_wavenet_audio.py
@@ -59,14 +59,15 @@
 in_data = np.reshape(in_data, (nb_frames*pcm_chunk_size, 1))
 out_data = np.reshape(data, (nb_frames*pcm_chunk_size, 1))
 
-model.load_weights('wavenet4f2_30.h5')
+model.load_weights('wavenet5d0_19.h5')
 
 order = 16
 
 pcm = 0.*out_data
 fexc = np.zeros((1, 1, 2), dtype='float32')
 iexc = np.zeros((1, 1, 1), dtype='int16')
-state = np.zeros((1, lpcnet.rnn_units), dtype='float32')
+state1 = np.zeros((1, lpcnet.rnn_units1), dtype='float32')
+state2 = np.zeros((1, lpcnet.rnn_units2), dtype='float32')
 for c in range(1, nb_frames):
     cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
     for fr in range(1, feature_chunk_size):
@@ -82,7 +83,7 @@ for c in range(1, nb_frames):
             pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1, 0])
 
             fexc[0, 0, 1] = lin2ulaw(pred)
-            p, state = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state])
+            p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2])
             #p = p*p
             #p = p/(1e-18 + np.sum(p))
             p = np.maximum(p-0.001, 0).astype('float64')
diff --git a/dnn/train_wavenet_audio.py b/dnn/train_wavenet_audio.py
index 229f02de..3ad26db7 100755
--- a/dnn/train_wavenet_audio.py
+++ b/dnn/train_wavenet_audio.py
@@ -86,7 +86,7 @@ periods = (50*features[:,:,36:37]+100).astype('int16')
 
 in_data = np.concatenate([in_data, pred], axis=-1)
 
-checkpoint = ModelCheckpoint('wavenet5b_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('wavenet5d0_{epoch:02d}.h5')
 
 #model.load_weights('wavenet4f2_30.h5')
 model.compile(optimizer=Adam(0.001, amsgrad=True, decay=2e-4), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
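
For context, a minimal sketch of driving the reworked decoder after this change. It assumes the module layout shown in the diff (lpcnet.new_wavernn_model(), rnn_units1/rnn_units2, nb_used_features); dummy_feat and dummy_pitch are placeholder inputs invented here just to exercise the plumbing, and the greedy argmax stands in for the probability shaping and sampling that test_wavenet_audio.py actually performs.

# Hypothetical decode-loop sketch, not part of the patch: shows how the
# decoder is driven now that it carries two GRU states instead of one.
import numpy as np
import lpcnet

model, enc, dec = lpcnet.new_wavernn_model()
# model.load_weights('wavenet5d0_19.h5')  # the weights the test script loads

# Placeholder conditioning inputs for a single feature frame, for illustration only.
dummy_feat = np.zeros((1, 1, lpcnet.nb_used_features), dtype='float32')
dummy_pitch = np.zeros((1, 1, 1), dtype='int16')
cfeat = enc.predict([dummy_feat, dummy_pitch])

fexc = np.zeros((1, 1, 2), dtype='float32')
iexc = np.zeros((1, 1, 1), dtype='int16')
# One state vector per GRU: rnn_units1=128 for the first, rnn_units2=32 for the second.
state1 = np.zeros((1, lpcnet.rnn_units1), dtype='float32')
state2 = np.zeros((1, lpcnet.rnn_units2), dtype='float32')

for i in range(160):  # one 160-sample frame (10 ms at 16 kHz)
    # The decoder now consumes and returns both states on every sample.
    p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, 0:1, :], state1, state2])
    iexc[0, 0, 0] = np.argmax(p[0, 0, :])  # greedy pick; the real script samples from p

Note that the training model discards the recurrent states (the "_" returns), while the decoder threads state1 and state2 through every call so both recurrences stay continuous across samples; forgetting to feed either one back would silently reset that layer on each step.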