audio-domain synthesis

This commit is contained in:
Jean-Marc Valin 2018-07-31 18:37:27 -04:00
parent 4cf2b2705a
commit 70789e6f43
4 changed files with 138 additions and 3 deletions

View file

@ -4,6 +4,7 @@ import math
from keras.models import Model from keras.models import Model
from keras.layers import Input, LSTM, CuDNNGRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Bidirectional, MaxPooling1D, Activation from keras.layers import Input, LSTM, CuDNNGRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Bidirectional, MaxPooling1D, Activation
from keras import backend as K from keras import backend as K
from keras.initializers import Initializer
from mdense import MDense from mdense import MDense
import numpy as np import numpy as np
import h5py import h5py
@ -14,6 +15,30 @@ pcm_bits = 8
pcm_levels = 2**pcm_bits pcm_levels = 2**pcm_bits
nb_used_features = 38 nb_used_features = 38
class PCMInit(Initializer):
    """Weight initializer: uniform noise plus a linear ramp across rows.

    All leading dimensions of the requested shape are flattened into
    "rows" and the last dimension is treated as "columns".  Every row gets
    uniform noise in [-1.7321, 1.7321) plus a constant offset that grows
    linearly with the row index (centred on zero, spanning roughly
    +/- sqrt(3)).  The sum is scaled by ``gain``.

    Args:
        gain: Multiplier applied to the generated values.
        seed: Optional integer seed.  When given, the noise is drawn from
            a private ``RandomState`` so that NumPy's global random state
            is left untouched.
    """

    def __init__(self, gain=.1, seed=None):
        self.gain = gain
        self.seed = seed

    def __call__(self, shape, dtype=None):
        """Return the initial values as an array of shape ``shape``.

        ``dtype`` is accepted for interface compatibility but is not used;
        the result is a float64 NumPy array.
        """
        out_shape = tuple(int(dim) for dim in shape)
        num_cols = out_shape[-1]
        num_rows = 1
        for dim in out_shape[:-1]:
            num_rows *= dim

        # A private generator yields the same draws as reseeding the global
        # one, without disturbing random state used elsewhere.
        if self.seed is not None:
            rng = np.random.RandomState(self.seed)
        else:
            rng = np.random
        noise = rng.uniform(-1.7321, 1.7321, (num_rows, num_cols))

        # One offset per row: -(n-1)/2, ..., (n-1)/2, scaled by sqrt(12)/n.
        # Built from integer indices so the element count never depends on
        # floating-point rounding of an arange() end point.
        ramp = math.sqrt(12) * (np.arange(num_rows) - .5 * (num_rows - 1)) / num_rows
        values = noise + np.reshape(ramp, (num_rows, 1))

        # Restore the requested shape (a no-op for the 2-D case).
        return self.gain * np.reshape(values, out_shape)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this object."""
        return {
            'gain': self.gain,
            'seed': self.seed
        }
def new_wavernn_model(): def new_wavernn_model():
pcm = Input(shape=(None, 1)) pcm = Input(shape=(None, 1))
@ -35,6 +60,10 @@ def new_wavernn_model():
cpcm = pcm cpcm = pcm
cpitch = pitch cpitch = pitch
embed = Embedding(256, 128, embeddings_initializer=PCMInit())
cpcm = Reshape((-1, 128))(embed(pcm))
cfeat = fconv2(fconv1(feat)) cfeat = fconv2(fconv1(feat))
rep = Lambda(lambda x: K.repeat_elements(x, 160, 1)) rep = Lambda(lambda x: K.repeat_elements(x, 160, 1))

103
dnn/test_wavenet_audio.py Executable file
View file

@ -0,0 +1,103 @@
#!/usr/bin/python3
import wavenet
import lpcnet
import sys
import numpy as np
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from ulaw import ulaw2lin, lin2ulaw
import keras.backend as K
import h5py
#import tensorflow as tf
#from keras.backend.tensorflow_backend import set_session
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.44
#set_session(tf.Session(config=config))
# Sample-by-sample synthesis driver.
# Usage: test_wavenet_audio.py <pcm file (int16)> <feature file (float32)>
# Loads trained weights, runs the encoder once per chunk of feature frames and
# then the decoder once per output sample, feeding each sampled value back in.

# NOTE: nb_epochs and batch_size are not used anywhere below.
nb_epochs = 40
batch_size = 64

#model = wavenet.new_wavenet_model(fftnet=True)
model, enc, dec = lpcnet.new_wavernn_model()
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.summary()

pcmfile = sys.argv[1]
feature_file = sys.argv[2]
frame_size = 160
nb_features = 54
nb_used_features = wavenet.nb_used_features
feature_chunk_size = 15
pcm_chunk_size = frame_size*feature_chunk_size

# Read the raw 16-bit samples, drop the first 80, scale to [-1, 1) and pass
# them through lin2ulaw; the result is capped at 127.
data = np.fromfile(pcmfile, dtype='int16')
data = np.minimum(127, lin2ulaw(data[80:]/32768.))
nb_frames = len(data)//pcm_chunk_size

features = np.fromfile(feature_file, dtype='float32')

# Truncate both streams to a whole number of chunks.
data = data[:nb_frames*pcm_chunk_size]
features = features[:nb_frames*feature_chunk_size*nb_features]

# Input sequence is the target sequence delayed by one sample.
in_data = np.concatenate([data[0:1], data[:-1]])

features = np.reshape(features, (nb_frames*feature_chunk_size, nb_features))

# Build a signal delayed by a per-frame period derived from feature column 36
# (presumably a pitch feature -- TODO confirm against the feature extractor).
# NOTE: `pitch` is rebound further down and `in_pitch` is never read again.
pitch = 1.*data
pitch[:320] = 0
for i in range(2, nb_frames*feature_chunk_size):
    period = int(50*features[i,36]+100)
    period = period - 4
    pitch[i*frame_size:(i+1)*frame_size] = data[i*frame_size-period:(i+1)*frame_size-period]
in_pitch = np.reshape(pitch/16., (nb_frames, pcm_chunk_size, 1))

in_data = np.reshape(in_data, (nb_frames, pcm_chunk_size, 1))
in_data = (in_data.astype('int16')+128).astype('uint8')
out_data = np.reshape(data, (nb_frames, pcm_chunk_size, 1))
out_data = (out_data.astype('int16')+128).astype('uint8')
features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
features = features[:, :, :nb_used_features]

in_data = np.reshape(in_data, (nb_frames*pcm_chunk_size, 1))
# NOTE: this rebuilds out_data from `data` (no +128 offset), replacing the
# uint8 version computed above.
out_data = np.reshape(data, (nb_frames*pcm_chunk_size, 1))

model.load_weights('wavenet3e_30.h5')

order = 16  # not used below

pcm = 0.*out_data
exc = out_data-0  # working copy that the loop overwrites sample by sample

# Single-element buffers reused for every decoder call.
pitch = np.zeros((1, 1, 1), dtype='float32')
fexc = np.zeros((1, 1, 1), dtype='float32')
iexc = np.zeros((1, 1, 1), dtype='int16')
state = np.zeros((1, lpcnet.rnn_units), dtype='float32')
for c in range(1, nb_frames):
    cfeat = enc.predict(features[c:c+1, :, :nb_used_features])
    for fr in range(1, feature_chunk_size):
        f = c*feature_chunk_size + fr
        # `features` was already cut to nb_used_features columns above, so
        # this slice is empty; `a` and `gain` are not used below.
        a = features[c, fr, nb_used_features:]
        gain = 1.
        period = int(50*features[c, fr, 36]+100)
        period = period - 4
        for i in range(frame_size):
            # NOTE: `pitch` is filled here but is not passed to the decoder.
            pitch[0, 0, 0] = exc[f*frame_size + i - period, 0]
            # Feed back the previously sampled value, offset by 128.  Index
            # the element explicitly rather than assigning the whole
            # (1, 1, 1) array into a scalar slot.
            fexc[0, 0, 0] = iexc[0, 0, 0] + 128
            #fexc[0, 0, 0] = in_data[f*frame_size + i, 0]
            p, state = dec.predict([fexc, cfeat[:, fr:fr+1, :], state])
            # Renormalise; the small epsilon keeps the total just below 1.
            p = p/(1e-5 + np.sum(p))
            # Draw one class index from the predicted distribution.
            iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))-128
            exc[f*frame_size + i] = iexc[0, 0, 0]/16.
            #out_data[f*frame_size + i, 0] = iexc[0, 0, 0]
            pcm[f*frame_size + i, 0] = 32768*ulaw2lin(iexc[0, 0, 0]*1.0)
            print(iexc[0, 0, 0], out_data[f*frame_size + i, 0], pcm[f*frame_size + i, 0])

View file

@ -1,6 +1,7 @@
#!/usr/bin/python3 #!/usr/bin/python3
import wavenet import wavenet
import lpcnet
import sys import sys
import numpy as np import numpy as np
from keras.optimizers import Adam from keras.optimizers import Adam
@ -18,7 +19,9 @@ import h5py
nb_epochs = 40 nb_epochs = 40
batch_size = 64 batch_size = 64
model = wavenet.new_wavenet_model(fftnet=True) #model = wavenet.new_wavenet_model(fftnet=True)
model, _, _ = lpcnet.new_wavernn_model()
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy']) model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.summary() model.summary()
@ -64,7 +67,7 @@ features = features[:, :, :nb_used_features]
# f.create_dataset('data', data=in_data[:50000, :, :]) # f.create_dataset('data', data=in_data[:50000, :, :])
# f.create_dataset('feat', data=features[:50000, :, :]) # f.create_dataset('feat', data=features[:50000, :, :])
checkpoint = ModelCheckpoint('wavenet3c_{epoch:02d}.h5') checkpoint = ModelCheckpoint('wavenet3e_{epoch:02d}.h5')
#model.load_weights('wavernn1c_01.h5') #model.load_weights('wavernn1c_01.h5')
model.compile(optimizer=Adam(0.001, amsgrad=True, decay=2e-4), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy']) model.compile(optimizer=Adam(0.001, amsgrad=True, decay=2e-4), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

View file

@ -5,7 +5,7 @@ import math
def ulaw2lin(u):
    """Expand a companded value back to the linear domain.

    Computes ``sign(u) * (256 ** (|u| / 128) - 1) / 255``, so an input of
    +/-128 maps to +/-1 and 0 maps to 0.  Accepts scalars or NumPy arrays.
    """
    s = np.sign(u)
    u = np.abs(u)
    # Divide by a float literal so integer inputs are never truncated.
    return s*(np.exp(u/128.*math.log(256))-1)/255
def lin2ulaw(x): def lin2ulaw(x):