Support for multi-GPU training

Not sure why CuDNNGRU doesn't get used by default, but we need
to explicitly use it to get things to run fast.
Jean-Marc Valin, 2021-06-13 03:50:51 -04:00
parent ebc9483b4c · commit 237245f815
2 changed files with 42 additions and 21 deletions
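Background on the commit message: in TF 2.x, the stock tf.keras.layers.GRU only dispatches to the fused cuDNN kernel when the layer's arguments exactly match the functional form cuDNN implements (tanh activation, sigmoid recurrent activation, reset_after=True, no recurrent dropout, no masking); anything else silently falls back to a much slower generic kernel. tf.compat.v1.keras.layers.CuDNNGRU forces the fused kernel unconditionally, at the cost of being GPU-only. A minimal sketch of the distinction (layer arguments are illustrative, and the exact dispatch criteria vary by TF version):

    import tensorflow as tf
    from tensorflow.compat.v1.keras.layers import CuDNNGRU

    # Always runs the fused cuDNN kernel (requires a GPU):
    rnn_fast = CuDNNGRU(384, return_sequences=True, return_state=True)

    # Only takes the fused path if every argument matches cuDNN's functional
    # form; otherwise it silently falls back to the slow generic kernel:
    rnn_maybe = tf.keras.layers.GRU(384, return_sequences=True, return_state=True,
                                    recurrent_activation='sigmoid',
                                    reset_after=True)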

--- a/lpcnet.py
+++ b/lpcnet.py

@@ -26,8 +26,10 @@
 '''
 import math
+import tensorflow as tf
 from tensorflow.keras.models import Model
 from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
 from tensorflow.keras import backend as K
 from tensorflow.keras.constraints import Constraint
 from tensorflow.keras.initializers import Initializer
@@ -42,6 +44,12 @@ pcm_bits = 8
 embed_size = 128
 pcm_levels = 2**pcm_bits

+def quant_regularizer(x):
+    Q = 128
+    Q_1 = 1./Q
+    #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
+    return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
 class Sparsify(Callback):
     def __init__(self, t_start, t_end, interval, density):
         super(Sparsify, self).__init__()
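A note on the new quant_regularizer (this is what the quantize flag added below feeds into): it measures how far each weight is from the nearest multiple of 1/Q with Q = 128, so minimizing it nudges weights toward values that survive 8-bit quantization. The cosine term is zero whenever Q*x is an integer and maximal halfway between grid points; the double square root (a fourth root) steepens the penalty near the grid, and the 1.0001 offset keeps the argument strictly positive so the gradient stays finite. A NumPy sketch of its behaviour (values are approximate):

    import numpy as np

    Q = 128
    def penalty(x):
        # distance-to-grid penalty: ~0 on multiples of 1/Q, maximal in between
        return .01 * np.mean(np.sqrt(np.sqrt(1.0001 - np.cos(2*np.pi*(Q*x - np.round(Q*x))))))

    print(penalty(np.array([5/Q, -17/Q])))  # ~0.001: already on the 1/128 grid
    print(penalty(np.array([5.5/Q])))       # ~0.012: halfway between grid points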
@@ -129,9 +137,9 @@ class WeightClip(Constraint):
         return {'name': self.__class__.__name__,
                 'c': self.c}

-constraint = WeightClip(0.999)
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False):
+constraint = WeightClip(0.992)
+def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False, quantize=False):
     pcm = Input(shape=(None, 3))
     feat = Input(shape=(None, nb_used_features))
     pitch = Input(shape=(None, 1))
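The clipping constant drops from 0.999 to 0.992, presumably to match the quantization grid above: 127/128 = 0.9921875, so clipped weights stay representable as signed 8-bit integers (|Q*x| <= 127). This reading is an inference from the diff, not something stated in the commit.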
@@ -158,10 +166,18 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, train
     rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))

-    rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a',
-              recurrent_constraint = constraint)
-    rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b',
-               kernel_constraint=constraint)
+    quant = quant_regularizer if quantize else None
+
+    if training:
+        rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a',
+                       recurrent_constraint = constraint, recurrent_regularizer=quant)
+        rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b',
+                        kernel_constraint=constraint, kernel_regularizer=quant)
+    else:
+        rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a',
+                  recurrent_constraint = constraint, recurrent_regularizer=quant)
+        rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b',
+                   kernel_constraint=constraint, kernel_regularizer=quant)

     rnn_in = Concatenate()([cpcm, rep(cfeat)])
     md = MDense(pcm_levels, activation='softmax', name='dual_fc')
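The two branches build weight-compatible layers: CuDNNGRU implements exactly the reset_after GRU variant with sigmoid recurrent activation that the else branch specifies, so checkpoints written by the training (CuDNNGRU) model load into the inference (GRU) model and vice versa. A sketch of the round trip, with a hypothetical weights filename:

    import lpcnet

    # training graph: CuDNNGRU layers (GPU-only, fused kernel)
    model, _, _ = lpcnet.new_lpcnet_model(training=True)
    # ... train ...
    model.save_weights('weights.h5')            # hypothetical filename

    # inference graph: plain GRU layers, runs on CPU too
    infer, _, _ = lpcnet.new_lpcnet_model(training=False)
    infer.load_weights('weights.h5')            # same weight layout either way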

--- a/train_lpcnet.py
+++ b/train_lpcnet.py

@@ -49,10 +49,23 @@ nb_epochs = 120
 # Try reducing batch_size if you run out of memory on your GPU
 batch_size = 128

-model, _, _ = lpcnet.new_lpcnet_model(training=True)
-
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-model.summary()
+#Set this to True to adapt an existing model (e.g. on new data)
+adaptation = False
+
+if adaptation:
+    lr = 0.0001
+    decay = 0
+else:
+    lr = 0.001
+    decay = 2.5e-5
+
+opt = Adam(lr, decay=decay, beta_2=0.99)
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+    model, _, _ = lpcnet.new_lpcnet_model(training=True)
+    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+    model.summary()

 feature_file = sys.argv[1]
 pcm_file = sys.argv[2]     # 16 bit unsigned short PCM samples
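A note on the strategy choice: MultiWorkerMirroredStrategy is the multi-machine variant (it discovers its peers via the TF_CONFIG environment variable); launched without a cluster config it behaves as a single worker mirroring across the local GPUs. For one machine, plain MirroredStrategy is the more usual choice; a sketch of that alternative, reusing the model constructor from the diff:

    import tensorflow as tf
    import lpcnet

    strategy = tf.distribute.MirroredStrategy()     # replicate over all local GPUs
    print('replicas in sync:', strategy.num_replicas_in_sync)

    with strategy.scope():
        # variables (weights, optimizer slots) must be created inside the scope
        model, _, _ = lpcnet.new_lpcnet_model(training=True)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')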
@@ -65,7 +78,7 @@ pcm_chunk_size = frame_size*feature_chunk_size

 # u for unquantised, load 16 bit PCM samples and convert to mu-law
 data = np.fromfile(pcm_file, dtype='uint8')
-nb_frames = len(data)//(4*pcm_chunk_size)
+nb_frames = len(data)//(4*pcm_chunk_size)//batch_size*batch_size

 features = np.fromfile(feature_file, dtype='float32')
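Rounding nb_frames down to a multiple of batch_size drops the trailing partial batch, so every replica always receives a full, equal share of each global batch (ragged final batches are a common failure mode under tf.distribute). For example:

    batch_size = 128
    chunks = 1000                        # e.g. len(data)//(4*pcm_chunk_size)
    nb_frames = chunks // batch_size * batch_size
    print(nb_frames)                     # 896: the last 104 chunks are discarded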
@@ -102,23 +115,15 @@ del pred
 del in_exc

 # dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet33_384_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('lpcnet33e_384_{epoch:02d}.h5')

-#Set this to True to adapt an existing model (e.g. on new data)
-adaptation = False
-
 if adaptation:
     #Adapting from an existing model
-    model.load_weights('lpcnet32v_384_100.h5')
+    model.load_weights('lpcnet33a_384_100.h5')
     sparsify = lpcnet.Sparsify(0, 0, 1, (0.05, 0.05, 0.2))
-    lr = 0.0001
-    decay = 0
 else:
     #Training from scratch
     sparsify = lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))
-    lr = 0.001
-    decay = 5e-5

-model.compile(optimizer=Adam(lr, decay=decay, beta_2=0.99), loss='sparse_categorical_crossentropy')
-model.save_weights('lpcnet33_384_00.h5');
+model.save_weights('lpcnet33e_384_00.h5');
 model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])
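Usage note: to actually span multiple machines, each worker needs the standard TF_CONFIG environment variable set before the strategy is created; the hosts and ports below are placeholders:

    import json, os

    os.environ['TF_CONFIG'] = json.dumps({
        'cluster': {'worker': ['host1:12345', 'host2:12345']},  # placeholder hosts
        'task': {'type': 'worker', 'index': 0},   # this machine's slot in the list
    })
    # then run the training script unchanged on every worker, each with its own index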