diff --git a/dnn/denoise.c b/dnn/denoise.c
index 84c7d7ea..1eea96ad 100644
--- a/dnn/denoise.c
+++ b/dnn/denoise.c
@@ -38,6 +38,7 @@
 #include "pitch.h"
 #include "arch.h"
 #include "celt_lpc.h"
+#include

 #define PREEMPHASIS (0.85f)

@@ -64,7 +65,7 @@
 #define CEPS_MEM 8
 #define NB_DELTA_CEPS 6

-#define NB_FEATURES (2*NB_BANDS+2+LPC_ORDER)
+#define NB_FEATURES (2*NB_BANDS+3+LPC_ORDER)


 #ifndef TRAINING
@@ -305,12 +306,20 @@
 int lowpass = FREQ_SIZE;
 int band_lp = NB_BANDS;
 #endif
-static void frame_analysis(DenoiseState *st, signed char *iexc, float *lpc, kiss_fft_cpx *X, float *Ex, const float *in) {
+short float2short(float x)
+{
+  int i;
+  i = (int)floor(.5+x);
+  return IMAX(-32767, IMIN(32767, i));
+}
+
+static float frame_analysis(DenoiseState *st, signed char *iexc, short *pred, short *pcm, float *lpc, kiss_fft_cpx *X, float *Ex, const float *in) {
   int i;
   float x[WINDOW_SIZE];
   float x0[WINDOW_SIZE];
   float ac[LPC_ORDER+1];
   float rc[LPC_ORDER];
+  float g;
   RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
   for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i];
   RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
@@ -325,7 +334,8 @@ static void frame_analysis(DenoiseState *st, signed char *iexc, float *lpc, kiss
   /* Lag windowing. */
   for (i=1;i<LPC_ORDER+1;i++) ac[i] *= (1 - 6e-5*i*i);
   RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
   RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
   //pre[0] = &st->pitch_buf[0];
@@ -419,7 +433,8 @@ static int compute_frame_features(DenoiseState *st, signed char *iexc, kiss_fft_
 #endif
   features[2*NB_BANDS] = .01*(pitch_index-200);
   features[2*NB_BANDS+1] = gain;
-  for (i=0;i<LPC_ORDER;i++) features[2*NB_BANDS+2+i] = lpc[i];
   biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
-  silence = compute_frame_features(st, NULL, X, P, Ex, Ep, Exp, features, x);
+  //silence = compute_frame_features(st, NULL, X, P, Ex, Ep, Exp, features, x);

   if (!silence) {
     pitch_filter(X, P, Ex, Ep, Exp, g);
@@ -554,15 +569,23 @@ int main(int argc, char **argv) {
   float x[FRAME_SIZE];
   FILE *f1;
   FILE *fexc;
+  FILE *ffeat;
+  FILE *fpred;
+  FILE *fpcm;
   signed char iexc[FRAME_SIZE];
+  short pred[FRAME_SIZE];
+  short pcm[FRAME_SIZE];
   DenoiseState *st;
   st = rnnoise_create();
-  if (argc!=3) {
-    fprintf(stderr, "usage: %s \n", argv[0]);
+  if (argc!=6) {
+    fprintf(stderr, "usage: %s \n", argv[0]);
     return 1;
   }
   f1 = fopen(argv[1], "r");
   fexc = fopen(argv[2], "w");
+  ffeat = fopen(argv[3], "w");
+  fpred = fopen(argv[4], "w");
+  fpcm = fopen(argv[5], "w");
   while (1) {
     kiss_fft_cpx X[FREQ_SIZE], P[WINDOW_SIZE];
     float Ex[NB_BANDS], Ep[NB_BANDS];
@@ -582,10 +605,12 @@ int main(int argc, char **argv) {
     biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
     preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
-    compute_frame_features(st, iexc, X, P, Ex, Ep, Exp, features, x);
+    compute_frame_features(st, iexc, pred, pcm, X, P, Ex, Ep, Exp, features, x);

 #if 1
-    fwrite(features, sizeof(float), NB_FEATURES, stdout);
     fwrite(iexc, sizeof(signed char), FRAME_SIZE, fexc);
+    fwrite(features, sizeof(float), NB_FEATURES, ffeat);
+    fwrite(pred, sizeof(short), FRAME_SIZE, fpred);
+    fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);
 #endif
     count++;
   }
diff --git a/dnn/test_lpcnet.py b/dnn/test_lpcnet.py
index 5b194554..c20a8c01 100755
--- a/dnn/test_lpcnet.py
+++ b/dnn/test_lpcnet.py
@@ -73,6 +73,7 @@ for c in range(1, nb_frames):
             #fexc[0, 0, 0] = in_data[f*frame_size + i, 0]
             #print(cfeat.shape)
             p, state = dec.predict([fexc, cfeat[:, fr:fr+1, :], state])
+            #p = np.maximum(p-0.003, 0)
             p = p/(1e-5 + np.sum(p))
             #print(np.sum(p))
             iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))-128
diff --git a/dnn/test_wavenet_audio.py b/dnn/test_wavenet_audio.py
index a2b7984b..dffb393f 100755
--- a/dnn/test_wavenet_audio.py
+++ b/dnn/test_wavenet_audio.py
@@ -10,11 +10,11 @@ from ulaw import ulaw2lin, lin2ulaw
 import keras.backend as K
 import h5py

-#import tensorflow as tf
-#from keras.backend.tensorflow_backend import set_session
-#config = tf.ConfigProto()
-#config.gpu_options.per_process_gpu_memory_fraction = 0.44
-#set_session(tf.Session(config=config))
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.2
+set_session(tf.Session(config=config))

 nb_epochs = 40
 batch_size = 64
@@ -66,7 +66,7 @@ in_data = np.reshape(in_data, (nb_frames*pcm_chunk_size, 1))
 out_data = np.reshape(data, (nb_frames*pcm_chunk_size, 1))


-model.load_weights('wavenet3e_30.h5')
+model.load_weights('wavenet3g_30.h5')

 order = 16

@@ -92,6 +92,7 @@ for c in range(1, nb_frames):
             #fexc[0, 0, 0] = in_data[f*frame_size + i, 0]
             #print(cfeat.shape)
             p, state = dec.predict([fexc, cfeat[:, fr:fr+1, :], state])
+            #p = np.maximum(p-0.003, 0)
             p = p/(1e-5 + np.sum(p))
             #print(np.sum(p))
             iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))-128
diff --git a/dnn/train_wavenet.py b/dnn/train_wavenet.py
index e4e32c33..1ac21604 100755
--- a/dnn/train_wavenet.py
+++ b/dnn/train_wavenet.py
@@ -9,11 +9,11 @@ from ulaw import ulaw2lin, lin2ulaw
 import keras.backend as K
 import h5py

-#import tensorflow as tf
-#from keras.backend.tensorflow_backend import set_session
-#config = tf.ConfigProto()
-#config.gpu_options.per_process_gpu_memory_fraction = 0.44
-#set_session(tf.Session(config=config))
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.44
+set_session(tf.Session(config=config))

 nb_epochs = 40
 batch_size = 64
diff --git a/dnn/train_wavenet_audio.py b/dnn/train_wavenet_audio.py
index c0d233e5..c65f408b 100755
--- a/dnn/train_wavenet_audio.py
+++ b/dnn/train_wavenet_audio.py
@@ -10,11 +10,11 @@ from ulaw import ulaw2lin, lin2ulaw
 import keras.backend as K
 import h5py

-#import tensorflow as tf
-#from keras.backend.tensorflow_backend import set_session
-#config = tf.ConfigProto()
-#config.gpu_options.per_process_gpu_memory_fraction = 0.44
-#set_session(tf.Session(config=config))
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.44
+set_session(tf.Session(config=config))

 nb_epochs = 40
 batch_size = 64
@@ -43,6 +43,7 @@ data = data[:nb_frames*pcm_chunk_size]
 features = features[:nb_frames*feature_chunk_size*nb_features]

 in_data = np.concatenate([data[0:1], data[:-1]]);
+in_data = in_data + np.random.randint(-1, 1, len(data))

 features = np.reshape(features, (nb_frames*feature_chunk_size, nb_features))
 pitch = 1.*data
@@ -67,7 +68,7 @@ features = features[:, :, :nb_used_features]
 # f.create_dataset('data', data=in_data[:50000, :, :])
 # f.create_dataset('feat', data=features[:50000, :, :])

-checkpoint = ModelCheckpoint('wavenet3e_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('wavenet3g_{epoch:02d}.h5')

 #model.load_weights('wavernn1c_01.h5')
 model.compile(optimizer=Adam(0.001, amsgrad=True, decay=2e-4), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
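
Note on the dump format (not part of the patch): after this change the denoise.c main() writes one record per frame to four parallel files, 8-bit excitation to fexc, float32 feature vectors of size NB_FEATURES (now 2*NB_BANDS+3+LPC_ORDER) to ffeat, and 16-bit prediction and PCM to fpred/fpcm. A minimal reader sketch follows; the file names and the FRAME_SIZE, NB_BANDS and LPC_ORDER values are assumptions to be taken from your build of denoise.c, not from this patch.

# Sketch: load the four per-frame dumps written by the patched denoise.c.
# Assumed constants; NB_FEATURES mirrors the new 2*NB_BANDS+3+LPC_ORDER definition.
import numpy as np

frame_size  = 160              # assumed FRAME_SIZE
nb_features = 2*22 + 3 + 16    # assumed NB_BANDS=22, LPC_ORDER=16

iexc = np.fromfile('exc.s8', dtype='int8').reshape(-1, frame_size)
feat = np.fromfile('features.f32', dtype='float32').reshape(-1, nb_features)
pred = np.fromfile('pred.s16', dtype='int16').reshape(-1, frame_size)
pcm  = np.fromfile('pcm.s16', dtype='int16').reshape(-1, frame_size)

# Sanity check: the four dumps should stay frame-aligned.
assert len(iexc) == len(feat) == len(pred) == len(pcm)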