"""
|
|
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
|
|
1. Read in chunks and compute clean pitch first
|
|
2. Then add in augmentation (Noise/Level/Response)
|
|
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
|
|
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
|
|
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
|
|
|
|
Notes: To ensure consistency with the discovered CREPE offset, we do the following
|
|
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
|
|
- We pad the input audio to our feature computation with 160 zeros to center them
|
|
"""
|
|
|
|
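
# Example invocation (illustrative only; the script and file names below are placeholders):
#   python data_augmentation.py input_tts_16k.raw output/augmented \
#       /path/to/lpcnet_xcorr_extractor /path/to/DEMAND/ --choice_augment demand --flag_xcorr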

import argparse

parser = argparse.ArgumentParser()

parser.add_argument('data', type=str, help='input raw audio data (16-bit, 16 kHz)')
parser.add_argument('output', type=str, help='output path prefix for the feature/pitch files')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='location of the Demand dataset')
parser.add_argument('--flag_xcorr', action='store_true', help='additionally dump xcorr features')
parser.add_argument('--fraction_input_use', type=float, help='fraction of input data to consider', default=0.3, required=False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs', default=0, required=False)
parser.add_argument('--choice_augment', type=str, help='choice of noise augmentation: additive synthetic noise or noise from the Demand dataset', choices=['demand', 'synthetic'], default='demand', required=False)
parser.add_argument('--fraction_clean', type=float, help='fraction of data to keep clean (i.e. not augmented at all)', default=0.2, required=False)
parser.add_argument('--chunk_size', type=int, help='number of samples to process per iteration', default=80000, required=False)
parser.add_argument('--N', type=int, help='STFT window size', default=320, required=False)
parser.add_argument('--H', type=int, help='STFT hop size', default=160, required=False)
parser.add_argument('--freq_keep', type=int, help='number of frequencies to keep', default=30, required=False)

args = parser.parse_args()

# Select the GPU before importing crepe (which pulls in TensorFlow)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

from utils import stft, random_filter

import numpy as np
import tqdm
import crepe
import random
import glob
import subprocess

data_full = np.memmap(args.data, dtype=np.int16, mode='r')
data = data_full[:int(args.fraction_input_use*data_full.shape[0])]

# list_features = []
list_cents = []
list_confidences = []

N = args.N
H = args.H
freq_keep = args.freq_keep

# Minimum/Maximum pitch periods (in samples at 16 kHz), decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period

chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
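# Feature layout per frame: [log-magnitude | Re(phase diff) | Im(phase diff)], each block of
# length N//2 + 1; list_indices_keep retains the lowest `freq_keep` bins of each of the three blocks.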

output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
    output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, 257), mode='w+')

fraction_clean = args.fraction_clean
noise_dataset = args.noise_dataset

for i in tqdm.trange(data.shape[0]//chunk_size - 1):
    chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)

    # Clean Pitch/Confidence Estimate
    # Padding input to CREPE by 80 samples to ensure it aligns with our frames
    _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80), chunk]), 16000, center=True, viterbi=True, verbose=0)
    cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)
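    # f_ref = 16000/max_period = 62.5 Hz, so `cent` measures the pitch in cents above 62.5 Hz;
    # unvoiced frames (pitch == 0) collapse to a large negative value via the 1.0e-8 floor.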

    # Zero out the confidence of pitches outside the valid period range
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0

    # Keep a fraction of the data clean; augment only the remaining (1 - fraction_clean)
    if np.random.rand() > fraction_clean:
        # Response: generate a random 2nd order IIR filter and filter the chunk
        chunk = random_filter(chunk)

        # Level/Gain: scale by a random gain between 1.0e-3 and 10
        # Draw a random gain in dB and convert it to a linear scale factor
        g_dB = np.random.uniform(low=-60, high=20, size=1)
        # g_dB = 0
        g = 10**(g_dB/20)

        # Noise addition: add randomly filtered (2nd order "colored") noise at a random SNR
        snr_dB = np.random.uniform(low=-20, high=30, size=1)

        if args.choice_augment == 'synthetic':
            n = np.random.randn(chunk_size)
        else:
            # noise_dataset is expected to end with a path separator
            list_noisefiles = noise_dataset + '*.wav'
            noise_file = random.choice(glob.glob(list_noisefiles))
            n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
            # Reserve the last minute (16000*60 samples) of each noise file for testing
            rand_range = np.random.randint(low=0, high=(n.shape[0] - 16000*60 - chunk.shape[0]))
            n = n[rand_range:rand_range + chunk.shape[0]]

        # Randomly filter the sampled noise as well
        n = random_filter(n)
        # Pick a random prime (<= 541) and zero out that many trailing noise samples
        # (to prevent the GRU from picking up temporal patterns)
        Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
        n[chunk_size - Nprime:] = np.zeros(Nprime)
        snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
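        # snr_multiplier scales the noise so that 10*log10(sum|chunk|^2 / sum|snr_multiplier*n|^2) = snr_dB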

        chunk = g*(chunk + snr_multiplier*n)

    # Zero pad the input audio by 160 samples to center the frames
    spec = stft(x=np.concatenate([np.zeros(160), chunk]), w='boxcar', N=N, H=H).T
    phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
    feature = feature[:, list_indices_keep]
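    # phase_diff is the unit-magnitude phase advance of each bin between consecutive frames
    # (an instantaneous-frequency-style feature); only the lowest freq_keep bins of each block are kept.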

    if args.flag_xcorr:
        # Dump the (possibly) noisy audio into a temporary raw file for the LPCNet extractor
        data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
        data_temp[:chunk.shape[0]] = (chunk*(2**15 - 1)).astype(np.int16)

        subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)
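        # feature_xcorr is now (frames, 257): a column of ones for the zero lag followed by the
        # 256 lag values from the extractor (lag order reversed by np.flip), matching output_xcorr.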

        os.remove('./temp_augment.raw')
        os.remove('./temp_augment_xcorr.f32')
        num_frames = min(cent.shape[0], feature.shape[0], feature_xcorr.shape[0], num_frames_chunk)
        feature_xcorr = feature_xcorr[:num_frames]
    else:
        num_frames = min(cent.shape[0], feature.shape[0], num_frames_chunk)

    feature = feature[:num_frames, :]
    cent = cent[:num_frames]
    confidence = confidence[:num_frames]

    output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature
    if args.flag_xcorr:
        output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature_xcorr

    list_cents.append(cent)
    list_confidences.append(confidence)

list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)

np.save(args.output + '_pitches', np.vstack([list_cents, list_confidences]))
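
# Outputs (paths derived from the `output` prefix argument):
#   <output>_iffeat.f32   -- per-frame IF features, 3*freq_keep float32 values per frame
#   <output>_xcorr.f32    -- per-frame xcorr features, 257 float32 values per frame (only with --flag_xcorr)
#   <output>_pitches.npy  -- 2 x total_frames array stacking pitch (in cents) and CREPE confidence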