"""
|
|
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
|
|
1. Read in chunks and compute clean pitch first
|
|
2. Then add in augmentation (Noise/Level/Response)
|
|
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
|
|
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
|
|
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
|
|
|
|
Notes: To ensure consistency with the discovered CREPE offset, we do the following
|
|
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
|
|
- We pad the input audio to our feature computation with 160 zeros to center them
|
|
"""
|
|
|
|
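
# Example invocation (illustrative only; the script and file names below are placeholders):
#   python data_augmentation.py input_tts_16k.raw output/augmented \
#       /path/to/lpcnet_xcorr_extractor /path/to/DEMAND/ --choice_augment demand --flag_xcorr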

import argparse

parser = argparse.ArgumentParser()

parser.add_argument('data', type=str, help='input raw audio data (16-bit, 16 kHz)')
parser.add_argument('output', type=str, help='output path prefix for the feature/pitch files')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='location of the Demand dataset')
parser.add_argument('--flag_xcorr', action='store_true', help='additionally dump xcorr features')
parser.add_argument('--fraction_input_use', type=float, help='fraction of input data to consider', default=0.3, required=False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs', default=0, required=False)
parser.add_argument('--choice_augment', type=str, help='choice of noise augmentation: additive synthetic noise or noise from the Demand dataset', choices=['demand', 'synthetic'], default='demand', required=False)
parser.add_argument('--fraction_clean', type=float, help='fraction of data to keep clean (i.e. not augmented at all)', default=0.2, required=False)
parser.add_argument('--chunk_size', type=int, help='number of samples to process per iteration', default=80000, required=False)
parser.add_argument('--N', type=int, help='STFT window size', default=320, required=False)
parser.add_argument('--H', type=int, help='STFT hop size', default=160, required=False)
parser.add_argument('--freq_keep', type=int, help='number of frequencies to keep', default=30, required=False)

args = parser.parse_args()

# Select the GPU before importing crepe (which pulls in TensorFlow)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

from utils import stft, random_filter

import numpy as np
import tqdm
import crepe
import random
import glob
import subprocess

data_full = np.memmap(args.data, dtype=np.int16, mode='r')
data = data_full[:int(args.fraction_input_use*data_full.shape[0])]

# list_features = []
list_cents = []
list_confidences = []

N = args.N
H = args.H
freq_keep = args.freq_keep

# Minimum/Maximum pitch periods (in samples at 16 kHz), decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period

chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
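# Feature layout per frame: [log-magnitude | Re(phase diff) | Im(phase diff)], each block of
# length N//2 + 1; list_indices_keep retains the lowest `freq_keep` bins of each of the three blocks.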

output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
    output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, 257), mode='w+')

fraction_clean = args.fraction_clean
noise_dataset = args.noise_dataset

for i in tqdm.trange(data.shape[0]//chunk_size - 1):
    chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)

    # Clean Pitch/Confidence Estimate
    # Padding input to CREPE by 80 samples to ensure it aligns with our frames
    _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80), chunk]), 16000, center=True, viterbi=True, verbose=0)
    cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)
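    # f_ref = 16000/max_period = 62.5 Hz, so `cent` measures the pitch in cents above 62.5 Hz;
    # unvoiced frames (pitch == 0) collapse to a large negative value via the 1.0e-8 floor.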

    # Zero out the confidence of pitches outside the valid period range
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0

    # Keep a fraction of the data clean; augment only the remaining (1 - fraction_clean)
    if np.random.rand() > fraction_clean:
        # Response: generate a random 2nd order IIR filter and filter the chunk
        chunk = random_filter(chunk)

        # Level/Gain: scale by a random gain between 1.0e-3 and 10
        # Draw a random gain in dB and convert it to a linear scale factor
        g_dB = np.random.uniform(low=-60, high=20, size=1)
        # g_dB = 0
        g = 10**(g_dB/20)

        # Noise addition: add randomly filtered (2nd order "colored") noise at a random SNR
        snr_dB = np.random.uniform(low=-20, high=30, size=1)

        if args.choice_augment == 'synthetic':
            n = np.random.randn(chunk_size)
        else:
            # noise_dataset is expected to end with a path separator
            list_noisefiles = noise_dataset + '*.wav'
            noise_file = random.choice(glob.glob(list_noisefiles))
            n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
            # Reserve the last minute (16000*60 samples) of each noise file for testing
            rand_range = np.random.randint(low=0, high=(n.shape[0] - 16000*60 - chunk.shape[0]))
            n = n[rand_range:rand_range + chunk.shape[0]]

        # Randomly filter the sampled noise as well
        n = random_filter(n)
        # Pick a random prime (<= 541) and zero out that many trailing noise samples
        # (to prevent the GRU from picking up temporal patterns)
        Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
        n[chunk_size - Nprime:] = np.zeros(Nprime)
        snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
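        # snr_multiplier scales the noise so that 10*log10(sum|chunk|^2 / sum|snr_multiplier*n|^2) = snr_dB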

        chunk = g*(chunk + snr_multiplier*n)

    # Zero pad the input audio by 160 samples to center the frames
    spec = stft(x=np.concatenate([np.zeros(160), chunk]), w='boxcar', N=N, H=H).T
    phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
    feature = feature[:, list_indices_keep]
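    # phase_diff is the unit-magnitude phase advance of each bin between consecutive frames
    # (an instantaneous-frequency-style feature); only the lowest freq_keep bins of each block are kept.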

    if args.flag_xcorr:
        # Dump the (possibly) noisy audio into a temporary raw file for the LPCNet extractor
        data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
        data_temp[:chunk.shape[0]] = (chunk*(2**15 - 1)).astype(np.int16)

        subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)
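        # feature_xcorr is now (frames, 257): a column of ones for the zero lag followed by the
        # 256 lag values from the extractor (lag order reversed by np.flip), matching output_xcorr.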

        os.remove('./temp_augment.raw')
        os.remove('./temp_augment_xcorr.f32')
        num_frames = min(cent.shape[0], feature.shape[0], feature_xcorr.shape[0], num_frames_chunk)
        feature_xcorr = feature_xcorr[:num_frames]
    else:
        num_frames = min(cent.shape[0], feature.shape[0], num_frames_chunk)

    feature = feature[:num_frames, :]
    cent = cent[:num_frames]
    confidence = confidence[:num_frames]

    output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature
    if args.flag_xcorr:
        output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature_xcorr

    list_cents.append(cent)
    list_confidences.append(confidence)

list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)

np.save(args.output + '_pitches', np.vstack([list_cents, list_confidences]))
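
# Outputs (paths derived from the `output` prefix argument):
#   <output>_iffeat.f32   -- per-frame IF features, 3*freq_keep float32 values per frame
#   <output>_xcorr.f32    -- per-frame xcorr features, 257 float32 values per frame (only with --flag_xcorr)
#   <output>_pitches.npy  -- 2 x total_frames array stacking pitch (in cents) and CREPE confidence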