Python code for neural pitch

This commit is contained in:
Krishna Subramani 2023-09-25 00:19:41 -04:00 committed by Jean-Marc Valin
parent d88dd89358
commit f38b4a317f
No known key found for this signature in database
GPG key ID: 531A52533318F00A
11 changed files with 1481 additions and 0 deletions


@@ -0,0 +1,18 @@
## Neural Pitch Estimation
- Dataset Installation
1. Download and unzip PTDB Dataset:
wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
unzip SPEECH_DATA_ZIPPED.zip
2. Inside the unzipped "SPEECH DATA" directory, run ptdb_process.sh to combine the male/female audio and pitch references
3. To download and combine the DEMAND noise dataset, run download_demand.sh
- LPCNet preparation
1. To extract the cross-correlation (xcorr) features, add lpcnet_extractor.c to the source tree, add the relevant functions to lpcnet_enc.c, list the new header/C files in Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor executable
- Dataset Augmentation and training (check the arguments of each of the following scripts; example invocations are sketched below)
1. Run data_augmentation.py
2. Run training.py using augmented data
3. Run experiments.py
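- Example invocations (file names and paths below are placeholders; see each script's --help for the full argument list). With output prefix ./augmented, the augmentation step writes augmented_iffeat.f32, augmented_xcorr.f32 and augmented_pitches.npy:
python3 data_augmentation.py input_speech.raw ./augmented ./lpcnet_xcorr_extractor ./combined_demand_channels/ --flag_xcorr
python3 training.py ./augmented_iffeat.f32 ./augmented_xcorr.f32 ./augmented_pitches.npy ./saved_models/ if
python3 experiments.py <ptdb_root>/ results.json model --pth_file ./saved_models/<timestamp>_net_if.pth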


@@ -0,0 +1,149 @@
"""
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
1. Read in chunks and compute clean pitch first
2. Then add in augmentation (Noise/Level/Response)
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
Notes: To ensure consistency with the discovered CREPE offset, we do the following
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
- We pad the input audio to our feature computation with 160 zeros to center them
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('data', type=str, help='input raw audio data')
parser.add_argument('output', type=str, help='output prefix for the augmented feature/pitch files')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='Location of the Demand Dataset')
parser.add_argument('--flag_xcorr', action='store_true', help='Flag to additionally dump xcorr features')
parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False)
parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (i.e. not augmented at all)',default = 0.2,required = False)
parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False)
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
args = parser.parse_args()
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
from utils import stft, random_filter
import numpy as np
import tqdm
import crepe
import random
import glob
import subprocess
data_full = np.memmap(args.data, dtype=np.int16,mode = 'r')
data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])]
# list_features = []
list_cents = []
list_confidences = []
N = args.N
H = args.H
freq_keep = args.freq_keep
# Minimum/Maximum periods, decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period
chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
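# Keep only the lowest freq_keep bins from each of the three feature blocks computed below
# (log-magnitude spectrum, real and imaginary parts of the normalized phase difference),
# i.e. 3*freq_keep = 90 input features per frame with the defaults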
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,257), mode='w+')
fraction_clean = args.fraction_clean
noise_dataset = args.noise_dataset
for i in tqdm.trange((data.shape[0]//chunk_size - 1)//1):
chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)
# Clean Pitch/Confidence Estimate
# Padding input to CREPE by 80 samples to ensure it aligns
_, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0)
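# Convert CREPE's pitch estimate (Hz) to cents relative to f_ref = 16000/max_period = 62.5 Hz;
# the 1.0e-8 keeps log2 finite on unvoiced frames where pitch is 0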
cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)
# Filter out of range pitches/confidences
confidence[pitch < 16000/max_period] = 0
confidence[pitch > 16000/min_period] = 0
# Keep fraction of data clean, augment only 1 minus the fraction
if (np.random.rand() > fraction_clean):
# Response, generate controlled/random 2nd order IIR filter and filter chunk
chunk = random_filter(chunk)
# Level/Gain response {scale by random gain between 1.0e-3 and 10}
# Generate random gain in dB and then convert to scale
g_dB = np.random.uniform(low = -60, high = 20, size = 1)
# g_dB = 0
g = 10**(g_dB/20)
# Noise Addition {Add random SNR 2nd order randomly colored noise}
# Generate noise SNR value and add corresponding noise
snr_dB = np.random.uniform(low = -20, high = 30, size = 1)
if args.choice_augment == 'synthetic':
n = np.random.randn(chunk_size)
else:
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0])) # 16000*60 samples are subtracted because the last minute of each noise file is reserved for testing
n = n[rand_range:rand_range + chunk.shape[0]]
# Randomly filter the sampled noise as well
n = random_filter(n)
# Pick a random prime Nprime (from the first 100 primes, i.e. up to 541) and zero out the last Nprime noise samples (to prevent the GRU from picking up temporal patterns in the noise)
Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
n[chunk_size - Nprime:] = np.zeros(Nprime)
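# Scale the noise so that the chunk-to-noise power ratio matches the target SNR:
# sum|chunk|^2 / sum|snr_multiplier*n|^2 = 10^(snr_dB/10)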
snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
chunk = g*(chunk + snr_multiplier*n)
# Zero pad input audio by 160 to center the frames
spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T
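# Per-bin phase advance between consecutive frames (an instantaneous-frequency estimate),
# normalized to unit magnitude so that only the phase information is kept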
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature = feature[:,list_indices_keep]
if args.flag_xcorr:
# Dump noisy audio into temp file
data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
# data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16)
subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp_augment.raw')
os.remove('./temp_augment_xcorr.f32')
num_frames = min(cent.shape[0],feature.shape[0],feature_xcorr.shape[0],num_frames_chunk)
feature = feature[:num_frames,:]
cent = cent[:num_frames]
confidence = confidence[:num_frames]
feature_xcorr = feature_xcorr[:num_frames]
output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature
output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr
list_cents.append(cent)
list_confidences.append(confidence)
list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)
np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences]))


@@ -0,0 +1,43 @@
wget https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip
wget https://zenodo.org/record/1227121/files/DLIVING_16k.zip
wget https://zenodo.org/record/1227121/files/DWASHING_16k.zip
wget https://zenodo.org/record/1227121/files/NFIELD_16k.zip
wget https://zenodo.org/record/1227121/files/NPARK_16k.zip
wget https://zenodo.org/record/1227121/files/NRIVER_16k.zip
wget https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip
wget https://zenodo.org/record/1227121/files/OMEETING_16k.zip
wget https://zenodo.org/record/1227121/files/OOFFICE_16k.zip
wget https://zenodo.org/record/1227121/files/PCAFETER_16k.zip
wget https://zenodo.org/record/1227121/files/PRESTO_16k.zip
wget https://zenodo.org/record/1227121/files/PSTATION_16k.zip
wget https://zenodo.org/record/1227121/files/TMETRO_16k.zip
wget https://zenodo.org/record/1227121/files/TCAR_16k.zip
wget https://zenodo.org/record/1227121/files/TBUS_16k.zip
wget https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip
wget https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip
unzip '*.zip'
mkdir -p ./combined_demand_channels/
for file in */*.wav; do
parentdir="$(dirname "$file")"
echo $parentdir
fname="$(basename "$file")"
cp "$file" "./combined_demand_channels/${parentdir}+${fname}"
done


@@ -0,0 +1,464 @@
"""
Evaluation script to compute the Raw Pitch Accuracy
Procedure:
- Look at all voiced frames in file
- Compute number of pitches in those frames that lie within a 50 cent threshold
RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed across all files)
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from prettytable import PrettyTable
import numpy as np
import glob
import random
import tqdm
import torch
import librosa
import json
from utils import stft, random_filter, feature_xform
import subprocess
import crepe
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
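# rca: number of voiced frames whose pitch estimate lies within `thresh` cents of the reference
# sweep_rca: same, but sweeps a small integer frame offset between the two tracks and keeps the best count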
def rca(reference,input,voicing,thresh = 25):
idx_voiced = np.where(voicing != 0)[0]
acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0]
return acc.shape[0]
def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)):
l = []
for i in ind_arr:
l.append(rca(reference,np.roll(input,i),voicing,thresh))
l = np.array(l)
return np.max(l)
def rpa(model,device = 'cpu',data_format = 'if'):
list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
# random_shuffle = list(np.random.permutation(len(list_files)))
random.shuffle(list_files)
list_files = list_files[:1000]
# C_lp = 0
# C_lp_m = 0
# C_lp_f = 0
# list_rca_model_lp = []
# list_rca_male_lp = []
# list_rca_female_lp = []
# C_hp = 0
# C_hp_m = 0
# C_hp_f = 0
# list_rca_model_hp = []
# list_rca_male_hp = []
# list_rca_female_hp = []
C_all = 0
C_all_m = 0
C_all_f = 0
list_rca_model_all = []
list_rca_male_all = []
list_rca_female_all = []
thresh = 50
N = 320
H = 160
freq_keep = 30
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
# feature_xcorr = feature_xform(feature_xcorr)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
if data_format == 'if':
feature = feature_if
elif data_format == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
# if (model == 'penn'):
# model_frequency, _ = penn.from_audio(
# torch.from_numpy(audio).unsqueeze(0).float(),
# 16000,
# hopsize=0.01,
# fmin=(16000.0/256),
# fmax=500,
# checkpoint=penn.DEFAULT_CHECKPOINT,
# batch_size=32,
# pad=True,
# interp_unvoiced_at=0.065,
# gpu=0)
# model_frequency = model_frequency.cpu().detach().squeeze().numpy()
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# elif (model == 'crepe'):
# _, model_frequency, _, _ = crepe.predict(audio, 16000, viterbi=vflag,center=True,verbose=0)
# lpcnet_file_name = '/home/ubuntu/Code/Datasets/SPEECH_DATA/lpcnet_f0_16k_residual/' + file_name + '_f0.f32'
# period_lpcnet = np.fromfile(lpcnet_file_name, dtype='float32')
# model_frequency = 16000/(period_lpcnet + 1.0e-6)
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# else:
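# The model outputs logits over 20-cent pitch bins; argmax*20 converts the winning bin back
# to cents (relative to 62.5 Hz)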
model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
# model_cents = np.roll(model_cents,-1*3)
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_model_all.append(rca(cent,model_cents,voicing_all,thresh))
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
if "mic_M" in audio_file:
# list_rca_male_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_male_all.append(rca(cent,model_cents,voicing_all,thresh))
C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
else:
# list_rca_female_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_female_all.append(rca(cent,model_cents,voicing_all,thresh))
C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]
"""
# Low pitch estimation
voicing_lp = np.copy(voicing)
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 125)==True)
voicing_lp[force_out_of_pitch] = 0
C_lp = C_lp + np.where(voicing_lp != 0)[0].shape[0]
# list_rca_model_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_model_lp.append(rca(cent,model_cents,voicing_lp,thresh))
if "mic_M" in audio_file:
# list_rca_male_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_male_lp.append(rca(cent,model_cents,voicing_lp,thresh))
C_lp_m = C_lp_m + np.where(voicing_lp != 0)[0].shape[0]
else:
# list_rca_female_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_female_lp.append(rca(cent,model_cents,voicing_lp,thresh))
C_lp_f = C_lp_f + np.where(voicing_lp != 0)[0].shape[0]
# High pitch estimation
voicing_hp = np.copy(voicing)
force_out_of_pitch = np.where(np.logical_or(pitch < 125,pitch > 500)==True)
voicing_hp[force_out_of_pitch] = 0
C_hp = C_hp + np.where(voicing_hp != 0)[0].shape[0]
# list_rca_model_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_model_hp.append(rca(cent,model_cents,voicing_hp,thresh))
if "mic_M" in audio_file:
# list_rca_male_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_male_hp.append(rca(cent,model_cents,voicing_hp,thresh))
C_hp_m = C_hp_m + np.where(voicing_hp != 0)[0].shape[0]
else:
# list_rca_female_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_female_hp.append(rca(cent,model_cents,voicing_hp,thresh))
C_hp_f = C_hp_f + np.where(voicing_hp != 0)[0].shape[0]
# list_rca_model.append(acc_model)
# list_rca_crepe.append(acc_crepe)
# list_rca_lpcnet.append(acc_lpcnet)
# list_rca_penn.append(acc_penn)
"""
# list_rca_crepe = np.array(list_rca_crepe)
# list_rca_model_lp = np.array(list_rca_model_lp)
# list_rca_male_lp = np.array(list_rca_male_lp)
# list_rca_female_lp = np.array(list_rca_female_lp)
# list_rca_model_hp = np.array(list_rca_model_hp)
# list_rca_male_hp = np.array(list_rca_male_hp)
# list_rca_female_hp = np.array(list_rca_female_hp)
list_rca_model_all = np.array(list_rca_model_all)
list_rca_male_all = np.array(list_rca_male_all)
list_rca_female_all = np.array(list_rca_female_all)
# list_rca_lpcnet = np.array(list_rca_lpcnet)
# list_rca_penn = np.array(list_rca_penn)
x = PrettyTable()
x.field_names = ["Experiment", "Mean RPA"]
x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
# x.add_row(["Both low pitches", np.sum(list_rca_model_lp)/C_lp])
# x.add_row(["Both high pitches", np.sum(list_rca_model_hp)/C_hp])
x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
# x.add_row(["Male low pitches", np.sum(list_rca_male_lp)/C_lp_m])
# x.add_row(["Male high pitches", np.sum(list_rca_male_hp)/C_hp_m])
x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])
# x.add_row(["Female low pitches", np.sum(list_rca_female_lp)/C_lp_f])
# x.add_row(["Female high pitches", np.sum(list_rca_female_hp)/C_hp_f])
print(x)
return None
def cycle_eval(list_files_pth, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50):
"""
Cycle through SNR evaluation for list of .pth files
"""
# list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
# dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
# random_shuffle = list(np.random.permutation(len(list_files)))
list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
random.shuffle(list_files)
list_files = list_files[:(int)(fraction*len(list_files))]
# list_nfiles = ['DKITCHEN','NFIELD','OHALLWAY','PCAFETER','SPSQUARE','TCAR','DLIVING','NPARK','OMEETING','PRESTO','STRAFFIC','TMETRO','DWASHING','NRIVER','OOFFICE','PSTATION','TBUS']
dict_models = {}
list_snr.append(np.inf)
# thresh = 50
for f in list_files_pth:
if (f!='crepe') and (f!='lpcnet'):
fname = os.path.basename(f).split('_')[0] + '_' + os.path.basename(f).split('_')[-1][:-4]
config_path = os.path.dirname(f) + '/' + os.path.basename(f).split('_')[0] + '_' + 'config_' + os.path.basename(f).split('_')[-1][:-4] + '.json'
with open(config_path) as json_file:
dict_params = json.load(json_file)
if dict_params['data_format'] == 'if':
from models import large_if_ccode as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
elif dict_params['data_format'] == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
else:
from models import large_joint as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
pitch_nn.load_state_dict(torch.load(f))
N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']
list_mean = []
list_std = []
for snr_dB in list_snr:
C_all = 0
C_correct = 0
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = N,hop_length = H))
if noise_type != 'synthetic':
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
n = n[rand_range:rand_range + audio.shape[0]]
else:
n = np.random.randn(audio.shape[0])
n = random_filter(n)
snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
audio = audio + snr_multiplier*n
spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
# data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
if dict_params['data_format'] == 'if':
feature = feature_if
elif dict_params['data_format'] == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
# if os.path.basename(f) == 'crepe':
# elif (model == 'crepe'):
# _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# else:
# else:
model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
# model_cents = np.roll(model_cents,-1*3)
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
list_mean.append(C_correct/C_all)
else:
fname = f
list_mean = []
list_std = []
for snr_dB in list_snr:
C_all = 0
C_correct = 0
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
if noise_type != 'synthetic':
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
n = n[rand_range:rand_range + audio.shape[0]]
else:
n = np.random.randn(audio.shape[0])
n = random_filter(n)
snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
audio = audio + snr_multiplier*n
if (f == 'crepe'):
_, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
else:
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
# data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
model_cents = 1200*np.log2((256/feature_xcorr + 1.0e-8) + 1.0e-8)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
os.remove('./temp_period.f32')
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
list_mean.append(C_correct/C_all)
dict_models[fname] = {}
dict_models[fname]['list_SNR'] = list_mean[:-1]
dict_models[fname]['inf'] = list_mean[-1]
return dict_models


@@ -0,0 +1,38 @@
"""
Running the experiments:
1. RCA vs SNR for our models, CREPE, LPCNet
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
parser.add_argument('output', type=str, help='Output dump file name')
parser.add_argument('method', type=str, help='Pitch estimation method to evaluate',choices=['model','lpcnet','crepe'])
parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Dataset',default = './',required=False)
parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
args = parser.parse_args()
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
import json
from evaluation import cycle_eval
if args.method == 'model':
dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
else:
dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
dict_store["method"] = args.method
if args.method == 'model':
dict_store['pth'] = args.pth_file
with open(args.output, 'w') as fp:
json.dump(dict_store, fp)


@@ -0,0 +1,89 @@
"""
/* Copyright (c) 2022 Amazon
Written by Jan Buethe */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
"""
import os
import argparse
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint', type=str, help='neural pitch model checkpoint')
parser.add_argument('output_dir', type=str, help='output folder')
args = parser.parse_args()
import torch
import numpy as np
from models import large_if_ccode
from wexchange.torch import dump_torch_weights
from wexchange.c_export import CWriter, print_vector
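# c_export: dump the dense layers (initial, upsample) and the GRU of the large_if_ccode pitch
# model as C tables using the weight-exchange utilities (CWriter / dump_torch_weights)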
def c_export(args, model):
message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
enc_writer = CWriter(os.path.join(args.output_dir, "neural_pitch_data"), message=message, model_struct_name='nnpitch')
enc_writer.header.write(
f"""
#include "opus_types.h"
"""
)
# encoder
encoder_dense_layers = [
('initial' , 'initial', 'TANH'),
('upsample' , 'upsample', 'TANH')
]
for name, export_name, _ in encoder_dense_layers:
layer = model.get_submodule(name)
dump_torch_weights(enc_writer, layer, name=export_name, verbose=True)
encoder_gru_layers = [
('gru' , 'gru', 'TANH'),
]
enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, verbose=True, input_sparse=False, quantize=False)
for name, export_name, _ in encoder_gru_layers])
del enc_writer
if __name__ == "__main__":
os.makedirs(args.output_dir, exist_ok=True)
model = large_if_ccode()
model.load_state_dict(torch.load(args.checkpoint,map_location='cpu'))
c_export(args, model)


@@ -0,0 +1,218 @@
"""
Pitch Estimation Models and dataloaders
- Classification Based (Input features, output logits)
"""
import torch
import numpy as np
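# The three models below take a feature sequence of shape (batch, time, input_dim) and return
# pitch logits of shape (batch, output_dim, time); output_dim (192 by default) indexes 20-cent
# pitch bins (see the dataloaders below and the training script)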
class large_if_ccode(torch.nn.Module):
def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
super(large_if_ccode,self).__init__()
self.activation = torch.nn.Tanh()
self.initial = torch.nn.Linear(input_dim,gru_dim)
self.hidden = torch.nn.Linear(gru_dim,gru_dim)
self.gru = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,batch_first = True)
self.upsample = torch.nn.Linear(gru_dim,output_dim)
def forward(self, x):
x = self.initial(x)
x = self.activation(x)
x = self.hidden(x)
x = self.activation(x)
x,_ = self.gru(x)
x = self.upsample(x)
x = self.activation(x)
x = x.permute(0,2,1)
return x
class large_xcorr(torch.nn.Module):
def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
super(large_xcorr,self).__init__()
self.activation = torch.nn.Tanh()
self.conv = torch.nn.Sequential(
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(1, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 1, 3, bias = True),
self.activation,
)
# self.conv = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(64,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,64,3),
# self.activation,
# )
self.downsample = torch.nn.Sequential(
torch.nn.Linear(input_dim,gru_dim),
self.activation
)
self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
self.upsample = torch.nn.Sequential(
torch.nn.Linear(gru_dim,output_dim),
self.activation
)
def forward(self, x):
# x = x[:,:,:257].unsqueeze(-1)
x = self.conv(x.unsqueeze(-1).permute(0,3,2,1)).squeeze(1)
# print(x.shape)
# x = self.conv(x.permute(0,3,2,1)).squeeze(1)
x,_ = self.GRU(self.downsample(x.permute(0,2,1)))
x = self.upsample(x).permute(0,2,1)
# x = self.downsample(x)
# x = self.activation(x)
# x = self.conv(x.permute(0,2,1)).permute(0,2,1)
# x,_ = self.GRU(x)
# x = self.upsample(x).permute(0,2,1)
return x
class large_joint(torch.nn.Module):
"""
Joint IF-xcorr
1D CNN on IF, merge with xcorr, 2D CNN on merged + GRU
"""
def __init__(self,input_IF_dim = 90,input_xcorr_dim = 257,gru_dim = 64,output_dim = 192):
super(large_joint,self).__init__()
self.activation = torch.nn.Tanh()
self.if_upsample = torch.nn.Sequential(
torch.nn.Linear(input_IF_dim,64),
self.activation,
torch.nn.Linear(64,64),
self.activation,
)
# self.if_upsample = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(90,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,257,3),
# self.activation,
# )
self.conv = torch.nn.Sequential(
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(1, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 1, 3, bias = True),
self.activation,
)
# self.conv = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(257,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,64,3),
# self.activation,
# )
self.downsample = torch.nn.Sequential(
torch.nn.Linear(64 + input_xcorr_dim,gru_dim),
self.activation
)
self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
self.upsample = torch.nn.Sequential(
torch.nn.Linear(gru_dim,output_dim),
self.activation
)
def forward(self, x):
xcorr_feat = x[:,:,:257]
if_feat = x[:,:,257:]
# x = torch.cat([xcorr_feat.unsqueeze(-1),self.if_upsample(if_feat).unsqueeze(-1)],axis = -1)
xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0,3,2,1)).squeeze(1).permute(0,2,1)
if_feat = self.if_upsample(if_feat)
x = torch.cat([xcorr_feat,if_feat],axis = - 1)
# x = self.conv(x.permute(0,3,2,1)).squeeze(1)
x,_ = self.GRU(self.downsample(x))
x = self.upsample(x).permute(0,2,1)
return x
# Dataloaders
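# Targets are the CREPE pitch quantized to 20-cent bins (clipped to [0, 179]) plus the CREPE
# confidence, which is used as a per-frame weight in the training loss; frames below the
# confidence threshold get zero weight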
class loader(torch.utils.data.Dataset):
def __init__(self, features_if, file_pitch,confidence_threshold = 0.4,dimension_if = 30,context = 100):
self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,3*dimension_if)
# Resolution of 20 cents
self.cents = np.rint(np.load(file_pitch)[0,:]/20)
self.cents = np.clip(self.cents,0,179)
self.confidence = np.load(file_pitch)[1,:]
# Filter confidence for CREPE
self.confidence[self.confidence < confidence_threshold] = 0
self.context = context
# Clip both to same size
size_common = min(self.if_feat.shape[0],self.cents.shape[0])
self.if_feat = self.if_feat[:size_common,:]
self.cents = self.cents[:size_common]
self.confidence = self.confidence[:size_common]
frame_max = self.if_feat.shape[0]//context
self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,3*dimension_if))
self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
def __len__(self):
return self.if_feat.shape[0]
def __getitem__(self, index):
return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
class loader_joint(torch.utils.data.Dataset):
def __init__(self, features_if, file_pitch, features_xcorr,confidence_threshold = 0.4,context = 100, choice_data = 'both'):
self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,90)
self.xcorr = np.memmap(features_xcorr, dtype=np.float32).reshape(-1,257)
self.cents = np.rint(np.load(file_pitch)[0,:]/20)
self.cents = np.clip(self.cents,0,179)
self.confidence = np.load(file_pitch)[1,:]
# Filter confidence for CREPE
self.confidence[self.confidence < confidence_threshold] = 0
self.context = context
self.choice_data = choice_data
frame_max = self.if_feat.shape[0]//context
self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,90))
self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
self.xcorr = np.reshape(self.xcorr[:frame_max*context,:],(frame_max,context,257))
# self.cents = np.rint(60*np.log2(256/(self.periods + 1.0e-8))).astype('int')
# self.cents = np.clip(self.cents,0,239)
self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
# print(self.if_feat.shape)
def __len__(self):
return self.if_feat.shape[0]
def __getitem__(self, index):
if self.choice_data == 'both':
return torch.cat([torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.if_feat[index,:,:])],dim=-1),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
elif self.choice_data == 'if':
return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
else:
return torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])


@@ -0,0 +1,207 @@
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('features', type=str, help='Features generated from dump_data')
parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)')
parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch')
parser.add_argument('pth_file', type=str, help='.pth file to use for pitch')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('--device', type=str, help='compute device',default = None,required = False)
parser.add_argument('--replace_xcorr', action='store_true', help='Replace LPCNet xcorr with updated one')
args = parser.parse_args()
import os
from utils import stft, random_filter
import subprocess
import numpy as np
import json
import torch
import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if args.device is not None:
device = torch.device(args.device)
# Loading the appropriate model
config_path = os.path.dirname(args.pth_file) + '/' + os.path.basename(args.pth_file).split('_')[0] + '_' + 'config_' + os.path.basename(args.pth_file).split('_')[-1][:-4] + '.json'
with open(config_path) as json_file:
dict_params = json.load(json_file)
if dict_params['data_format'] == 'if':
from models import large_if_ccode as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
elif dict_params['data_format'] == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
else:
from models import large_joint as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
pitch_nn.load_state_dict(torch.load(args.pth_file))
pitch_nn = pitch_nn.to(device)
N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']
# import os
# import argparse
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["OMP_NUM_THREADS"] = "16"
# parser = argparse.ArgumentParser()
# parser.add_argument('features', type=str, help='input features')
# parser.add_argument('data', type=str, help='input data')
# parser.add_argument('output', type=str, help='output features')
# parser.add_argument('--add-confidence', action='store_true', help='add CREPE confidence to features')
# parser.add_argument('--viterbi', action='store_true', help='enable viterbi algo for pitch tracking')
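# run_lpc: run the per-frame LPC analysis filters over the signal and return the LPC prediction
# and the prediction error (residual); the residual is used further down to recompute the xcorr
# feature at the predicted pitch lag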
def run_lpc(signal, lpcs, frame_length=160):
num_frames, lpc_order = lpcs.shape
prediction = np.concatenate(
[- np.convolve(signal[i * frame_length : (i + 1) * frame_length + lpc_order - 1], lpcs[i], mode='valid') for i in range(num_frames)]
)
error = signal[lpc_order :] - prediction
return prediction, error
if __name__ == "__main__":
args = parser.parse_args()
features = np.memmap(args.features, dtype=np.float32,mode = 'r').reshape((-1, 36))
data = np.memmap(args.data, dtype=np.int16,mode = 'r').reshape((-1, 2))
num_frames = features.shape[0]
feature_dim = features.shape[1]
assert feature_dim == 36
# if args.add_confidence:
# feature_dim += 1
output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
output[:, :36] = features
# lpc coefficients and signal
lpcs = features[:, 20:36]
sig = data[:, 1]
# parameters
# use_viterbi=args.viterbi
# constants
pitch_min = 32
pitch_max = 256
lpc_order = 16
fs = 16000
frame_length = 160
overlap_frames = 100
chunk_size = 10000
history_length = frame_length * overlap_frames
history = np.zeros(history_length, dtype=np.int16)
pitch_position=18
xcorr_position=19
conf_position=36
num_frames = len(sig) // 160 - 1
frame_start = 0
frame_stop = min(frame_start + chunk_size, num_frames)
signal_start = 0
signal_stop = frame_stop * frame_length
niters = (num_frames - 1)//chunk_size
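# Process the signal in chunks of chunk_size frames; overlap_frames frames of history are
# prepended to every chunk so the GRU and the LPC/xcorr lookback have context, and only the
# non-overlapping part of each chunk's output is written back to the feature file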
for i in tqdm.trange(niters):
if (frame_start > num_frames - 1):
break
chunk = np.concatenate((history, sig[signal_start:signal_stop]))
chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))
# time, frequency, confidence, _ = crepe.predict(chunk, fs, center=True, viterbi=True,verbose=0)
# Feature computation
spec = stft(x = np.concatenate([np.zeros(80),chunk_la/(2**15 - 1)]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)
subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw'])
feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw')
os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw')
if dict_params['data_format'] == 'if':
feature = feature_if
elif dict_params['data_format'] == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
# Compute pitch with my model
model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
frequency = 62.5*2**(model_cents/1200)
frequency = frequency[overlap_frames : overlap_frames + frame_stop - frame_start]
# confidence = confidence[overlap_frames : overlap_frames + frame_stop - frame_start]
# convert frequencies to periods
periods = np.round(fs / frequency)
# adjust to pitch range
# confidence[periods < pitch_min] = 0
# confidence[periods > pitch_max] = 0
periods = np.clip(periods, pitch_min, pitch_max)
output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50
# if args.replace_xcorr:
# re-calculate xcorr
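# Normalized cross-correlation of the LPC residual at the predicted pitch lag p:
# <f1, f2> / sqrt(<f1, f1> <f2, f2>), where f1 is the current residual frame and f2 is the
# residual delayed by p samples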
frame_offset = (pitch_max + frame_length - 1) // frame_length
offset = frame_offset * frame_length
padding = lpc_order
if frame_start < frame_offset:
lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
else:
lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop]
pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length)
xcorr = np.zeros(frame_stop - frame_start)
for j, p in enumerate(periods.astype(np.int16)):
if p > 0:
f1 = error[offset + j * frame_length : offset + (j + 1) * frame_length]
f2 = error[offset + j * frame_length - p : offset + (j + 1) * frame_length - p]
xcorr[j] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)
output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5
# update buffers and indices
history = chunk[-history_length :]
frame_start += chunk_size
frame_stop += chunk_size
frame_stop = min(frame_stop, num_frames)
signal_start = frame_start * frame_length
signal_stop = frame_stop * frame_length


@@ -0,0 +1,34 @@
# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories
# Make folder for combined audio
mkdir -p './combined_mic_16k/'
# Make folder for combined pitch reference
mkdir -p './combined_reference_f0/'
# Resample Male Audio
for i in ./MALE/MIC/**/*.wav; do
j="$(basename "$i" .wav)"
echo $j
sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
done
# Resample Female Audio
for i in ./FEMALE/MIC/**/*.wav; do
j="$(basename "$i" .wav)"
echo $j
sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
done
# Copy Male reference pitch files
for i in ./MALE/REF/**/*.f0; do
j="$(basename "$i" .f0)"
echo $j
cp "$i" ./combined_reference_f0/
done
# Copy Female reference pitch files
for i in ./FEMALE/REF/**/*.f0; do
j="$(basename "$i" .f0)"
echo $j
cp "$i" ./combined_reference_f0/
done


@@ -0,0 +1,162 @@
"""
Training the neural pitch estimator
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('features_if', type=str, help='.f32 IF Features for training (generated by augmentation script)')
parser.add_argument('features_xcorr', type=str, help='.f32 Xcorr Features for training (generated by augmentation script)')
parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
args = parser.parse_args()
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
# Seed numpy/torch from the current time and record both seeds in the saved config for reproducibility
import time
np_seed = int(time.time())
torch_seed = int(time.time())
import json
import torch
torch.manual_seed(torch_seed)
import numpy as np
np.random.seed(np_seed)
from utils import count_parameters
import tqdm
import sys
from datetime import datetime
from evaluation import rpa
# print(list(range(torch.cuda.device_count())))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
from models import loader_joint as loader
if args.data_format == 'if':
from models import large_if_ccode as model
pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim)
elif args.data_format == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(args.xcorr_dimension,args.gru_dim,args.output_dim)
else:
from models import large_joint as model
pitch_nn = model(args.freq_keep*3,args.xcorr_dimension,args.gru_dim,args.output_dim)
dataset_training = loader(args.features_if,args.features_pitch,args.features_xcorr,args.confidence_threshold,args.context,args.data_format)
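# Confidence-weighted cross entropy over the pitch bins: 'default' is standard categorical cross
# entropy; 'robust' is the generalized (Lq) cross entropy, approximately (1 - p_y^q)/q with q = 0.7,
# which is less sensitive to noisy CREPE labels; both are weighted per frame by the CREPE confidence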
def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
logits_softmax = torch.nn.Softmax(dim = 1)(logits).permute(0,2,1)
labels_one_hot = torch.nn.functional.one_hot(labels.long(),nmax)
if choice == 'default':
# Categorical Cross Entropy
CE = -torch.sum(torch.log(logits_softmax*labels_one_hot + 1.0e-6)*labels_one_hot,dim=-1)
CE = torch.sum(confidence*CE)
else:
# Robust Cross Entropy
CE = (1.0/q)*(1 - torch.sum(torch.pow(logits_softmax*labels_one_hot + 1.0e-7,q),dim=-1) )
CE = torch.sum(confidence*CE)
return CE
# features = args.features
# pitch = args.crepe_pitch
# dataset_training = loader(features,pitch,args.confidence_threshold,args.freq_keep,args.context)
# dataset_training = loader(features,pitch,'../../../../testing/testing_features_10pct_xcorr.f32')
train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05],generator=torch.Generator().manual_seed(torch_seed))
batch_size = 256
train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
# pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim).to(device)
pitch_nn = pitch_nn.to(device)
num_params = count_parameters(pitch_nn)
learning_rate = args.learning_rate
model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate)
num_epochs = args.epochs
for epoch in range(num_epochs):
losses = []
pitch_nn.train()
with tqdm.tqdm(train_dataloader) as train_epoch:
for i, (xi, yi, ci) in enumerate(train_epoch):
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
pi = pitch_nn(xi.float())
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
model_opt.zero_grad()
loss.backward()
model_opt.step()
losses.append(loss.item())
avg_loss = np.mean(losses)
train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss})
if epoch % 5 == 0:
pitch_nn.eval()
losses = []
with tqdm.tqdm(test_dataloader) as test_epoch:
for i, (xi, yi, ci) in enumerate(test_epoch):
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
pi = pitch_nn(xi.float())
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
losses.append(loss.item())
avg_loss = np.mean(losses)
test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss})
pitch_nn.eval()
rpa(pitch_nn,device,data_format = args.data_format)
config = dict(
data_format = args.data_format,
epochs = num_epochs,
window_size = args.N,
hop_factor = args.H,
freq_keep = args.freq_keep,
batch_size = batch_size,
learning_rate = learning_rate,
confidence_threshold = args.confidence_threshold,
model_parameters = num_params,
np_seed = np_seed,
torch_seed = torch_seed,
xcorr_dim = args.xcorr_dimension,
dim_input = 3*args.freq_keep,
gru_dim = args.gru_dim,
output_dim = args.output_dim,
choice_cel = args.choice_cel,
context = args.context,
)
now = datetime.now()
dir_pth_save = args.output_folder
dir_network = dir_pth_save + str(now) + '_net_' + args.data_format + '.pth'
dir_dictparams = dir_pth_save + str(now) + '_config_' + args.data_format + '.json'
# Save Weights
torch.save(pitch_nn.state_dict(), dir_network)
# Save Config
with open(dir_dictparams, 'w') as fp:
json.dump(config, fp)


@@ -0,0 +1,59 @@
"""
Utility functions that are commonly used
"""
import numpy as np
from scipy.signal import windows, lfilter
from prettytable import PrettyTable
# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
def count_parameters(model):
table = PrettyTable(["Modules", "Parameters"])
total_params = 0
for name, parameter in model.named_parameters():
if not parameter.requires_grad: continue
param = parameter.numel()
table.add_row([name, param])
total_params+=param
print(table)
print(f"Total Trainable Params: {total_params}")
return total_params
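# stft: zero-pad the signal, frame it with window w (length N, hop H), and return the rfft of
# each frame; output shape is (num_frames, N//2 + 1)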
def stft(x, w = 'boxcar', N = 320, H = 160):
x = np.concatenate([x,np.zeros(N)])
# win_custom = np.concatenate([windows.hann(80)[:40],np.ones(240),windows.hann(80)[40:]])
return np.stack([np.fft.rfft(x[i:i + N]*windows.get_window(w,N)) for i in np.arange(0,x.shape[0]-N,H)])
def random_filter(x):
# Randomly filter x with second order IIR filter with coefficients in between -3/8,3/8
filter_coeff = np.random.uniform(low = -3.0/8, high = 3.0/8, size = 4)
b = [1,filter_coeff[0],filter_coeff[1]]
a = [1,filter_coeff[2],filter_coeff[3]]
return lfilter(b,a,x)
def feature_xform(feature):
"""
Take as input the (N * 256) xcorr features output by LPCNet and perform the following
1. Downsample and upsample by 2 (followed by smoothing), then stack with the original along a new last axis
2. Append positional embeddings (of dim k) corresponding to each xcorr lag (currently commented out)
"""
from scipy.signal import resample_poly, lfilter
feature_US = lfilter([0.25,0.5,0.25],[1],resample_poly(feature,2,1,axis = 1),axis = 1)[:,:feature.shape[1]]
feature_DS = lfilter([0.5,0.5],[1],resample_poly(feature,1,2,axis = 1),axis = 1)
Z_append = np.zeros((feature.shape[0],feature.shape[1] - feature_DS.shape[1]))
feature_DS = np.concatenate([feature_DS,Z_append],axis = -1)
# pos_embedding = []
# for i in range(k):
# pos_embedding.append(np.cos((2**i)*np.pi*((np.repeat(np.arange(feature.shape[1]).reshape(feature.shape[1],1),feature.shape[0],axis = 1)).T/(2*feature.shape[1]))))
# pos_embedding = np.stack(pos_embedding,axis = -1)
feature = np.stack((feature_DS,feature,feature_US),axis = -1)
# feature = np.concatenate((feature,pos_embedding),axis = -1)
return feature