Python code for neural pitch

commit f38b4a317f (parent d88dd89358)
11 changed files with 1481 additions and 0 deletions
dnn/torch/neural-pitch/README.md (new file, 18 lines)
@@ -0,0 +1,18 @@
## Neural Pitch Estimation

- Dataset installation
    1. Download and unzip the PTDB dataset:
        wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
        unzip SPEECH_DATA_ZIPPED.zip
    2. Inside the extracted "SPEECH DATA" directory, run ptdb_process.sh to combine the male/female audio and pitch references.
    3. To download and combine the DEMAND noise dataset, simply run download_demand.sh.

- LPCNet preparation
    1. To extract xcorr features, add lpcnet_extractor.c, add the relevant functions to lpcnet_enc.c, add the new sources/headers to Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor object.

- Dataset augmentation and training (check the arguments to each of the following; example invocations are sketched below)
    1. Run data_augmentation.py.
    2. Run training.py on the augmented data.
    3. Run experiments.py.
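
Example invocation sketch (illustrative only; paths are placeholders, and the feature/pitch file names follow the prefixes the augmentation script writes):

    python data_augmentation.py speech.raw out/aug ./lpcnet_xcorr_extractor ./combined_demand_channels/ --flag_xcorr True
    python training.py out/aug_iffeat.f32 out/aug_xcorr.f32 out/aug_pitches.npy checkpoints/ if
    python experiments.py <ptdb_root> results.json model --pth_file checkpoints/<checkpoint>.pth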
dnn/torch/neural-pitch/data_augmentation.py (new file, 149 lines)
@@ -0,0 +1,149 @@
"""
Perform data augmentation (gain, additive noise, random filtering) on input TTS data

1. Read in chunks and compute the clean pitch first
2. Then add in the augmentation (noise/level/response)
    - Adds filtered noise from the DEMAND dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
    - When using the DEMAND dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
3. Use this "augmented" audio for feature computation, and compute the pitch using CREPE on the clean input

Note: to stay consistent with the measured CREPE offset, we do the following
    - We pad the input audio to the zero-centered CREPE estimator with 80 zeros
    - We pad the input audio to our feature computation with 160 zeros to center the frames
"""

import argparse
parser = argparse.ArgumentParser()

parser.add_argument('data', type=str, help='input raw audio data')
parser.add_argument('output', type=str, help='output directory')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='location of the DEMAND dataset')
parser.add_argument('--flag_xcorr', type=bool, help='flag to additionally dump xcorr features', choices=[True, False], default=False, required=False)
parser.add_argument('--fraction_input_use', type=float, help='fraction of the input data to use', default=0.3, required=False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if there are multiple GPUs', default=0, required=False)
parser.add_argument('--choice_augment', type=str, help='choice of noise augmentation: additive synthetic noise or noise from the DEMAND dataset', choices=['demand', 'synthetic'], default='demand', required=False)
parser.add_argument('--fraction_clean', type=float, help='fraction of the data to keep clean (i.e. not augmented at all)', default=0.2, required=False)
parser.add_argument('--chunk_size', type=int, help='number of samples to augment in each iteration', default=80000, required=False)
parser.add_argument('--N', type=int, help='STFT window size', default=320, required=False)
parser.add_argument('--H', type=int, help='STFT hop size', default=160, required=False)
parser.add_argument('--freq_keep', type=int, help='number of frequencies to keep', default=30, required=False)

args = parser.parse_args()

import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

from utils import stft, random_filter

import numpy as np
import tqdm
import crepe
import random
import glob
import subprocess

data_full = np.memmap(args.data, dtype=np.int16, mode='r')
data = data_full[:int(args.fraction_input_use*data_full.shape[0])]

# list_features = []
list_cents = []
list_confidences = []

N = args.N
H = args.H
freq_keep = args.freq_keep
# Minimum/maximum pitch periods, decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period
chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])

output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
    output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=((data.shape[0]//chunk_size - 1)*num_frames_chunk, 257), mode='w+')

fraction_clean = args.fraction_clean

noise_dataset = args.noise_dataset

for i in tqdm.trange(data.shape[0]//chunk_size - 1):
    chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)

    # Clean pitch/confidence estimate
    # Pad the input to CREPE with 80 samples so the estimates align with our frames
    _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80), chunk]), 16000, center=True, viterbi=True, verbose=0)
    cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)
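    # With f_ref = 16000/256 = 62.5 Hz, e.g. a 125 Hz pitch maps to
    # 1200*np.log2(125/62.5) = 1200 cents; the 1.0e-8 keeps the log finite
    # on unvoiced (pitch == 0) frames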

    # Filter out-of-range pitches/confidences
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0

    # Keep a fraction of the data clean; augment only the remaining (1 - fraction)
    if np.random.rand() > fraction_clean:
        # Response: generate a controlled/random 2nd-order IIR filter and filter the chunk
        chunk = random_filter(chunk)

        # Level/gain response (scale by a random gain between 1.0e-3 and 10)
        # Generate a random gain in dB and then convert it to a linear scale
        g_dB = np.random.uniform(low=-60, high=20, size=1)
        # g_dB = 0
        g = 10**(g_dB/20)

        # Noise addition (add randomly colored 2nd-order noise at a random SNR)
        # Generate a noise SNR value and add the corresponding noise
        snr_dB = np.random.uniform(low=-20, high=30, size=1)

        if args.choice_augment == 'synthetic':
            n = np.random.randn(chunk_size)
        else:
            list_noisefiles = noise_dataset + '*.wav'
            noise_file = random.choice(glob.glob(list_noisefiles))
            n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
            # 16000*60 samples are excluded because the last minute of each noise file is held out for testing
            rand_range = np.random.randint(low=0, high=(n.shape[0] - 16000*60 - chunk.shape[0]))
            n = n[rand_range:rand_range + chunk.shape[0]]

        # Randomly filter the sampled noise as well
        n = random_filter(n)
        # Pick a random prime below 541 and zero out that many trailing noise samples
        # (to prevent the GRU from picking up temporal patterns in the noise)
        Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
        n[chunk_size - Nprime:] = np.zeros(Nprime)
        snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
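        # snr_multiplier scales the noise so that
        # 10*log10(sum(chunk**2)/sum((snr_multiplier*n)**2)) equals snr_dB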

        chunk = g*(chunk + snr_multiplier*n)

    # Zero-pad the input audio with 160 samples to center the frames
    spec = stft(x=np.concatenate([np.zeros(160), chunk]), w='boxcar', N=N, H=H).T
    phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
    feature = feature[:, list_indices_keep]

    if args.flag_xcorr:
        # Dump the noisy audio into a temp file
        data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
        data_temp[:chunk.shape[0]] = (chunk*(2**15 - 1)).astype(np.int16)

        subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)

        os.remove('./temp_augment.raw')
        os.remove('./temp_augment_xcorr.f32')

        num_frames = min(cent.shape[0], feature.shape[0], feature_xcorr.shape[0], num_frames_chunk)
        feature_xcorr = feature_xcorr[:num_frames]
        output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature_xcorr
    else:
        # Without xcorr features, only the cent/feature lengths constrain the frame count
        num_frames = min(cent.shape[0], feature.shape[0], num_frames_chunk)
    feature = feature[:num_frames, :]
    cent = cent[:num_frames]
    confidence = confidence[:num_frames]
    output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk, :] = feature

    list_cents.append(cent)
    list_confidences.append(confidence)

list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)

np.save(args.output + '_pitches', np.vstack([list_cents, list_confidences]))
dnn/torch/neural-pitch/download_demand.sh (new file, 43 lines)
@@ -0,0 +1,43 @@
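# Download all 16 kHz DEMAND noise recordings, then copy every channel of every
# recording into a single flat directory, ./combined_demand_channels/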
wget https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip
wget https://zenodo.org/record/1227121/files/DLIVING_16k.zip
wget https://zenodo.org/record/1227121/files/DWASHING_16k.zip
wget https://zenodo.org/record/1227121/files/NFIELD_16k.zip
wget https://zenodo.org/record/1227121/files/NPARK_16k.zip
wget https://zenodo.org/record/1227121/files/NRIVER_16k.zip
wget https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip
wget https://zenodo.org/record/1227121/files/OMEETING_16k.zip
wget https://zenodo.org/record/1227121/files/OOFFICE_16k.zip
wget https://zenodo.org/record/1227121/files/PCAFETER_16k.zip
wget https://zenodo.org/record/1227121/files/PRESTO_16k.zip
wget https://zenodo.org/record/1227121/files/PSTATION_16k.zip
wget https://zenodo.org/record/1227121/files/TMETRO_16k.zip
wget https://zenodo.org/record/1227121/files/TCAR_16k.zip
wget https://zenodo.org/record/1227121/files/TBUS_16k.zip
wget https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip
wget https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip

unzip '*.zip'

mkdir -p ./combined_demand_channels/
for file in */*.wav; do
    parentdir="$(dirname "$file")"
    echo "$parentdir"
    fname="$(basename "$file")"
    cp "$file" ./combined_demand_channels/"$parentdir"+"$fname"
done
dnn/torch/neural-pitch/evaluation.py (new file, 464 lines)
@@ -0,0 +1,464 @@
"""
|
||||
Evaluation script to compute the Raw Pitch Accuracy
|
||||
Procedure:
|
||||
- Look at all voiced frames in file
|
||||
- Compute number of pitches in those frames that lie within a 50 cent threshold
|
||||
RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed accross all files)
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
|
||||
from prettytable import PrettyTable
|
||||
import numpy as np
|
||||
import glob
|
||||
import random
|
||||
import tqdm
|
||||
import torch
|
||||
import librosa
|
||||
import json
|
||||
from utils import stft, random_filter, feature_xform
|
||||
import subprocess
|
||||
import crepe
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def rca(reference,input,voicing,thresh = 25):
|
||||
idx_voiced = np.where(voicing != 0)[0]
|
||||
acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0]
|
||||
return acc.shape[0]
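
# rca counts voiced frames whose estimate lies within `thresh` cents of the reference,
# e.g. rca(np.array([1200, 2400]), np.array([1210, 2500]), np.array([1, 1])) returns 1
# (only the first frame falls inside the default 25-cent threshold)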

def sweep_rca(reference, input, voicing, thresh=25, ind_arr=np.arange(-10, 10)):
    l = []
    for i in ind_arr:
        l.append(rca(reference, np.roll(input, i), voicing, thresh))
    l = np.array(l)

    return np.max(l)

def rpa(model, device='cpu', data_format='if'):
    list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
    dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
    # random_shuffle = list(np.random.permutation(len(list_files)))
    random.shuffle(list_files)
    list_files = list_files[:1000]

    # C_lp = 0
    # C_lp_m = 0
    # C_lp_f = 0
    # list_rca_model_lp = []
    # list_rca_male_lp = []
    # list_rca_female_lp = []

    # C_hp = 0
    # C_hp_m = 0
    # C_hp_f = 0
    # list_rca_model_hp = []
    # list_rca_male_hp = []
    # list_rca_female_hp = []

    C_all = 0
    C_all_m = 0
    C_all_f = 0
    list_rca_model_all = []
    list_rca_male_all = []
    list_rca_female_all = []

    thresh = 50
    N = 320
    H = 160
    freq_keep = 30

    for idx in tqdm.trange(len(list_files)):
        audio_file = list_files[idx]
        file_name = os.path.basename(list_files[idx])[:-4]

        audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
        offset = 432
        audio = audio[offset:]
        rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=320, hop_length=160))

        spec = stft(x=np.concatenate([np.zeros(160), audio]), w='boxcar', N=N, H=H).T
        phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
        idx_save = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
        feature_if = feature[:, idx_save]

        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
        data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)

        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)
        # feature_xcorr = feature_xform(feature_xcorr)

        os.remove('./temp.raw')
        os.remove('./temp_xcorr.f32')

        if data_format == 'if':
            feature = feature_if
        elif data_format == 'xcorr':
            feature = feature_xcorr
        else:
            indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
            feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

        pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
        pitch = np.loadtxt(pitch_file_name)[:, 0]
        voicing = np.loadtxt(pitch_file_name)[:, 1]
        indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
        pitch = pitch[:indmin]
        voicing = voicing[:indmin]
        rmse = rmse[:indmin]
        voicing = voicing*(rmse > 0.05*np.max(rmse))
        if "mic_F" in audio_file:
            idx_correct = np.where(pitch < 125)
            voicing[idx_correct] = 0

        cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')

        # if (model == 'penn'):
        #     model_frequency, _ = penn.from_audio(
        #         torch.from_numpy(audio).unsqueeze(0).float(),
        #         16000,
        #         hopsize=0.01,
        #         fmin=(16000.0/256),
        #         fmax=500,
        #         checkpoint=penn.DEFAULT_CHECKPOINT,
        #         batch_size=32,
        #         pad=True,
        #         interp_unvoiced_at=0.065,
        #         gpu=0)
        #     model_frequency = model_frequency.cpu().detach().squeeze().numpy()
        #     model_cents = 1200*np.log2(model_frequency/(16000/256))

        # elif (model == 'crepe'):
        #     _, model_frequency, _, _ = crepe.predict(audio, 16000, viterbi=vflag, center=True, verbose=0)
        #     lpcnet_file_name = '/home/ubuntu/Code/Datasets/SPEECH_DATA/lpcnet_f0_16k_residual/' + file_name + '_f0.f32'
        #     period_lpcnet = np.fromfile(lpcnet_file_name, dtype='float32')
        #     model_frequency = 16000/(period_lpcnet + 1.0e-6)
        #     model_cents = 1200*np.log2(model_frequency/(16000/256))
        # else:
        model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
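        # The network emits 192 logits per frame at 20-cent resolution, so argmax*20
        # spans 0..3820 cents above the 62.5 Hz reference (roughly 62.5 to 570 Hz)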
        # model_cents = np.roll(model_cents,-1*3)

        num_frames = min(cent.shape[0], model_cents.shape[0])
        pitch = pitch[:num_frames]
        cent = cent[:num_frames]
        voicing = voicing[:num_frames]
        model_cents = model_cents[:num_frames]

        voicing_all = np.copy(voicing)
        # Force frames with pitch < 65 Hz or > 500 Hz to unvoiced so the comparison stays within our model's range
        force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500) == True)
        voicing_all[force_out_of_pitch] = 0
        C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

        # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
        list_rca_model_all.append(rca(cent, model_cents, voicing_all, thresh))
        # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))

        if "mic_M" in audio_file:
            # list_rca_male_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
            list_rca_male_all.append(rca(cent, model_cents, voicing_all, thresh))
            C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
        else:
            # list_rca_female_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
            list_rca_female_all.append(rca(cent, model_cents, voicing_all, thresh))
            C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]

        """
        # Low pitch estimation
        voicing_lp = np.copy(voicing)
        force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 125) == True)
        voicing_lp[force_out_of_pitch] = 0
        C_lp = C_lp + np.where(voicing_lp != 0)[0].shape[0]

        # list_rca_model_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
        list_rca_model_lp.append(rca(cent, model_cents, voicing_lp, thresh))

        if "mic_M" in audio_file:
            # list_rca_male_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
            list_rca_male_lp.append(rca(cent, model_cents, voicing_lp, thresh))
            C_lp_m = C_lp_m + np.where(voicing_lp != 0)[0].shape[0]
        else:
            # list_rca_female_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
            list_rca_female_lp.append(rca(cent, model_cents, voicing_lp, thresh))
            C_lp_f = C_lp_f + np.where(voicing_lp != 0)[0].shape[0]

        # High pitch estimation
        voicing_hp = np.copy(voicing)
        force_out_of_pitch = np.where(np.logical_or(pitch < 125, pitch > 500) == True)
        voicing_hp[force_out_of_pitch] = 0
        C_hp = C_hp + np.where(voicing_hp != 0)[0].shape[0]

        # list_rca_model_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
        list_rca_model_hp.append(rca(cent, model_cents, voicing_hp, thresh))

        if "mic_M" in audio_file:
            # list_rca_male_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
            list_rca_male_hp.append(rca(cent, model_cents, voicing_hp, thresh))
            C_hp_m = C_hp_m + np.where(voicing_hp != 0)[0].shape[0]
        else:
            # list_rca_female_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
            list_rca_female_hp.append(rca(cent, model_cents, voicing_hp, thresh))
            C_hp_f = C_hp_f + np.where(voicing_hp != 0)[0].shape[0]
        # list_rca_model.append(acc_model)
        # list_rca_crepe.append(acc_crepe)
        # list_rca_lpcnet.append(acc_lpcnet)
        # list_rca_penn.append(acc_penn)
        """

    # list_rca_crepe = np.array(list_rca_crepe)
    # list_rca_model_lp = np.array(list_rca_model_lp)
    # list_rca_male_lp = np.array(list_rca_male_lp)
    # list_rca_female_lp = np.array(list_rca_female_lp)

    # list_rca_model_hp = np.array(list_rca_model_hp)
    # list_rca_male_hp = np.array(list_rca_male_hp)
    # list_rca_female_hp = np.array(list_rca_female_hp)

    list_rca_model_all = np.array(list_rca_model_all)
    list_rca_male_all = np.array(list_rca_male_all)
    list_rca_female_all = np.array(list_rca_female_all)
    # list_rca_lpcnet = np.array(list_rca_lpcnet)
    # list_rca_penn = np.array(list_rca_penn)

    x = PrettyTable()

    x.field_names = ["Experiment", "Mean RPA"]
    x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
    # x.add_row(["Both low pitches", np.sum(list_rca_model_lp)/C_lp])
    # x.add_row(["Both high pitches", np.sum(list_rca_model_hp)/C_hp])

    x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
    # x.add_row(["Male low pitches", np.sum(list_rca_male_lp)/C_lp_m])
    # x.add_row(["Male high pitches", np.sum(list_rca_male_hp)/C_hp_m])

    x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])
    # x.add_row(["Female low pitches", np.sum(list_rca_female_lp)/C_lp_f])
    # x.add_row(["Female high pitches", np.sum(list_rca_female_hp)/C_hp_f])

    print(x)

    return None

def cycle_eval(list_files_pth, noise_type='synthetic', noise_dataset=None, list_snr=[-20, -15, -10, -5, 0, 5, 10, 15, 20], ptdb_dataset_path=None, fraction=0.1, thresh=50):
    """
    Cycle through the SNR evaluation for a list of .pth files
    """
    # list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
    # dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
    # random_shuffle = list(np.random.permutation(len(list_files)))
    list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
    dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
    random.shuffle(list_files)
    list_files = list_files[:int(fraction*len(list_files))]

    # list_nfiles = ['DKITCHEN','NFIELD','OHALLWAY','PCAFETER','SPSQUARE','TCAR','DLIVING','NPARK','OMEETING','PRESTO','STRAFFIC','TMETRO','DWASHING','NRIVER','OOFFICE','PSTATION','TBUS']

    dict_models = {}
    list_snr.append(np.inf)
    # thresh = 50

    for f in list_files_pth:
        if (f != 'crepe') and (f != 'lpcnet'):
            fname = os.path.basename(f).split('_')[0] + '_' + os.path.basename(f).split('_')[-1][:-4]
            config_path = os.path.dirname(f) + '/' + os.path.basename(f).split('_')[0] + '_' + 'config_' + os.path.basename(f).split('_')[-1][:-4] + '.json'
            with open(config_path) as json_file:
                dict_params = json.load(json_file)

            if dict_params['data_format'] == 'if':
                from models import large_if_ccode as model
                pitch_nn = model(dict_params['freq_keep']*3, dict_params['gru_dim'], dict_params['output_dim']).to(device)
            elif dict_params['data_format'] == 'xcorr':
                from models import large_xcorr as model
                pitch_nn = model(dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim']).to(device)
            else:
                from models import large_joint as model
                pitch_nn = model(dict_params['freq_keep']*3, dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim']).to(device)

            pitch_nn.load_state_dict(torch.load(f))

            N = dict_params['window_size']
            H = dict_params['hop_factor']
            freq_keep = dict_params['freq_keep']

            list_mean = []
            list_std = []
            for snr_dB in list_snr:
                C_all = 0
                C_correct = 0
                for idx in tqdm.trange(len(list_files)):
                    audio_file = list_files[idx]
                    file_name = os.path.basename(list_files[idx])[:-4]

                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
                    offset = 432
                    audio = audio[offset:]
                    rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=N, hop_length=H))

                    if noise_type != 'synthetic':
                        list_noisefiles = noise_dataset + '*.wav'
                        noise_file = random.choice(glob.glob(list_noisefiles))
                        n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
                        # Sample within the first 5 minutes; the last minute of noise is reserved for testing
                        rand_range = np.random.randint(low=0, high=(16000*60*5 - audio.shape[0]))
                        n = n[rand_range:rand_range + audio.shape[0]]
                    else:
                        n = np.random.randn(audio.shape[0])
                        n = random_filter(n)

                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
                    audio = audio + snr_multiplier*n

                    spec = stft(x=np.concatenate([np.zeros(160), audio]), w='boxcar', N=N, H=H).T
                    phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
                    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
                    idx_save = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
                    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
                    feature_if = feature[:, idx_save]

                    data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
                    # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
                    data_temp[:audio.shape[0]] = (audio*(2**15 - 1)).astype(np.int16)

                    subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
                    feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1, 256), order='C'), axis=1)
                    ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
                    feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)

                    os.remove('./temp.raw')
                    os.remove('./temp_xcorr.f32')

                    if dict_params['data_format'] == 'if':
                        feature = feature_if
                    elif dict_params['data_format'] == 'xcorr':
                        feature = feature_xcorr
                    else:
                        indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
                        feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
                    pitch = np.loadtxt(pitch_file_name)[:, 0]
                    voicing = np.loadtxt(pitch_file_name)[:, 1]
                    indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
                    pitch = pitch[:indmin]
                    voicing = voicing[:indmin]
                    rmse = rmse[:indmin]
                    voicing = voicing*(rmse > 0.05*np.max(rmse))
                    if "mic_F" in audio_file:
                        idx_correct = np.where(pitch < 125)
                        voicing[idx_correct] = 0

                    cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')

                    # elif (model == 'crepe'):
                    #     _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True, center=True, verbose=0)
                    #     model_cents = 1200*np.log2(model_frequency/(16000/256))
                    # else:
                    model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
                    model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
                    # model_cents = np.roll(model_cents,-1*3)

                    num_frames = min(cent.shape[0], model_cents.shape[0])
                    pitch = pitch[:num_frames]
                    cent = cent[:num_frames]
                    voicing = voicing[:num_frames]
                    model_cents = model_cents[:num_frames]

                    voicing_all = np.copy(voicing)
                    # Force frames with pitch < 65 Hz or > 500 Hz to unvoiced so the comparison stays within our model's range
                    force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500) == True)
                    voicing_all[force_out_of_pitch] = 0
                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

                    # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
                    C_correct = C_correct + rca(cent, model_cents, voicing_all, thresh)
                    # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
                list_mean.append(C_correct/C_all)
        else:
            fname = f
            list_mean = []
            list_std = []
            for snr_dB in list_snr:
                C_all = 0
                C_correct = 0
                for idx in tqdm.trange(len(list_files)):
                    audio_file = list_files[idx]
                    file_name = os.path.basename(list_files[idx])[:-4]

                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
                    offset = 432
                    audio = audio[offset:]
                    rmse = np.squeeze(librosa.feature.rms(y=audio, frame_length=320, hop_length=160))

                    if noise_type != 'synthetic':
                        list_noisefiles = noise_dataset + '*.wav'
                        noise_file = random.choice(glob.glob(list_noisefiles))
                        n = np.memmap(noise_file, dtype=np.int16, mode='r')/(2**15 - 1)
                        # Sample within the first 5 minutes; the last minute of noise is reserved for testing
                        rand_range = np.random.randint(low=0, high=(16000*60*5 - audio.shape[0]))
                        n = n[rand_range:rand_range + audio.shape[0]]
                    else:
                        n = np.random.randn(audio.shape[0])
                        n = random_filter(n)

                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
                    audio = audio + snr_multiplier*n

                    if f == 'crepe':
                        _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80), audio]), 16000, viterbi=True, center=True, verbose=0)
                        model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
                    else:
                        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
                        # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
                        data_temp[:audio.shape[0]] = (audio*(2**15 - 1)).astype(np.int16)

                        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
                        period_lpcnet = np.fromfile('./temp_period.f32', dtype='float32')
                        model_cents = 1200*np.log2(256/(period_lpcnet + 1.0e-8) + 1.0e-8)

                        os.remove('./temp.raw')
                        os.remove('./temp_xcorr.f32')
                        os.remove('./temp_period.f32')

                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
                    pitch = np.loadtxt(pitch_file_name)[:, 0]
                    voicing = np.loadtxt(pitch_file_name)[:, 1]
                    indmin = min(voicing.shape[0], rmse.shape[0], pitch.shape[0])
                    pitch = pitch[:indmin]
                    voicing = voicing[:indmin]
                    rmse = rmse[:indmin]
                    voicing = voicing*(rmse > 0.05*np.max(rmse))
                    if "mic_F" in audio_file:
                        idx_correct = np.where(pitch < 125)
                        voicing[idx_correct] = 0

                    cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch != 0) + 1.0e-8)).astype('int')
                    num_frames = min(cent.shape[0], model_cents.shape[0])
                    pitch = pitch[:num_frames]
                    cent = cent[:num_frames]
                    voicing = voicing[:num_frames]
                    model_cents = model_cents[:num_frames]

                    voicing_all = np.copy(voicing)
                    # Force frames with pitch < 65 Hz or > 500 Hz to unvoiced so the comparison stays within our model's range
                    force_out_of_pitch = np.where(np.logical_or(pitch < 65, pitch > 500) == True)
                    voicing_all[force_out_of_pitch] = 0
                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]

                    # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
                    C_correct = C_correct + rca(cent, model_cents, voicing_all, thresh)
                    # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
                list_mean.append(C_correct/C_all)
        dict_models[fname] = {}
        dict_models[fname]['list_SNR'] = list_mean[:-1]
        dict_models[fname]['inf'] = list_mean[-1]

    return dict_models
dnn/torch/neural-pitch/experiments.py (new file, 38 lines)
@@ -0,0 +1,38 @@
"""
|
||||
Running the experiments;
|
||||
1. RCA vs SNR for our models, CREPE, LPCNet
|
||||
"""
|
||||
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
|
||||
parser.add_argument('output', type=str, help='Output dump file name')
|
||||
parser.add_argument('method', type=str, help='Output Directory to save experiment dumps',choices=['model','lpcnet','crepe'])
|
||||
parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Datset',default = './',required=False)
|
||||
parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
|
||||
parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
|
||||
parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
|
||||
parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
|
||||
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
import os
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
|
||||
|
||||
import json
|
||||
from evaluation import cycle_eval
|
||||
|
||||
if args.method == 'model':
|
||||
dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
|
||||
else:
|
||||
dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
|
||||
|
||||
dict_store["method"] = args.method
|
||||
if args.method == 'model':
|
||||
dict_store['pth'] = args.pth_file
|
||||
|
||||
with open(args.output, 'w') as fp:
|
||||
json.dump(dict_store, fp)
dnn/torch/neural-pitch/export_neuralpitch_weights.py (new file, 89 lines)
@@ -0,0 +1,89 @@
"""
|
||||
/* Copyright (c) 2022 Amazon
|
||||
Written by Jan Buethe */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('checkpoint', type=str, help='rdovae model checkpoint')
|
||||
parser.add_argument('output_dir', type=str, help='output folder')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from models import large_if_ccode
|
||||
from wexchange.torch import dump_torch_weights
|
||||
from wexchange.c_export import CWriter, print_vector
|
||||
|
||||
def c_export(args, model):
|
||||
|
||||
message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
|
||||
|
||||
enc_writer = CWriter(os.path.join(args.output_dir, "neural_pitch_data"), message=message, model_struct_name='nnpitch')
|
||||
enc_writer.header.write(
|
||||
f"""
|
||||
#include "opus_types.h"
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
# encoder
|
||||
encoder_dense_layers = [
|
||||
('initial' , 'initial', 'TANH'),
|
||||
('upsample' , 'upsample', 'TANH')
|
||||
]
|
||||
|
||||
for name, export_name, _ in encoder_dense_layers:
|
||||
layer = model.get_submodule(name)
|
||||
dump_torch_weights(enc_writer, layer, name=export_name, verbose=True)
|
||||
|
||||
|
||||
encoder_gru_layers = [
|
||||
('gru' , 'gru', 'TANH'),
|
||||
]
|
||||
|
||||
enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, verbose=True, input_sparse=False, quantize=False)
|
||||
for name, export_name, _ in encoder_gru_layers])
|
||||
|
||||
del enc_writer
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
model = large_if_ccode()
|
||||
model.load_state_dict(torch.load(args.checkpoint,map_location='cpu'))
|
||||
c_export(args, model)
dnn/torch/neural-pitch/models.py (new file, 218 lines)
@@ -0,0 +1,218 @@
"""
|
||||
Pitch Estimation Models and dataloaders
|
||||
- Classification Based (Input features, output logits)
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
class large_if_ccode(torch.nn.Module):
|
||||
|
||||
def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
|
||||
super(large_if_ccode,self).__init__()
|
||||
|
||||
self.activation = torch.nn.Tanh()
|
||||
self.initial = torch.nn.Linear(input_dim,gru_dim)
|
||||
self.hidden = torch.nn.Linear(gru_dim,gru_dim)
|
||||
self.gru = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,batch_first = True)
|
||||
self.upsample = torch.nn.Linear(gru_dim,output_dim)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
x = self.initial(x)
|
||||
x = self.activation(x)
|
||||
x = self.hidden(x)
|
||||
x = self.activation(x)
|
||||
x,_ = self.gru(x)
|
||||
x = self.upsample(x)
|
||||
x = self.activation(x)
|
||||
x = x.permute(0,2,1)
|
||||
|
||||
return x
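
# Shape sketch for large_if_ccode (default dims): IF features of shape
# (batch, frames, 90) map to logits of shape (batch, 192, frames), i.e. one
# 192-way pitch-class distribution per frame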

class large_xcorr(torch.nn.Module):

    def __init__(self, input_dim=90, gru_dim=64, output_dim=192):
        super(large_xcorr, self).__init__()

        self.activation = torch.nn.Tanh()

        self.conv = torch.nn.Sequential(
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(1, 8, 3, bias=True),
            self.activation,
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(8, 8, 3, bias=True),
            self.activation,
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(8, 1, 3, bias=True),
            self.activation,
        )

        # self.conv = torch.nn.Sequential(
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(64,10,3),
        #     self.activation,
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(10,64,3),
        #     self.activation,
        # )

        self.downsample = torch.nn.Sequential(
            torch.nn.Linear(input_dim, gru_dim),
            self.activation
        )
        self.GRU = torch.nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
        self.upsample = torch.nn.Sequential(
            torch.nn.Linear(gru_dim, output_dim),
            self.activation
        )

    def forward(self, x):
        # x = x[:,:,:257].unsqueeze(-1)
        x = self.conv(x.unsqueeze(-1).permute(0, 3, 2, 1)).squeeze(1)
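        # x.unsqueeze(-1).permute(0,3,2,1) turns (batch, frames, lags) into
        # (batch, 1, lags, frames); ZeroPad2d((2,0,1,1)) with 3x3 kernels keeps the
        # lag axis intact and is causal along frames, so x is now (batch, lags, frames)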
        # print(x.shape)
        # x = self.conv(x.permute(0,3,2,1)).squeeze(1)
        x, _ = self.GRU(self.downsample(x.permute(0, 2, 1)))
        x = self.upsample(x).permute(0, 2, 1)

        # x = self.downsample(x)
        # x = self.activation(x)
        # x = self.conv(x.permute(0,2,1)).permute(0,2,1)
        # x,_ = self.GRU(x)
        # x = self.upsample(x).permute(0,2,1)
        return x

class large_joint(torch.nn.Module):
    """
    Joint IF-xcorr model:
    1D CNN on IF, merge with xcorr, 2D CNN on the merged features + GRU
    """

    def __init__(self, input_IF_dim=90, input_xcorr_dim=257, gru_dim=64, output_dim=192):
        super(large_joint, self).__init__()

        self.activation = torch.nn.Tanh()

        self.if_upsample = torch.nn.Sequential(
            torch.nn.Linear(input_IF_dim, 64),
            self.activation,
            torch.nn.Linear(64, 64),
            self.activation,
        )

        # self.if_upsample = torch.nn.Sequential(
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(90,10,3),
        #     self.activation,
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(10,257,3),
        #     self.activation,
        # )

        self.conv = torch.nn.Sequential(
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(1, 8, 3, bias=True),
            self.activation,
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(8, 8, 3, bias=True),
            self.activation,
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(8, 1, 3, bias=True),
            self.activation,
        )

        # self.conv = torch.nn.Sequential(
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(257,10,3),
        #     self.activation,
        #     torch.nn.ConstantPad1d((2,0),0),
        #     torch.nn.Conv1d(10,64,3),
        #     self.activation,
        # )

        self.downsample = torch.nn.Sequential(
            torch.nn.Linear(64 + input_xcorr_dim, gru_dim),
            self.activation
        )
        self.GRU = torch.nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
        self.upsample = torch.nn.Sequential(
            torch.nn.Linear(gru_dim, output_dim),
            self.activation
        )

    def forward(self, x):
        xcorr_feat = x[:, :, :257]
        if_feat = x[:, :, 257:]
        # x = torch.cat([xcorr_feat.unsqueeze(-1),self.if_upsample(if_feat).unsqueeze(-1)],axis = -1)
        xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0, 3, 2, 1)).squeeze(1).permute(0, 2, 1)
        if_feat = self.if_upsample(if_feat)
        x = torch.cat([xcorr_feat, if_feat], axis=-1)
        # x = self.conv(x.permute(0,3,2,1)).squeeze(1)
        x, _ = self.GRU(self.downsample(x))
        x = self.upsample(x).permute(0, 2, 1)

        return x


# Dataloaders
class loader(torch.utils.data.Dataset):
    def __init__(self, features_if, file_pitch, confidence_threshold=0.4, dimension_if=30, context=100):
        self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1, 3*dimension_if)

        # Resolution of 20 cents
        self.cents = np.rint(np.load(file_pitch)[0, :]/20)
        self.cents = np.clip(self.cents, 0, 179)
        self.confidence = np.load(file_pitch)[1, :]

        # Filter on the CREPE confidence
        self.confidence[self.confidence < confidence_threshold] = 0
        self.context = context
        # Clip both to the same size
        size_common = min(self.if_feat.shape[0], self.cents.shape[0])
        self.if_feat = self.if_feat[:size_common, :]
        self.cents = self.cents[:size_common]
        self.confidence = self.confidence[:size_common]

        frame_max = self.if_feat.shape[0]//context
        self.if_feat = np.reshape(self.if_feat[:frame_max*context, :], (frame_max, context, 3*dimension_if))
        self.cents = np.reshape(self.cents[:frame_max*context], (frame_max, context))
        self.confidence = np.reshape(self.confidence[:frame_max*context], (frame_max, context))

    def __len__(self):
        return self.if_feat.shape[0]

    def __getitem__(self, index):
        return torch.from_numpy(self.if_feat[index, :, :]), torch.from_numpy(self.cents[index]), torch.from_numpy(self.confidence[index])

class loader_joint(torch.utils.data.Dataset):
    def __init__(self, features_if, file_pitch, features_xcorr, confidence_threshold=0.4, context=100, choice_data='both'):
        self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1, 90)
        self.xcorr = np.memmap(features_xcorr, dtype=np.float32).reshape(-1, 257)
        self.cents = np.rint(np.load(file_pitch)[0, :]/20)
        self.cents = np.clip(self.cents, 0, 179)
        self.confidence = np.load(file_pitch)[1, :]
        # Filter on the CREPE confidence
        self.confidence[self.confidence < confidence_threshold] = 0
        self.context = context

        self.choice_data = choice_data

        frame_max = self.if_feat.shape[0]//context
        self.if_feat = np.reshape(self.if_feat[:frame_max*context, :], (frame_max, context, 90))
        self.cents = np.reshape(self.cents[:frame_max*context], (frame_max, context))
        self.xcorr = np.reshape(self.xcorr[:frame_max*context, :], (frame_max, context, 257))
        # self.cents = np.rint(60*np.log2(256/(self.periods + 1.0e-8))).astype('int')
        # self.cents = np.clip(self.cents,0,239)
        self.confidence = np.reshape(self.confidence[:frame_max*context], (frame_max, context))
        # print(self.if_feat.shape)

    def __len__(self):
        return self.if_feat.shape[0]

    def __getitem__(self, index):
        if self.choice_data == 'both':
            return torch.cat([torch.from_numpy(self.xcorr[index, :, :]), torch.from_numpy(self.if_feat[index, :, :])], dim=-1), torch.from_numpy(self.cents[index]), torch.from_numpy(self.confidence[index])
        elif self.choice_data == 'if':
            return torch.from_numpy(self.if_feat[index, :, :]), torch.from_numpy(self.cents[index]), torch.from_numpy(self.confidence[index])
        else:
            return torch.from_numpy(self.xcorr[index, :, :]), torch.from_numpy(self.cents[index]), torch.from_numpy(self.confidence[index])
dnn/torch/neural-pitch/neural_pitch_update.py (new file, 207 lines)
@@ -0,0 +1,207 @@
import argparse
parser = argparse.ArgumentParser()

parser.add_argument('features', type=str, help='features generated by dump_data')
parser.add_argument('data', type=str, help='data generated by dump_data (offset by 5 ms)')
parser.add_argument('output', type=str, help='output .f32 feature file with the pitch replaced by the neural pitch')
parser.add_argument('pth_file', type=str, help='.pth file to use for pitch')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('--device', type=str, help='compute device', default=None, required=False)
parser.add_argument('--replace_xcorr', type=bool, default=False, help='replace the LPCNet xcorr with the updated one')

args = parser.parse_args()

import os

from utils import stft, random_filter
import subprocess
import numpy as np
import json
import torch
import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if args.device is not None:
    device = torch.device(args.device)

# Load the appropriate model
config_path = os.path.dirname(args.pth_file) + '/' + os.path.basename(args.pth_file).split('_')[0] + '_' + 'config_' + os.path.basename(args.pth_file).split('_')[-1][:-4] + '.json'
with open(config_path) as json_file:
    dict_params = json.load(json_file)

if dict_params['data_format'] == 'if':
    from models import large_if_ccode as model
    pitch_nn = model(dict_params['freq_keep']*3, dict_params['gru_dim'], dict_params['output_dim']).to(device)
elif dict_params['data_format'] == 'xcorr':
    from models import large_xcorr as model
    pitch_nn = model(dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim']).to(device)
else:
    from models import large_joint as model
    pitch_nn = model(dict_params['freq_keep']*3, dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim']).to(device)

pitch_nn.load_state_dict(torch.load(args.pth_file))
pitch_nn = pitch_nn.to(device)

N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["OMP_NUM_THREADS"] = "16"

# parser.add_argument('--add-confidence', action='store_true', help='add CREPE confidence to features')
# parser.add_argument('--viterbi', action='store_true', help='enable viterbi algo for pitch tracking')


def run_lpc(signal, lpcs, frame_length=160):
    num_frames, lpc_order = lpcs.shape

    prediction = np.concatenate(
        [- np.convolve(signal[i * frame_length : (i + 1) * frame_length + lpc_order - 1], lpcs[i], mode='valid') for i in range(num_frames)]
    )
    error = signal[lpc_order :] - prediction
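    # Each frame contributes frame_length predicted samples, so `signal` must hold
    # num_frames*frame_length + lpc_order samples; `error` is the LPC residual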

    return prediction, error


if __name__ == "__main__":
    args = parser.parse_args()

    features = np.memmap(args.features, dtype=np.float32, mode='r').reshape((-1, 36))
    data = np.memmap(args.data, dtype=np.int16, mode='r').reshape((-1, 2))

    num_frames = features.shape[0]
    feature_dim = features.shape[1]

    assert feature_dim == 36

    # if args.add_confidence:
    #     feature_dim += 1

    output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
    output[:, :36] = features

    # LPC coefficients and signal
    lpcs = features[:, 20:36]
    sig = data[:, 1]

    # Parameters
    # use_viterbi = args.viterbi

    # Constants
    pitch_min = 32
    pitch_max = 256
    lpc_order = 16
    fs = 16000
    frame_length = 160
    overlap_frames = 100
    chunk_size = 10000
    history_length = frame_length * overlap_frames
    history = np.zeros(history_length, dtype=np.int16)
    pitch_position = 18
    xcorr_position = 19
    conf_position = 36

    num_frames = len(sig) // 160 - 1

    frame_start = 0
    frame_stop = min(frame_start + chunk_size, num_frames)
    signal_start = 0
    signal_stop = frame_stop * frame_length

    niters = (num_frames - 1)//chunk_size
    for i in tqdm.trange(niters):
        if frame_start > num_frames - 1:
            break
        chunk = np.concatenate((history, sig[signal_start:signal_stop]))
        chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))
        # time, frequency, confidence, _ = crepe.predict(chunk, fs, center=True, viterbi=True, verbose=0)

        # Feature computation
        spec = stft(x=np.concatenate([np.zeros(80), chunk_la/(2**15 - 1)]), w='boxcar', N=N, H=H).T
        phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
        idx_save = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
        feature_if = feature[:, idx_save]

        data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)

        subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw'])
        feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)

        os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw')
        os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw')

        if dict_params['data_format'] == 'if':
            feature = feature_if
        elif dict_params['data_format'] == 'xcorr':
            feature = feature_xcorr
        else:
            indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
            feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

        # Compute the pitch with the neural model
        model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
        frequency = 62.5*2**(model_cents/1200)
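        # Inverse of the training cent mapping: 0 cents -> 62.5 Hz (= 16000/256),
        # 1200 cents -> 125 Hz, one octave per 1200 cents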

        frequency = frequency[overlap_frames : overlap_frames + frame_stop - frame_start]
        # confidence = confidence[overlap_frames : overlap_frames + frame_stop - frame_start]

        # Convert frequencies to periods
        periods = np.round(fs / frequency)

        # Adjust to the pitch range
        # confidence[periods < pitch_min] = 0
        # confidence[periods > pitch_max] = 0
        periods = np.clip(periods, pitch_min, pitch_max)

        output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50

        # if args.replace_xcorr:
        # Re-calculate the xcorr
        frame_offset = (pitch_max + frame_length - 1) // frame_length
        offset = frame_offset * frame_length
        padding = lpc_order

        if frame_start < frame_offset:
            lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
        else:
            lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop]

        pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length)

        xcorr = np.zeros(frame_stop - frame_start)
        for j, p in enumerate(periods.astype(np.int16)):
            if p > 0:
                f1 = error[offset + j * frame_length : offset + (j + 1) * frame_length]
                f2 = error[offset + j * frame_length - p : offset + (j + 1) * frame_length - p]
                xcorr[j] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)
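                # Normalized cross-correlation between each LPC-residual frame and the
                # frame one estimated pitch period earlier; values near 1 indicate
                # strong periodicity at the predicted pitch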
|
||||
|
||||
output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5
|
||||
|
||||
# update buffers and indices
|
||||
history = chunk[-history_length :]
|
||||
|
||||
frame_start += chunk_size
|
||||
frame_stop += chunk_size
|
||||
frame_stop = min(frame_stop, num_frames)
|
||||
|
||||
signal_start = frame_start * frame_length
|
||||
signal_stop = frame_stop * frame_length
|
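For reference, a standalone sanity check of the 20-cent pitch-bin mapping used above; a minimal sketch assuming the same constants as the script (192 output bins starting at 62.5 Hz, fs = 16000):

    import numpy as np

    fs = 16000
    bins = np.arange(192)                   # classifier output indices
    cents = 20 * bins                       # 20-cent resolution
    freq_hz = 62.5 * 2 ** (cents / 1200)    # bin 0 -> 62.5 Hz, bin 60 -> 125 Hz (one octave up)
    periods = np.round(fs / freq_hz)        # period in samples at 16 kHz

    print(freq_hz[-1])                # ~568 Hz at the top bin
    print(periods[0], periods[-1])    # 256.0 down to 28.0 samples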
34
dnn/torch/neural-pitch/ptdb_process.sh
Normal file
@@ -0,0 +1,34 @@
# Copy into the PTDB root directory and run to combine all the male/female raw audio/references into the directories below

# Make folder for combined audio
mkdir -p './combined_mic_16k/'
# Make folder for combined pitch references
mkdir -p './combined_reference_f0/'

# Resample male audio to 16 kHz raw
for i in ./MALE/MIC/**/*.wav; do
    j="$(basename "$i" .wav)"
    echo "$j"
    sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer "./combined_mic_16k/$j.raw"
done

# Resample female audio to 16 kHz raw
for i in ./FEMALE/MIC/**/*.wav; do
    j="$(basename "$i" .wav)"
    echo "$j"
    sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer "./combined_mic_16k/$j.raw"
done

# Copy male reference pitch files
for i in ./MALE/REF/**/*.f0; do
    j="$(basename "$i" .f0)"
    echo "$j"
    cp "$i" ./combined_reference_f0/
done

# Copy female reference pitch files
for i in ./FEMALE/REF/**/*.f0; do
    j="$(basename "$i" .f0)"
    echo "$j"
    cp "$i" ./combined_reference_f0/
done
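The combined files written above are headerless 16-bit signed PCM at 16 kHz. A minimal sketch for loading one with numpy (the filename is hypothetical; use any file produced by the loops above):

    import numpy as np

    x = np.fromfile('./combined_mic_16k/mic_F01_sa1.raw', dtype=np.int16)
    print(f'{x.shape[0]} samples = {x.shape[0]/16000:.2f} s')
    x = x / (2**15 - 1)    # scale to [-1, 1], matching the feature-computation convention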
162
dnn/torch/neural-pitch/training.py
Normal file
@@ -0,0 +1,162 @@
"""
|
||||
Training the neural pitch estimator
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('features_if', type=str, help='.f32 IF Features for training (generated by augmentation script)')
|
||||
parser.add_argument('features_xcorr', type=str, help='.f32 Xcorr Features for training (generated by augmentation script)')
|
||||
parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
|
||||
parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
|
||||
parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
|
||||
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
|
||||
parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
|
||||
parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
|
||||
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
|
||||
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
|
||||
parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
|
||||
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
|
||||
parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
|
||||
parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
|
||||
parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
|
||||
parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
|
||||
parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# import os
|
||||
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
|
||||
|
||||
# Fixing the seeds for reproducability
|
||||
import time
|
||||
np_seed = int(time.time())
|
||||
torch_seed = int(time.time())
|
||||
|
||||
import json
|
||||
import torch
|
||||
torch.manual_seed(torch_seed)
|
||||
import numpy as np
|
||||
np.random.seed(np_seed)
|
||||
from utils import count_parameters
|
||||
import tqdm
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from evaluation import rpa
|
||||
|
||||
# print(list(range(torch.cuda.device_count())))
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
# device = 'cpu'
|
||||
|
||||
from models import loader_joint as loader
|
||||
if args.data_format == 'if':
|
||||
from models import large_if_ccode as model
|
||||
pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim)
|
||||
elif args.data_format == 'xcorr':
|
||||
from models import large_xcorr as model
|
||||
pitch_nn = model(args.xcorr_dimension,args.gru_dim,args.output_dim)
|
||||
else:
|
||||
from models import large_joint as model
|
||||
pitch_nn = model(args.freq_keep*3,args.xcorr_dimension,args.gru_dim,args.output_dim)
|
||||
|
||||
dataset_training = loader(args.features_if,args.features_pitch,args.features_xcorr,args.confidence_threshold,args.context,args.data_format)
|
||||
|
||||
def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
|
||||
logits_softmax = torch.nn.Softmax(dim = 1)(logits).permute(0,2,1)
|
||||
labels_one_hot = torch.nn.functional.one_hot(labels.long(),nmax)
|
||||
|
||||
if choice == 'default':
|
||||
# Categorical Cross Entropy
|
||||
CE = -torch.sum(torch.log(logits_softmax*labels_one_hot + 1.0e-6)*labels_one_hot,dim=-1)
|
||||
CE = torch.sum(confidence*CE)
|
||||
|
||||
else:
|
||||
# Robust Cross Entropy
|
||||
CE = (1.0/q)*(1 - torch.sum(torch.pow(logits_softmax*labels_one_hot + 1.0e-7,q),dim=-1) )
|
||||
CE = torch.sum(confidence*CE)
|
||||
|
||||
return CE
|
||||
|
||||
# features = args.features
|
||||
# pitch = args.crepe_pitch
|
||||
# dataset_training = loader(features,pitch,args.confidence_threshold,args.freq_keep,args.context)
|
||||
# dataset_training = loader(features,pitch,'../../../../testing/testing_features_10pct_xcorr.f32')
|
||||
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05],generator=torch.Generator().manual_seed(torch_seed))
|
||||
|
||||
batch_size = 256
|
||||
train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
|
||||
test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
|
||||
|
||||
# pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim).to(device)
|
||||
pitch_nn = pitch_nn.to(device)
|
||||
num_params = count_parameters(pitch_nn)
|
||||
learning_rate = args.learning_rate
|
||||
model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate)
|
||||
|
||||
num_epochs = args.epochs
|
||||
|
||||
for epoch in range(num_epochs):
|
||||
losses = []
|
||||
pitch_nn.train()
|
||||
with tqdm.tqdm(train_dataloader) as train_epoch:
|
||||
for i, (xi, yi, ci) in enumerate(train_epoch):
|
||||
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
|
||||
pi = pitch_nn(xi.float())
|
||||
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
|
||||
|
||||
model_opt.zero_grad()
|
||||
loss.backward()
|
||||
model_opt.step()
|
||||
|
||||
losses.append(loss.item())
|
||||
avg_loss = np.mean(losses)
|
||||
train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss})
|
||||
|
||||
if epoch % 5 == 0:
|
||||
pitch_nn.eval()
|
||||
losses = []
|
||||
with tqdm.tqdm(test_dataloader) as test_epoch:
|
||||
for i, (xi, yi, ci) in enumerate(test_epoch):
|
||||
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
|
||||
pi = pitch_nn(xi.float())
|
||||
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
|
||||
losses.append(loss.item())
|
||||
avg_loss = np.mean(losses)
|
||||
test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss})
|
||||
|
||||
pitch_nn.eval()
|
||||
rpa(pitch_nn,device,data_format = args.data_format)
|
||||
|
||||
config = dict(
|
||||
data_format = args.data_format,
|
||||
epochs = num_epochs,
|
||||
window_size = args.N,
|
||||
hop_factor = args.H,
|
||||
freq_keep = args.freq_keep,
|
||||
batch_size = batch_size,
|
||||
learning_rate = learning_rate,
|
||||
confidence_threshold = args.confidence_threshold,
|
||||
model_parameters = num_params,
|
||||
np_seed = np_seed,
|
||||
torch_seed = torch_seed,
|
||||
xcorr_dim = args.xcorr_dimension,
|
||||
dim_input = 3*args.freq_keep,
|
||||
gru_dim = args.gru_dim,
|
||||
output_dim = args.output_dim,
|
||||
choice_cel = args.choice_cel,
|
||||
context = args.context,
|
||||
)
|
||||
|
||||
now = datetime.now()
|
||||
dir_pth_save = args.output_folder
|
||||
dir_network = dir_pth_save + str(now) + '_net_' + args.data_format + '.pth'
|
||||
dir_dictparams = dir_pth_save + str(now) + '_config_' + args.data_format + '.json'
|
||||
# Save Weights
|
||||
torch.save(pitch_nn.state_dict(), dir_network)
|
||||
# Save Config
|
||||
with open(dir_dictparams, 'w') as fp:
|
||||
json.dump(config, fp)
|
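The 'robust' branch of loss_custom above is the generalized cross-entropy form (1 - p^q)/q, which approaches the standard log loss as q -> 0 and, unlike it, stays bounded by 1/q when the true-class probability p is very small. A quick sketch exercising both branches on dummy data with the shapes used in training (logits (B, nmax, T), labels and confidence (B, T)); assumes loss_custom from above is in scope:

    import torch

    B, T, nmax = 4, 100, 192
    logits = torch.randn(B, nmax, T)
    labels = torch.randint(0, nmax, (B, T))
    confidence = torch.rand(B, T)

    print(loss_custom(logits, labels, confidence, choice='default', nmax=nmax))
    print(loss_custom(logits, labels, confidence, choice='robust', nmax=nmax))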
59
dnn/torch/neural-pitch/utils.py
Normal file
@@ -0,0 +1,59 @@
"""
|
||||
Utility functions that are commonly used
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy.signal import windows, lfilter
|
||||
from prettytable import PrettyTable
|
||||
|
||||
|
||||
# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
|
||||
def count_parameters(model):
|
||||
table = PrettyTable(["Modules", "Parameters"])
|
||||
total_params = 0
|
||||
for name, parameter in model.named_parameters():
|
||||
if not parameter.requires_grad: continue
|
||||
param = parameter.numel()
|
||||
table.add_row([name, param])
|
||||
total_params+=param
|
||||
print(table)
|
||||
print(f"Total Trainable Params: {total_params}")
|
||||
return total_params
|
||||
|
||||
def stft(x, w = 'boxcar', N = 320, H = 160):
|
||||
x = np.concatenate([x,np.zeros(N)])
|
||||
# win_custom = np.concatenate([windows.hann(80)[:40],np.ones(240),windows.hann(80)[40:]])
|
||||
return np.stack([np.fft.rfft(x[i:i + N]*windows.get_window(w,N)) for i in np.arange(0,x.shape[0]-N,H)])
|
||||
|
||||
def random_filter(x):
|
||||
# Randomly filter x with second order IIR filter with coefficients in between -3/8,3/8
|
||||
filter_coeff = np.random.uniform(low = -3.0/8, high = 3.0/8, size = 4)
|
||||
b = [1,filter_coeff[0],filter_coeff[1]]
|
||||
a = [1,filter_coeff[2],filter_coeff[3]]
|
||||
return lfilter(b,a,x)
|
||||
|
||||
def feature_xform(feature):
|
||||
"""
|
||||
Take as input the (N * 256) xcorr features output by LPCNet and perform the following
|
||||
1. Downsample and Upsample by 2 (followed by smoothing)
|
||||
2. Append positional embeddings (of dim k) coresponding to each xcorr lag
|
||||
"""
|
||||
|
||||
from scipy.signal import resample_poly, lfilter
|
||||
|
||||
|
||||
feature_US = lfilter([0.25,0.5,0.25],[1],resample_poly(feature,2,1,axis = 1),axis = 1)[:,:feature.shape[1]]
|
||||
feature_DS = lfilter([0.5,0.5],[1],resample_poly(feature,1,2,axis = 1),axis = 1)
|
||||
Z_append = np.zeros((feature.shape[0],feature.shape[1] - feature_DS.shape[1]))
|
||||
feature_DS = np.concatenate([feature_DS,Z_append],axis = -1)
|
||||
|
||||
# pos_embedding = []
|
||||
# for i in range(k):
|
||||
# pos_embedding.append(np.cos((2**i)*np.pi*((np.repeat(np.arange(feature.shape[1]).reshape(feature.shape[1],1),feature.shape[0],axis = 1)).T/(2*feature.shape[1]))))
|
||||
|
||||
# pos_embedding = np.stack(pos_embedding,axis = -1)
|
||||
|
||||
feature = np.stack((feature_DS,feature,feature_US),axis = -1)
|
||||
# feature = np.concatenate((feature,pos_embedding),axis = -1)
|
||||
|
||||
return feature
|
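A quick shape check for the two feature helpers above; a minimal sketch assuming the defaults in this file and that stft and feature_xform are in scope:

    import numpy as np

    # stft: one second of 16 kHz audio with N = 320, H = 160 -> 100 frames of N//2 + 1 = 161 rfft bins
    S = stft(np.zeros(16000))
    print(S.shape)    # (100, 161)

    # feature_xform: (num_frames, 256) xcorr -> (num_frames, 256, 3),
    # i.e. the same lags seen at half, full and double resolution
    out = feature_xform(np.random.rand(50, 256))
    print(out.shape)    # (50, 256, 3)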