Python code for neural pitch

This commit is contained in:
Krishna Subramani 2023-09-25 00:19:41 -04:00 committed by Jean-Marc Valin
parent d88dd89358
commit f38b4a317f
No known key found for this signature in database
GPG key ID: 531A52533318F00A
11 changed files with 1481 additions and 0 deletions


@@ -0,0 +1,18 @@
## Neural Pitch Estimation
- Dataset Installation
1. Download and unzip PTDB Dataset:
wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
unzip SPEECH_DATA_ZIPPED.zip
2. Inside the unzipped "SPEECH DATA" directory, run ptdb_process.sh to combine the male/female audio and pitch references
3. To download and combine the DEMAND noise dataset, run download_demand.sh
- LPCNet preparation
1. To extract the cross-correlation (xcorr) features, add lpcnet_extractor.c to the source tree, add the relevant functions to lpcnet_enc.c, list the new header/C files in Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor executable
- Dataset Augmentation and training (check the arguments of each of the following scripts; example invocations are sketched below)
1. Run data_augmentation.py
2. Run training.py using augmented data
3. Run experiments.py
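- Example invocations (file names and paths below are placeholders; see each script's --help for the full argument list). With output prefix ./augmented, the augmentation step writes augmented_iffeat.f32, augmented_xcorr.f32 and augmented_pitches.npy:
python3 data_augmentation.py input_speech.raw ./augmented ./lpcnet_xcorr_extractor ./combined_demand_channels/ --flag_xcorr
python3 training.py ./augmented_iffeat.f32 ./augmented_xcorr.f32 ./augmented_pitches.npy ./saved_models/ if
python3 experiments.py <ptdb_root>/ results.json model --pth_file ./saved_models/<timestamp>_net_if.pth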


@@ -0,0 +1,149 @@
"""
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
1. Read in chunks and compute clean pitch first
2. Then add in augmentation (Noise/Level/Response)
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
Notes: To ensure consistency with the discovered CREPE offset, we do the following
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
- We pad the input audio to our feature computation with 160 zeros to center them
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('data', type=str, help='input raw audio data')
parser.add_argument('output', type=str, help='output prefix for the augmented feature/pitch files')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='Location of the Demand Dataset')
parser.add_argument('--flag_xcorr', action='store_true', help='Flag to additionally dump xcorr features')
parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False)
parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (i.e. not augmented at all)',default = 0.2,required = False)
parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False)
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
args = parser.parse_args()
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
from utils import stft, random_filter
import numpy as np
import tqdm
import crepe
import random
import glob
import subprocess
data_full = np.memmap(args.data, dtype=np.int16,mode = 'r')
data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])]
# list_features = []
list_cents = []
list_confidences = []
N = args.N
H = args.H
freq_keep = args.freq_keep
# Minimum/Maximum periods, decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period
chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
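# Keep only the lowest freq_keep bins from each of the three feature blocks computed below
# (log-magnitude spectrum, real and imaginary parts of the normalized phase difference),
# i.e. 3*freq_keep = 90 input features per frame with the defaults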
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,257), mode='w+')
fraction_clean = args.fraction_clean
noise_dataset = args.noise_dataset
for i in tqdm.trange((data.shape[0]//chunk_size - 1)//1):
chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)
# Clean Pitch/Confidence Estimate
# Padding input to CREPE by 80 samples to ensure it aligns
_, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0)
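# Convert CREPE's pitch estimate (Hz) to cents relative to f_ref = 16000/max_period = 62.5 Hz;
# the 1.0e-8 keeps log2 finite on unvoiced frames where pitch is 0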
cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)
# Filter out of range pitches/confidences
confidence[pitch < 16000/max_period] = 0
confidence[pitch > 16000/min_period] = 0
# Keep fraction of data clean, augment only 1 minus the fraction
if (np.random.rand() > fraction_clean):
# Response, generate controlled/random 2nd order IIR filter and filter chunk
chunk = random_filter(chunk)
# Level/Gain response {scale by random gain between 1.0e-3 and 10}
# Generate random gain in dB and then convert to scale
g_dB = np.random.uniform(low = -60, high = 20, size = 1)
# g_dB = 0
g = 10**(g_dB/20)
# Noise Addition {Add random SNR 2nd order randomly colored noise}
# Generate noise SNR value and add corresponding noise
snr_dB = np.random.uniform(low = -20, high = 30, size = 1)
if args.choice_augment == 'synthetic':
n = np.random.randn(chunk_size)
else:
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0])) # 16000*60 samples are subtracted because the last minute of each noise file is reserved for testing
n = n[rand_range:rand_range + chunk.shape[0]]
# Randomly filter the sampled noise as well
n = random_filter(n)
# Pick a random prime Nprime (from the first 100 primes, i.e. up to 541) and zero out the last Nprime noise samples (to prevent the GRU from picking up temporal patterns in the noise)
Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
n[chunk_size - Nprime:] = np.zeros(Nprime)
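# Scale the noise so that the chunk-to-noise power ratio matches the target SNR:
# sum|chunk|^2 / sum|snr_multiplier*n|^2 = 10^(snr_dB/10)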
snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
chunk = g*(chunk + snr_multiplier*n)
# Zero pad input audio by 160 to center the frames
spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T
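# Per-bin phase advance between consecutive frames (an instantaneous-frequency estimate),
# normalized to unit magnitude so that only the phase information is kept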
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature = feature[:,list_indices_keep]
if args.flag_xcorr:
# Dump noisy audio into temp file
data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
# data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16)
subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp_augment.raw')
os.remove('./temp_augment_xcorr.f32')
num_frames = min(cent.shape[0],feature.shape[0],feature_xcorr.shape[0],num_frames_chunk)
feature = feature[:num_frames,:]
cent = cent[:num_frames]
confidence = confidence[:num_frames]
feature_xcorr = feature_xcorr[:num_frames]
output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature
output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr
list_cents.append(cent)
list_confidences.append(confidence)
list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)
np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences]))


@@ -0,0 +1,43 @@
wget https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip
wget https://zenodo.org/record/1227121/files/DLIVING_16k.zip
wget https://zenodo.org/record/1227121/files/DWASHING_16k.zip
wget https://zenodo.org/record/1227121/files/NFIELD_16k.zip
wget https://zenodo.org/record/1227121/files/NPARK_16k.zip
wget https://zenodo.org/record/1227121/files/NRIVER_16k.zip
wget https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip
wget https://zenodo.org/record/1227121/files/OMEETING_16k.zip
wget https://zenodo.org/record/1227121/files/OOFFICE_16k.zip
wget https://zenodo.org/record/1227121/files/PCAFETER_16k.zip
wget https://zenodo.org/record/1227121/files/PRESTO_16k.zip
wget https://zenodo.org/record/1227121/files/PSTATION_16k.zip
wget https://zenodo.org/record/1227121/files/TMETRO_16k.zip
wget https://zenodo.org/record/1227121/files/TCAR_16k.zip
wget https://zenodo.org/record/1227121/files/TBUS_16k.zip
wget https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip
wget https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip
unzip '*.zip'
mkdir -p ./combined_demand_channels/
for file in */*.wav; do
parentdir="$(dirname "$file")"
echo $parentdir
fname="$(basename "$file")"
cp "$file" "./combined_demand_channels/${parentdir}+${fname}"
done


@@ -0,0 +1,464 @@
"""
Evaluation script to compute the Raw Pitch Accuracy
Procedure:
- Look at all voiced frames in file
- Compute number of pitches in those frames that lie within a 50 cent threshold
RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed across all files)
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from prettytable import PrettyTable
import numpy as np
import glob
import random
import tqdm
import torch
import librosa
import json
from utils import stft, random_filter, feature_xform
import subprocess
import crepe
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
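# rca: number of voiced frames whose pitch estimate lies within `thresh` cents of the reference
# sweep_rca: same, but sweeps a small integer frame offset between the two tracks and keeps the best count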
def rca(reference,input,voicing,thresh = 25):
idx_voiced = np.where(voicing != 0)[0]
acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0]
return acc.shape[0]
def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)):
l = []
for i in ind_arr:
l.append(rca(reference,np.roll(input,i),voicing,thresh))
l = np.array(l)
return np.max(l)
def rpa(model,device = 'cpu',data_format = 'if'):
list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
# random_shuffle = list(np.random.permutation(len(list_files)))
random.shuffle(list_files)
list_files = list_files[:1000]
# C_lp = 0
# C_lp_m = 0
# C_lp_f = 0
# list_rca_model_lp = []
# list_rca_male_lp = []
# list_rca_female_lp = []
# C_hp = 0
# C_hp_m = 0
# C_hp_f = 0
# list_rca_model_hp = []
# list_rca_male_hp = []
# list_rca_female_hp = []
C_all = 0
C_all_m = 0
C_all_f = 0
list_rca_model_all = []
list_rca_male_all = []
list_rca_female_all = []
thresh = 50
N = 320
H = 160
freq_keep = 30
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
# feature_xcorr = feature_xform(feature_xcorr)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
if data_format == 'if':
feature = feature_if
elif data_format == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
# if (model == 'penn'):
# model_frequency, _ = penn.from_audio(
# torch.from_numpy(audio).unsqueeze(0).float(),
# 16000,
# hopsize=0.01,
# fmin=(16000.0/256),
# fmax=500,
# checkpoint=penn.DEFAULT_CHECKPOINT,
# batch_size=32,
# pad=True,
# interp_unvoiced_at=0.065,
# gpu=0)
# model_frequency = model_frequency.cpu().detach().squeeze().numpy()
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# elif (model == 'crepe'):
# _, model_frequency, _, _ = crepe.predict(audio, 16000, viterbi=vflag,center=True,verbose=0)
# lpcnet_file_name = '/home/ubuntu/Code/Datasets/SPEECH_DATA/lpcnet_f0_16k_residual/' + file_name + '_f0.f32'
# period_lpcnet = np.fromfile(lpcnet_file_name, dtype='float32')
# model_frequency = 16000/(period_lpcnet + 1.0e-6)
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# else:
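# The model outputs logits over 20-cent pitch bins; argmax*20 converts the winning bin back
# to cents (relative to 62.5 Hz)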
model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
# model_cents = np.roll(model_cents,-1*3)
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_model_all.append(rca(cent,model_cents,voicing_all,thresh))
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
if "mic_M" in audio_file:
# list_rca_male_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_male_all.append(rca(cent,model_cents,voicing_all,thresh))
C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
else:
# list_rca_female_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
list_rca_female_all.append(rca(cent,model_cents,voicing_all,thresh))
C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]
"""
# Low pitch estimation
voicing_lp = np.copy(voicing)
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 125)==True)
voicing_lp[force_out_of_pitch] = 0
C_lp = C_lp + np.where(voicing_lp != 0)[0].shape[0]
# list_rca_model_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_model_lp.append(rca(cent,model_cents,voicing_lp,thresh))
if "mic_M" in audio_file:
# list_rca_male_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_male_lp.append(rca(cent,model_cents,voicing_lp,thresh))
C_lp_m = C_lp_m + np.where(voicing_lp != 0)[0].shape[0]
else:
# list_rca_female_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
list_rca_female_lp.append(rca(cent,model_cents,voicing_lp,thresh))
C_lp_f = C_lp_f + np.where(voicing_lp != 0)[0].shape[0]
# High pitch estimation
voicing_hp = np.copy(voicing)
force_out_of_pitch = np.where(np.logical_or(pitch < 125,pitch > 500)==True)
voicing_hp[force_out_of_pitch] = 0
C_hp = C_hp + np.where(voicing_hp != 0)[0].shape[0]
# list_rca_model_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_model_hp.append(rca(cent,model_cents,voicing_hp,thresh))
if "mic_M" in audio_file:
# list_rca_male_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_male_hp.append(rca(cent,model_cents,voicing_hp,thresh))
C_hp_m = C_hp_m + np.where(voicing_hp != 0)[0].shape[0]
else:
# list_rca_female_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
list_rca_female_hp.append(rca(cent,model_cents,voicing_hp,thresh))
C_hp_f = C_hp_f + np.where(voicing_hp != 0)[0].shape[0]
# list_rca_model.append(acc_model)
# list_rca_crepe.append(acc_crepe)
# list_rca_lpcnet.append(acc_lpcnet)
# list_rca_penn.append(acc_penn)
"""
# list_rca_crepe = np.array(list_rca_crepe)
# list_rca_model_lp = np.array(list_rca_model_lp)
# list_rca_male_lp = np.array(list_rca_male_lp)
# list_rca_female_lp = np.array(list_rca_female_lp)
# list_rca_model_hp = np.array(list_rca_model_hp)
# list_rca_male_hp = np.array(list_rca_male_hp)
# list_rca_female_hp = np.array(list_rca_female_hp)
list_rca_model_all = np.array(list_rca_model_all)
list_rca_male_all = np.array(list_rca_male_all)
list_rca_female_all = np.array(list_rca_female_all)
# list_rca_lpcnet = np.array(list_rca_lpcnet)
# list_rca_penn = np.array(list_rca_penn)
x = PrettyTable()
x.field_names = ["Experiment", "Mean RPA"]
x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
# x.add_row(["Both low pitches", np.sum(list_rca_model_lp)/C_lp])
# x.add_row(["Both high pitches", np.sum(list_rca_model_hp)/C_hp])
x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
# x.add_row(["Male low pitches", np.sum(list_rca_male_lp)/C_lp_m])
# x.add_row(["Male high pitches", np.sum(list_rca_male_hp)/C_hp_m])
x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])
# x.add_row(["Female low pitches", np.sum(list_rca_female_lp)/C_lp_f])
# x.add_row(["Female high pitches", np.sum(list_rca_female_hp)/C_hp_f])
print(x)
return None
def cycle_eval(list_files_pth, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50):
"""
Cycle through SNR evaluation for list of .pth files
"""
# list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
# dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
# random_shuffle = list(np.random.permutation(len(list_files)))
list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
random.shuffle(list_files)
list_files = list_files[:(int)(fraction*len(list_files))]
# list_nfiles = ['DKITCHEN','NFIELD','OHALLWAY','PCAFETER','SPSQUARE','TCAR','DLIVING','NPARK','OMEETING','PRESTO','STRAFFIC','TMETRO','DWASHING','NRIVER','OOFFICE','PSTATION','TBUS']
dict_models = {}
list_snr.append(np.inf)
# thresh = 50
for f in list_files_pth:
if (f!='crepe') and (f!='lpcnet'):
fname = os.path.basename(f).split('_')[0] + '_' + os.path.basename(f).split('_')[-1][:-4]
config_path = os.path.dirname(f) + '/' + os.path.basename(f).split('_')[0] + '_' + 'config_' + os.path.basename(f).split('_')[-1][:-4] + '.json'
with open(config_path) as json_file:
dict_params = json.load(json_file)
if dict_params['data_format'] == 'if':
from models import large_if_ccode as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
elif dict_params['data_format'] == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
else:
from models import large_joint as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
pitch_nn.load_state_dict(torch.load(f))
N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']
list_mean = []
list_std = []
for snr_dB in list_snr:
C_all = 0
C_correct = 0
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = N,hop_length = H))
if noise_type != 'synthetic':
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
n = n[rand_range:rand_range + audio.shape[0]]
else:
n = np.random.randn(audio.shape[0])
n = random_filter(n)
snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
audio = audio + snr_multiplier*n
spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
# data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
if dict_params['data_format'] == 'if':
feature = feature_if
elif dict_params['data_format'] == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
# if os.path.basename(f) == 'crepe':
# elif (model == 'crepe'):
# _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
# model_cents = 1200*np.log2(model_frequency/(16000/256))
# else:
# else:
model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
# model_cents = np.roll(model_cents,-1*3)
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
list_mean.append(C_correct/C_all)
else:
fname = f
list_mean = []
list_std = []
for snr_dB in list_snr:
C_all = 0
C_correct = 0
for idx in tqdm.trange(len(list_files)):
audio_file = list_files[idx]
file_name = os.path.basename(list_files[idx])[:-4]
audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
offset = 432
audio = audio[offset:]
rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
if noise_type != 'synthetic':
list_noisefiles = noise_dataset + '*.wav'
noise_file = random.choice(glob.glob(list_noisefiles))
n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
n = n[rand_range:rand_range + audio.shape[0]]
else:
n = np.random.randn(audio.shape[0])
n = random_filter(n)
snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
audio = audio + snr_multiplier*n
if (f == 'crepe'):
_, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
else:
data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
# data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
model_cents = 1200*np.log2((256/feature_xcorr + 1.0e-8) + 1.0e-8)
os.remove('./temp.raw')
os.remove('./temp_xcorr.f32')
os.remove('./temp_period.f32')
pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
pitch = np.loadtxt(pitch_file_name)[:,0]
voicing = np.loadtxt(pitch_file_name)[:,1]
indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
pitch = pitch[:indmin]
voicing = voicing[:indmin]
rmse = rmse[:indmin]
voicing = voicing*(rmse > 0.05*np.max(rmse))
if "mic_F" in audio_file:
idx_correct = np.where(pitch < 125)
voicing[idx_correct] = 0
cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
num_frames = min(cent.shape[0],model_cents.shape[0])
pitch = pitch[:num_frames]
cent = cent[:num_frames]
voicing = voicing[:num_frames]
model_cents = model_cents[:num_frames]
voicing_all = np.copy(voicing)
# Force frames whose reference pitch is below 65 Hz or above 500 Hz to unvoiced, so the comparison stays within the pitch range our model targets
force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
voicing_all[force_out_of_pitch] = 0
C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
# list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
# list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
list_mean.append(C_correct/C_all)
dict_models[fname] = {}
dict_models[fname]['list_SNR'] = list_mean[:-1]
dict_models[fname]['inf'] = list_mean[-1]
return dict_models


@@ -0,0 +1,38 @@
"""
Running the experiments:
1. RCA vs SNR for our models, CREPE, LPCNet
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
parser.add_argument('output', type=str, help='Output dump file name')
parser.add_argument('method', type=str, help='Pitch estimation method to evaluate',choices=['model','lpcnet','crepe'])
parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Dataset',default = './',required=False)
parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
args = parser.parse_args()
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
import json
from evaluation import cycle_eval
if args.method == 'model':
dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
else:
dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
dict_store["method"] = args.method
if args.method == 'model':
dict_store['pth'] = args.pth_file
with open(args.output, 'w') as fp:
json.dump(dict_store, fp)


@@ -0,0 +1,89 @@
"""
/* Copyright (c) 2022 Amazon
Written by Jan Buethe */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
"""
import os
import argparse
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint', type=str, help='neural pitch model checkpoint')
parser.add_argument('output_dir', type=str, help='output folder')
args = parser.parse_args()
import torch
import numpy as np
from models import large_if_ccode
from wexchange.torch import dump_torch_weights
from wexchange.c_export import CWriter, print_vector
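# c_export: dump the dense layers (initial, upsample) and the GRU of the large_if_ccode pitch
# model as C tables using the weight-exchange utilities (CWriter / dump_torch_weights)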
def c_export(args, model):
message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
enc_writer = CWriter(os.path.join(args.output_dir, "neural_pitch_data"), message=message, model_struct_name='nnpitch')
enc_writer.header.write(
f"""
#include "opus_types.h"
"""
)
# encoder
encoder_dense_layers = [
('initial' , 'initial', 'TANH'),
('upsample' , 'upsample', 'TANH')
]
for name, export_name, _ in encoder_dense_layers:
layer = model.get_submodule(name)
dump_torch_weights(enc_writer, layer, name=export_name, verbose=True)
encoder_gru_layers = [
('gru' , 'gru', 'TANH'),
]
enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, verbose=True, input_sparse=False, quantize=False)
for name, export_name, _ in encoder_gru_layers])
del enc_writer
if __name__ == "__main__":
os.makedirs(args.output_dir, exist_ok=True)
model = large_if_ccode()
model.load_state_dict(torch.load(args.checkpoint,map_location='cpu'))
c_export(args, model)


@@ -0,0 +1,218 @@
"""
Pitch Estimation Models and dataloaders
- Classification Based (Input features, output logits)
"""
import torch
import numpy as np
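# The three models below take a feature sequence of shape (batch, time, input_dim) and return
# pitch logits of shape (batch, output_dim, time); output_dim (192 by default) indexes 20-cent
# pitch bins (see the dataloaders below and the training script)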
class large_if_ccode(torch.nn.Module):
def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
super(large_if_ccode,self).__init__()
self.activation = torch.nn.Tanh()
self.initial = torch.nn.Linear(input_dim,gru_dim)
self.hidden = torch.nn.Linear(gru_dim,gru_dim)
self.gru = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,batch_first = True)
self.upsample = torch.nn.Linear(gru_dim,output_dim)
def forward(self, x):
x = self.initial(x)
x = self.activation(x)
x = self.hidden(x)
x = self.activation(x)
x,_ = self.gru(x)
x = self.upsample(x)
x = self.activation(x)
x = x.permute(0,2,1)
return x
class large_xcorr(torch.nn.Module):
def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
super(large_xcorr,self).__init__()
self.activation = torch.nn.Tanh()
self.conv = torch.nn.Sequential(
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(1, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 1, 3, bias = True),
self.activation,
)
# self.conv = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(64,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,64,3),
# self.activation,
# )
self.downsample = torch.nn.Sequential(
torch.nn.Linear(input_dim,gru_dim),
self.activation
)
self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
self.upsample = torch.nn.Sequential(
torch.nn.Linear(gru_dim,output_dim),
self.activation
)
def forward(self, x):
# x = x[:,:,:257].unsqueeze(-1)
x = self.conv(x.unsqueeze(-1).permute(0,3,2,1)).squeeze(1)
# print(x.shape)
# x = self.conv(x.permute(0,3,2,1)).squeeze(1)
x,_ = self.GRU(self.downsample(x.permute(0,2,1)))
x = self.upsample(x).permute(0,2,1)
# x = self.downsample(x)
# x = self.activation(x)
# x = self.conv(x.permute(0,2,1)).permute(0,2,1)
# x,_ = self.GRU(x)
# x = self.upsample(x).permute(0,2,1)
return x
class large_joint(torch.nn.Module):
"""
Joint IF-xcorr
1D CNN on IF, merge with xcorr, 2D CNN on merged + GRU
"""
def __init__(self,input_IF_dim = 90,input_xcorr_dim = 257,gru_dim = 64,output_dim = 192):
super(large_joint,self).__init__()
self.activation = torch.nn.Tanh()
self.if_upsample = torch.nn.Sequential(
torch.nn.Linear(input_IF_dim,64),
self.activation,
torch.nn.Linear(64,64),
self.activation,
)
# self.if_upsample = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(90,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,257,3),
# self.activation,
# )
self.conv = torch.nn.Sequential(
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(1, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 8, 3, bias = True),
self.activation,
torch.nn.ZeroPad2d((2,0,1,1)),
torch.nn.Conv2d(8, 1, 3, bias = True),
self.activation,
)
# self.conv = torch.nn.Sequential(
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(257,10,3),
# self.activation,
# torch.nn.ConstantPad1d((2,0),0),
# torch.nn.Conv1d(10,64,3),
# self.activation,
# )
self.downsample = torch.nn.Sequential(
torch.nn.Linear(64 + input_xcorr_dim,gru_dim),
self.activation
)
self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
self.upsample = torch.nn.Sequential(
torch.nn.Linear(gru_dim,output_dim),
self.activation
)
def forward(self, x):
xcorr_feat = x[:,:,:257]
if_feat = x[:,:,257:]
# x = torch.cat([xcorr_feat.unsqueeze(-1),self.if_upsample(if_feat).unsqueeze(-1)],axis = -1)
xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0,3,2,1)).squeeze(1).permute(0,2,1)
if_feat = self.if_upsample(if_feat)
x = torch.cat([xcorr_feat,if_feat],axis = - 1)
# x = self.conv(x.permute(0,3,2,1)).squeeze(1)
x,_ = self.GRU(self.downsample(x))
x = self.upsample(x).permute(0,2,1)
return x
# Dataloaders
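# Targets are the CREPE pitch quantized to 20-cent bins (clipped to [0, 179]) plus the CREPE
# confidence, which is used as a per-frame weight in the training loss; frames below the
# confidence threshold get zero weight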
class loader(torch.utils.data.Dataset):
def __init__(self, features_if, file_pitch,confidence_threshold = 0.4,dimension_if = 30,context = 100):
self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,3*dimension_if)
# Resolution of 20 cents
self.cents = np.rint(np.load(file_pitch)[0,:]/20)
self.cents = np.clip(self.cents,0,179)
self.confidence = np.load(file_pitch)[1,:]
# Filter confidence for CREPE
self.confidence[self.confidence < confidence_threshold] = 0
self.context = context
# Clip both to same size
size_common = min(self.if_feat.shape[0],self.cents.shape[0])
self.if_feat = self.if_feat[:size_common,:]
self.cents = self.cents[:size_common]
self.confidence = self.confidence[:size_common]
frame_max = self.if_feat.shape[0]//context
self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,3*dimension_if))
self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
def __len__(self):
return self.if_feat.shape[0]
def __getitem__(self, index):
return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
class loader_joint(torch.utils.data.Dataset):
def __init__(self, features_if, file_pitch, features_xcorr,confidence_threshold = 0.4,context = 100, choice_data = 'both'):
self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,90)
self.xcorr = np.memmap(features_xcorr, dtype=np.float32).reshape(-1,257)
self.cents = np.rint(np.load(file_pitch)[0,:]/20)
self.cents = np.clip(self.cents,0,179)
self.confidence = np.load(file_pitch)[1,:]
# Filter confidence for CREPE
self.confidence[self.confidence < confidence_threshold] = 0
self.context = context
self.choice_data = choice_data
frame_max = self.if_feat.shape[0]//context
self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,90))
self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
self.xcorr = np.reshape(self.xcorr[:frame_max*context,:],(frame_max,context,257))
# self.cents = np.rint(60*np.log2(256/(self.periods + 1.0e-8))).astype('int')
# self.cents = np.clip(self.cents,0,239)
self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
# print(self.if_feat.shape)
def __len__(self):
return self.if_feat.shape[0]
def __getitem__(self, index):
if self.choice_data == 'both':
return torch.cat([torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.if_feat[index,:,:])],dim=-1),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
elif self.choice_data == 'if':
return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
else:
return torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])


@@ -0,0 +1,207 @@
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('features', type=str, help='Features generated from dump_data')
parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)')
parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch')
parser.add_argument('pth_file', type=str, help='.pth file to use for pitch')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('--device', type=str, help='compute device',default = None,required = False)
parser.add_argument('--replace_xcorr', action='store_true', help='Replace LPCNet xcorr with updated one')
args = parser.parse_args()
import os
from utils import stft, random_filter
import subprocess
import numpy as np
import json
import torch
import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if args.device is not None:
device = torch.device(args.device)
# Loading the appropriate model
config_path = os.path.dirname(args.pth_file) + '/' + os.path.basename(args.pth_file).split('_')[0] + '_' + 'config_' + os.path.basename(args.pth_file).split('_')[-1][:-4] + '.json'
with open(config_path) as json_file:
dict_params = json.load(json_file)
if dict_params['data_format'] == 'if':
from models import large_if_ccode as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
elif dict_params['data_format'] == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
else:
from models import large_joint as model
pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
pitch_nn.load_state_dict(torch.load(args.pth_file))
pitch_nn = pitch_nn.to(device)
N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']
# import os
# import argparse
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["OMP_NUM_THREADS"] = "16"
# parser = argparse.ArgumentParser()
# parser.add_argument('features', type=str, help='input features')
# parser.add_argument('data', type=str, help='input data')
# parser.add_argument('output', type=str, help='output features')
# parser.add_argument('--add-confidence', action='store_true', help='add CREPE confidence to features')
# parser.add_argument('--viterbi', action='store_true', help='enable viterbi algo for pitch tracking')
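# run_lpc: run the per-frame LPC analysis filters over the signal and return the LPC prediction
# and the prediction error (residual); the residual is used further down to recompute the xcorr
# feature at the predicted pitch lag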
def run_lpc(signal, lpcs, frame_length=160):
num_frames, lpc_order = lpcs.shape
prediction = np.concatenate(
[- np.convolve(signal[i * frame_length : (i + 1) * frame_length + lpc_order - 1], lpcs[i], mode='valid') for i in range(num_frames)]
)
error = signal[lpc_order :] - prediction
return prediction, error
if __name__ == "__main__":
args = parser.parse_args()
features = np.memmap(args.features, dtype=np.float32,mode = 'r').reshape((-1, 36))
data = np.memmap(args.data, dtype=np.int16,mode = 'r').reshape((-1, 2))
num_frames = features.shape[0]
feature_dim = features.shape[1]
assert feature_dim == 36
# if args.add_confidence:
# feature_dim += 1
output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
output[:, :36] = features
# lpc coefficients and signal
lpcs = features[:, 20:36]
sig = data[:, 1]
# parameters
# use_viterbi=args.viterbi
# constants
pitch_min = 32
pitch_max = 256
lpc_order = 16
fs = 16000
frame_length = 160
overlap_frames = 100
chunk_size = 10000
history_length = frame_length * overlap_frames
history = np.zeros(history_length, dtype=np.int16)
pitch_position=18
xcorr_position=19
conf_position=36
num_frames = len(sig) // 160 - 1
frame_start = 0
frame_stop = min(frame_start + chunk_size, num_frames)
signal_start = 0
signal_stop = frame_stop * frame_length
niters = (num_frames - 1)//chunk_size
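# Process the signal in chunks of chunk_size frames; overlap_frames frames of history are
# prepended to every chunk so the GRU and the LPC/xcorr lookback have context, and only the
# non-overlapping part of each chunk's output is written back to the feature file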
for i in tqdm.trange(niters):
if (frame_start > num_frames - 1):
break
chunk = np.concatenate((history, sig[signal_start:signal_stop]))
chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))
# time, frequency, confidence, _ = crepe.predict(chunk, fs, center=True, viterbi=True,verbose=0)
# Feature computation
spec = stft(x = np.concatenate([np.zeros(80),chunk_la/(2**15 - 1)]), w = 'boxcar', N = N, H = H).T
phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
feature_if = feature[:,idx_save]
data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)
subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw'])
feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw')
os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw')
if dict_params['data_format'] == 'if':
feature = feature_if
elif dict_params['data_format'] == 'xcorr':
feature = feature_xcorr
else:
indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
# Compute pitch with my model
model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
frequency = 62.5*2**(model_cents/1200)
frequency = frequency[overlap_frames : overlap_frames + frame_stop - frame_start]
# confidence = confidence[overlap_frames : overlap_frames + frame_stop - frame_start]
# convert frequencies to periods
periods = np.round(fs / frequency)
# adjust to pitch range
# confidence[periods < pitch_min] = 0
# confidence[periods > pitch_max] = 0
periods = np.clip(periods, pitch_min, pitch_max)
output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50
# if args.replace_xcorr:
# re-calculate xcorr
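# Normalized cross-correlation of the LPC residual at the predicted pitch lag p:
# <f1, f2> / sqrt(<f1, f1> <f2, f2>), where f1 is the current residual frame and f2 is the
# residual delayed by p samples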
frame_offset = (pitch_max + frame_length - 1) // frame_length
offset = frame_offset * frame_length
padding = lpc_order
if frame_start < frame_offset:
lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
else:
lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop]
pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length)
xcorr = np.zeros(frame_stop - frame_start)
for j, p in enumerate(periods.astype(np.int16)):
if p > 0:
f1 = error[offset + j * frame_length : offset + (j + 1) * frame_length]
f2 = error[offset + j * frame_length - p : offset + (j + 1) * frame_length - p]
xcorr[j] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)
output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5
# update buffers and indices
history = chunk[-history_length :]
frame_start += chunk_size
frame_stop += chunk_size
frame_stop = min(frame_stop, num_frames)
signal_start = frame_start * frame_length
signal_stop = frame_stop * frame_length


@@ -0,0 +1,34 @@
# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories
# Make folder for combined audio
mkdir -p './combined_mic_16k/'
# Make folder for combined pitch reference
mkdir -p './combined_reference_f0/'
# Resample Male Audio
for i in ./MALE/MIC/**/*.wav; do
j="$(basename "$i" .wav)"
echo $j
sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
done
# Resample Female Audio
for i in ./FEMALE/MIC/**/*.wav; do
j="$(basename "$i" .wav)"
echo $j
sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
done
# Copy Male reference pitch files
for i in ./MALE/REF/**/*.f0; do
j="$(basename "$i" .f0)"
echo $j
cp "$i" ./combined_reference_f0/
done
# Copy Female reference pitch files
for i in ./FEMALE/REF/**/*.f0; do
j="$(basename "$i" .f0)"
echo $j
cp "$i" ./combined_reference_f0/
done


@@ -0,0 +1,162 @@
"""
Training the neural pitch estimator
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('features_if', type=str, help='.f32 IF Features for training (generated by augmentation script)')
parser.add_argument('features_xcorr', type=str, help='.f32 Xcorr Features for training (generated by augmentation script)')
parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
args = parser.parse_args()
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
# Seed numpy/torch from the current time and record both seeds in the saved config for reproducibility
import time
np_seed = int(time.time())
torch_seed = int(time.time())
import json
import torch
torch.manual_seed(torch_seed)
import numpy as np
np.random.seed(np_seed)
from utils import count_parameters
import tqdm
import sys
from datetime import datetime
from evaluation import rpa
# print(list(range(torch.cuda.device_count())))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
from models import loader_joint as loader
if args.data_format == 'if':
from models import large_if_ccode as model
pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim)
elif args.data_format == 'xcorr':
from models import large_xcorr as model
pitch_nn = model(args.xcorr_dimension,args.gru_dim,args.output_dim)
else:
from models import large_joint as model
pitch_nn = model(args.freq_keep*3,args.xcorr_dimension,args.gru_dim,args.output_dim)
dataset_training = loader(args.features_if,args.features_pitch,args.features_xcorr,args.confidence_threshold,args.context,args.data_format)
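# Confidence-weighted cross entropy over the pitch bins: 'default' is standard categorical cross
# entropy; 'robust' is the generalized (Lq) cross entropy, approximately (1 - p_y^q)/q with q = 0.7,
# which is less sensitive to noisy CREPE labels; both are weighted per frame by the CREPE confidence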
def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
logits_softmax = torch.nn.Softmax(dim = 1)(logits).permute(0,2,1)
labels_one_hot = torch.nn.functional.one_hot(labels.long(),nmax)
if choice == 'default':
# Categorical Cross Entropy
CE = -torch.sum(torch.log(logits_softmax*labels_one_hot + 1.0e-6)*labels_one_hot,dim=-1)
CE = torch.sum(confidence*CE)
else:
# Robust Cross Entropy
CE = (1.0/q)*(1 - torch.sum(torch.pow(logits_softmax*labels_one_hot + 1.0e-7,q),dim=-1) )
CE = torch.sum(confidence*CE)
return CE
# features = args.features
# pitch = args.crepe_pitch
# dataset_training = loader(features,pitch,args.confidence_threshold,args.freq_keep,args.context)
# dataset_training = loader(features,pitch,'../../../../testing/testing_features_10pct_xcorr.f32')
train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05],generator=torch.Generator().manual_seed(torch_seed))
batch_size = 256
train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
# pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim).to(device)
pitch_nn = pitch_nn.to(device)
num_params = count_parameters(pitch_nn)
learning_rate = args.learning_rate
model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate)
num_epochs = args.epochs
for epoch in range(num_epochs):
losses = []
pitch_nn.train()
with tqdm.tqdm(train_dataloader) as train_epoch:
for i, (xi, yi, ci) in enumerate(train_epoch):
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
pi = pitch_nn(xi.float())
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
model_opt.zero_grad()
loss.backward()
model_opt.step()
losses.append(loss.item())
avg_loss = np.mean(losses)
train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss})
if epoch % 5 == 0:
pitch_nn.eval()
losses = []
with tqdm.tqdm(test_dataloader) as test_epoch:
for i, (xi, yi, ci) in enumerate(test_epoch):
yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
pi = pitch_nn(xi.float())
loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
losses.append(loss.item())
avg_loss = np.mean(losses)
test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss})
pitch_nn.eval()
rpa(pitch_nn,device,data_format = args.data_format)
config = dict(
data_format = args.data_format,
epochs = num_epochs,
window_size = args.N,
hop_factor = args.H,
freq_keep = args.freq_keep,
batch_size = batch_size,
learning_rate = learning_rate,
confidence_threshold = args.confidence_threshold,
model_parameters = num_params,
np_seed = np_seed,
torch_seed = torch_seed,
xcorr_dim = args.xcorr_dimension,
dim_input = 3*args.freq_keep,
gru_dim = args.gru_dim,
output_dim = args.output_dim,
choice_cel = args.choice_cel,
context = args.context,
)
now = datetime.now()
dir_pth_save = args.output_folder
dir_network = dir_pth_save + str(now) + '_net_' + args.data_format + '.pth'
dir_dictparams = dir_pth_save + str(now) + '_config_' + args.data_format + '.json'
# Save Weights
torch.save(pitch_nn.state_dict(), dir_network)
# Save Config
with open(dir_dictparams, 'w') as fp:
json.dump(config, fp)


@@ -0,0 +1,59 @@
"""
Utility functions that are commonly used
"""
import numpy as np
from scipy.signal import windows, lfilter
from prettytable import PrettyTable
# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
def count_parameters(model):
table = PrettyTable(["Modules", "Parameters"])
total_params = 0
for name, parameter in model.named_parameters():
if not parameter.requires_grad: continue
param = parameter.numel()
table.add_row([name, param])
total_params+=param
print(table)
print(f"Total Trainable Params: {total_params}")
return total_params
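# stft: zero-pad the signal, frame it with window w (length N, hop H), and return the rfft of
# each frame; output shape is (num_frames, N//2 + 1)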
def stft(x, w = 'boxcar', N = 320, H = 160):
x = np.concatenate([x,np.zeros(N)])
# win_custom = np.concatenate([windows.hann(80)[:40],np.ones(240),windows.hann(80)[40:]])
return np.stack([np.fft.rfft(x[i:i + N]*windows.get_window(w,N)) for i in np.arange(0,x.shape[0]-N,H)])
def random_filter(x):
# Randomly filter x with second order IIR filter with coefficients in between -3/8,3/8
filter_coeff = np.random.uniform(low = -3.0/8, high = 3.0/8, size = 4)
b = [1,filter_coeff[0],filter_coeff[1]]
a = [1,filter_coeff[2],filter_coeff[3]]
return lfilter(b,a,x)
def feature_xform(feature):
"""
Take as input the (N * 256) xcorr features output by LPCNet and perform the following
1. Downsample and upsample by 2 (followed by smoothing), then stack with the original along a new last axis
2. Append positional embeddings (of dim k) corresponding to each xcorr lag (currently commented out)
"""
from scipy.signal import resample_poly, lfilter
feature_US = lfilter([0.25,0.5,0.25],[1],resample_poly(feature,2,1,axis = 1),axis = 1)[:,:feature.shape[1]]
feature_DS = lfilter([0.5,0.5],[1],resample_poly(feature,1,2,axis = 1),axis = 1)
Z_append = np.zeros((feature.shape[0],feature.shape[1] - feature_DS.shape[1]))
feature_DS = np.concatenate([feature_DS,Z_append],axis = -1)
# pos_embedding = []
# for i in range(k):
# pos_embedding.append(np.cos((2**i)*np.pi*((np.repeat(np.arange(feature.shape[1]).reshape(feature.shape[1],1),feature.shape[0],axis = 1)).T/(2*feature.shape[1]))))
# pos_embedding = np.stack(pos_embedding,axis = -1)
feature = np.stack((feature_DS,feature,feature_US),axis = -1)
# feature = np.concatenate((feature,pos_embedding),axis = -1)
return feature