add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/README.md
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/README.md
@@ -0,0 +1,18 @@
+## Neural Pitch Estimation
+
+- Dataset Installation
+    1. Download and unzip PTDB Dataset:
+        wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
+        unzip SPEECH_DATA_ZIPPED.zip
+
+    2. Inside the extracted "SPEECH DATA" directory, run ptdb_process.sh to combine the male and female data
+
+    3. To download and combine the DEMAND noise dataset, simply run download_demand.sh
+
+- LPCNet preparation
+    1. To extract xcorr, add lpcnet_extractor.c and add relevant functions to lpcnet_enc.c, add source for headers/c files and Makefile.am, and compile to generate ./lpcnet_xcorr_extractor object
+
+- Dataset Augmentation and training (check out arguments to each of the following)
+    1. Run data_augmentation.py
+    2. Run training.py using augmented data
+    3. Run experiments.py
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/data_augmentation.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/data_augmentation.py
@@ -0,0 +1,149 @@
+"""
+Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
+1. Read in chunks and compute clean pitch first
+2. Then add in augmentation (Noise/Level/Response)
+    - Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
+    - When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
+3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
+
+Notes: To ensure consistency with the discovered CREPE offset, we do the following
+- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
+- We pad the input audio to our feature computation with 160 zeros to center them
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('data', type=str, help='input raw audio data')
+parser.add_argument('output', type=str, help='output directory')
+parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
+parser.add_argument('noise_dataset', type=str, help='Location of the Demand Datset')
+parser.add_argument('--flag_xcorr', type=bool, help='Flag to additionally dump xcorr features',choices=[True,False],default = False,required = False)
+parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False)
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False)
+parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (that is not augment with anything)',default = 0.2,required = False)
+parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False)
+parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
+parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
+parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
+
+args = parser.parse_args()
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+from utils import stft, random_filter
+
+import numpy as np
+import tqdm
+import crepe
+import random
+import glob
+import subprocess
+
+data_full = np.memmap(args.data, dtype=np.int16,mode = 'r')
+data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])]
+
+# list_features = []
+list_cents = []
+list_confidences = []
+
+N = args.N
+H = args.H
+freq_keep = args.freq_keep
+# Minimum/Maximum periods, decided by LPCNet
+min_period = 32
+max_period = 256
+f_ref = 16000/max_period
+chunk_size = args.chunk_size
+num_frames_chunk = chunk_size//H
+list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
+
+output_IF  = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,list_indices_keep.shape[0]), mode='w+')
+if args.flag_xcorr:
+    output_xcorr  = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,257), mode='w+')
+
+fraction_clean = args.fraction_clean
+
+noise_dataset = args.noise_dataset
+
+for i in tqdm.trange((data.shape[0]//chunk_size - 1)//1):
+    chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)
+
+    # Clean Pitch/Confidence Estimate
+    # Padding input to CREPE by 80 samples to ensure it aligns
+    _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0)
+    cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)
+
+    # Filter out of range pitches/confidences
+    confidence[pitch < 16000/max_period] = 0
+    confidence[pitch > 16000/min_period] = 0
+
+    # Keep fraction of data clean, augment only 1 minus the fraction
+    if (np.random.rand() > fraction_clean):
+        # Response, generate controlled/random 2nd order IIR filter and filter chunk
+        chunk = random_filter(chunk)
+
+        # Level/Gain response {scale by random gain between 1.0e-3 and 10}
+        # Generate random gain in dB and then convert to scale
+        g_dB = np.random.uniform(low =  -60, high = 20, size = 1)
+        # g_dB = 0
+        g = 10**(g_dB/20)
+
+        # Noise Addition {Add random SNR 2nd order randomly colored noise}
+        # Generate noise SNR value and add corresponding noise
+        snr_dB = np.random.uniform(low =  -20, high = 30, size = 1)
+
+        if args.choice_augment == 'synthetic':
+            n = np.random.randn(chunk_size)
+        else:
+            list_noisefiles = noise_dataset + '*.wav'
+            noise_file = random.choice(glob.glob(list_noisefiles))
+            n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+            rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0])) # 16000 is subtracted because we will use the last 1 minutes of noise for testing
+            n = n[rand_range:rand_range + chunk.shape[0]]
+
+        # Randomly filter the sampled noise as well
+        n = random_filter(n)
+        # generate random prime number between 0,500 and make those samples of noise 0 (to prevent GRU from picking up temporal patterns)
+        Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
+        n[chunk_size - Nprime:] = np.zeros(Nprime)
+        snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+
+        chunk = g*(chunk + snr_multiplier*n)
+
+    # Zero pad input audio by 160 to center the frames
+    spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T
+    phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+    feature = feature[:,list_indices_keep]
+
+    if args.flag_xcorr:
+        # Dump noisy audio into temp file
+        data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
+        # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
+        data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16)
+
+        subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
+        feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+        feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+        os.remove('./temp_augment.raw')
+        os.remove('./temp_augment_xcorr.f32')
+    num_frames = min(cent.shape[0],feature.shape[0],feature_xcorr.shape[0],num_frames_chunk)
+    feature = feature[:num_frames,:]
+    cent = cent[:num_frames]
+    confidence = confidence[:num_frames]
+    feature_xcorr = feature_xcorr[:num_frames]
+    output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature
+    output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr
+    list_cents.append(cent)
+    list_confidences.append(confidence)
+
+list_cents = np.hstack(list_cents)
+list_confidences = np.hstack(list_confidences)
+
+np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences]))
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/download_demand.sh
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/download_demand.sh
@@ -0,0 +1,43 @@
+# Download the 16 kHz archive of every DEMAND noise environment from Zenodo.
+for env in DKITCHEN DLIVING DWASHING NFIELD NPARK NRIVER OHALLWAY OMEETING \
+           OOFFICE PCAFETER PRESTO PSTATION TMETRO TCAR TBUS STRAFFIC SPSQUARE; do
+    wget "https://zenodo.org/record/1227121/files/${env}_16k.zip"
+done
+
+unzip '*.zip'
+
+# Copy every <directory>/<file>.wav into a single folder, prefixing the file
+# name with its directory so that names stay unique.
+mkdir -p ./combined_demand_channels/
+for file in */*.wav; do
+    parentdir="$(dirname "$file")"
+    # Do not re-copy files that are already in the output folder (script re-run)
+    if [ "$parentdir" = "combined_demand_channels" ]; then
+        continue
+    fi
+    echo "$parentdir"
+    fname="$(basename "$file")"
+    # Quote the expansions so that paths containing spaces are copied intact
+    cp "$file" "./combined_demand_channels/$parentdir+$fname"
+done
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/evaluation.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/evaluation.py
@@ -0,0 +1,349 @@
+"""
+Evaluation script to compute the Raw Pitch Accuracy
+Procedure:
+    - Look at all voiced frames in file
+    - Compute number of pitches in those frames that lie within a 50 cent threshold
+    RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed across all files)
+"""
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from prettytable import PrettyTable
+import numpy as np
+import glob
+import random
+import tqdm
+import torch
+import librosa
+import json
+from utils import stft, random_filter, feature_xform
+import subprocess
+import crepe
+
+from models import PitchDNN, PitchDNNIF, PitchDNNXcorr
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def rca(reference,input,voicing,thresh = 25):
+    idx_voiced = np.where(voicing != 0)[0]
+    acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0]
+    return acc.shape[0]
+
+def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)):
+    l = []
+    for i in ind_arr:
+        l.append(rca(reference,np.roll(input,i),voicing,thresh))
+    l = np.array(l)
+
+    return np.max(l)
+
+def rpa(model,device = 'cpu',data_format = 'if'):
+    list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
+    dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
+    # random_shuffle = list(np.random.permutation(len(list_files)))
+    random.shuffle(list_files)
+    list_files = list_files[:1000]
+
+    C_all = 0
+    C_all_m = 0
+    C_all_f = 0
+    list_rca_model_all = []
+    list_rca_male_all = []
+    list_rca_female_all = []
+
+    thresh = 50
+    N = 320
+    H = 160
+    freq_keep = 30
+
+    for idx in tqdm.trange(len(list_files)):
+        audio_file = list_files[idx]
+        file_name = os.path.basename(list_files[idx])[:-4]
+
+        audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+        offset = 432
+        audio = audio[offset:]
+        rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
+
+        spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
+        phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+        idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+        feature_if = feature[:,idx_save]
+
+        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+        data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+
+        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
+        feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+        feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+        # feature_xcorr = feature_xform(feature_xcorr)
+
+        os.remove('./temp.raw')
+        os.remove('./temp_xcorr.f32')
+
+        if data_format == 'if':
+            feature = feature_if
+        elif data_format == 'xcorr':
+            feature = feature_xcorr
+        else:
+            indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+            feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+
+        pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+        pitch = np.loadtxt(pitch_file_name)[:,0]
+        voicing = np.loadtxt(pitch_file_name)[:,1]
+        indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+        pitch = pitch[:indmin]
+        voicing = voicing[:indmin]
+        rmse = rmse[:indmin]
+        voicing = voicing*(rmse > 0.05*np.max(rmse))
+        if "mic_F" in audio_file:
+            idx_correct = np.where(pitch < 125)
+            voicing[idx_correct] = 0
+
+        cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+
+
+        model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+
+        num_frames = min(cent.shape[0],model_cents.shape[0])
+        pitch = pitch[:num_frames]
+        cent = cent[:num_frames]
+        voicing = voicing[:num_frames]
+        model_cents = model_cents[:num_frames]
+
+        voicing_all = np.copy(voicing)
+        # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model
+        force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+        voicing_all[force_out_of_pitch] = 0
+        C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+        list_rca_model_all.append(rca(cent,model_cents,voicing_all,thresh))
+
+        if "mic_M" in audio_file:
+            list_rca_male_all.append(rca(cent,model_cents,voicing_all,thresh))
+            C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
+        else:
+            list_rca_female_all.append(rca(cent,model_cents,voicing_all,thresh))
+            C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]
+
+    list_rca_model_all = np.array(list_rca_model_all)
+    list_rca_male_all = np.array(list_rca_male_all)
+    list_rca_female_all = np.array(list_rca_female_all)
+
+
+    x = PrettyTable()
+
+    x.field_names = ["Experiment", "Mean RPA"]
+    x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
+
+    x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
+
+    x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])
+
+    print(x)
+
+    return None
+
+def cycle_eval(checkpoint_list, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50):
+    """
+    Cycle through SNR evaluation for list of checkpoints
+    """
+    list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
+    dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
+    random.shuffle(list_files)
+    list_files = list_files[:(int)(fraction*len(list_files))]
+
+    dict_models = {}
+    list_snr.append(np.inf)
+
+    for f in checkpoint_list:
+        if (f!='crepe') and (f!='lpcnet'):
+
+            checkpoint = torch.load(f, map_location='cpu')
+            dict_params = checkpoint['config']
+            if dict_params['data_format'] == 'if':
+                from models import large_if_ccode as model
+                pitch_nn = PitchDNNIF(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim'])
+            elif dict_params['data_format'] == 'xcorr':
+                from models import large_xcorr as model
+                pitch_nn = PitchDNNXcorr(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim'])
+            else:
+                from models import large_joint as model
+                pitch_nn = PitchDNN(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim'])
+
+            pitch_nn.load_state_dict(checkpoint['state_dict'])
+
+            N = dict_params['window_size']
+            H = dict_params['hop_factor']
+            freq_keep = dict_params['freq_keep']
+
+            list_mean = []
+            list_std = []
+            for snr_dB in list_snr:
+                C_all = 0
+                C_correct = 0
+                for idx in tqdm.trange(len(list_files)):
+                    audio_file = list_files[idx]
+                    file_name = os.path.basename(list_files[idx])[:-4]
+
+                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+                    offset = 432
+                    audio = audio[offset:]
+                    rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = N,hop_length = H))
+
+                    if noise_type != 'synthetic':
+                        list_noisefiles = noise_dataset + '*.wav'
+                        noise_file = random.choice(glob.glob(list_noisefiles))
+                        n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+                        rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
+                        n = n[rand_range:rand_range + audio.shape[0]]
+                    else:
+                        n = np.random.randn(audio.shape[0])
+                        n = random_filter(n)
+
+                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+                    audio = audio + snr_multiplier*n
+
+                    spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
+                    phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+                    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+                    idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+                    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+                    feature_if = feature[:,idx_save]
+
+                    data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+                    # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+                    data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
+
+                    subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
+                    feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+                    ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+                    feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+                    os.remove('./temp.raw')
+                    os.remove('./temp_xcorr.f32')
+
+                    if dict_params['data_format'] == 'if':
+                        feature = feature_if
+                    elif dict_params['data_format'] == 'xcorr':
+                        feature = feature_xcorr
+                    else:
+                        indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+                        feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+                    pitch = np.loadtxt(pitch_file_name)[:,0]
+                    voicing = np.loadtxt(pitch_file_name)[:,1]
+                    indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+                    pitch = pitch[:indmin]
+                    voicing = voicing[:indmin]
+                    rmse = rmse[:indmin]
+                    voicing = voicing*(rmse > 0.05*np.max(rmse))
+                    if "mic_F" in audio_file:
+                        idx_correct = np.where(pitch < 125)
+                        voicing[idx_correct] = 0
+
+                    cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+
+                    model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+                    model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+
+                    num_frames = min(cent.shape[0],model_cents.shape[0])
+                    pitch = pitch[:num_frames]
+                    cent = cent[:num_frames]
+                    voicing = voicing[:num_frames]
+                    model_cents = model_cents[:num_frames]
+
+                    voicing_all = np.copy(voicing)
+                    # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model
+                    force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+                    voicing_all[force_out_of_pitch] = 0
+                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+                    C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
+                list_mean.append(C_correct/C_all)
+        else:
+            fname = f
+            list_mean = []
+            list_std = []
+            for snr_dB in list_snr:
+                C_all = 0
+                C_correct = 0
+                for idx in tqdm.trange(len(list_files)):
+                    audio_file = list_files[idx]
+                    file_name = os.path.basename(list_files[idx])[:-4]
+
+                    audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+                    offset = 432
+                    audio = audio[offset:]
+                    rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
+
+                    if noise_type != 'synthetic':
+                        list_noisefiles = noise_dataset + '*.wav'
+                        noise_file = random.choice(glob.glob(list_noisefiles))
+                        n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+                        rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
+                        n = n[rand_range:rand_range + audio.shape[0]]
+                    else:
+                        n = np.random.randn(audio.shape[0])
+                        n = random_filter(n)
+
+                    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+                    audio = audio + snr_multiplier*n
+
+                    if (f == 'crepe'):
+                        _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
+                        model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
+                    else:
+                        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+                        # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+                        data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
+
+                        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
+                        feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
+                        model_cents = 1200*np.log2((256/feature_xcorr +  1.0e-8) + 1.0e-8)
+
+                        os.remove('./temp.raw')
+                        os.remove('./temp_xcorr.f32')
+                        os.remove('./temp_period.f32')
+
+
+                    pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+                    pitch = np.loadtxt(pitch_file_name)[:,0]
+                    voicing = np.loadtxt(pitch_file_name)[:,1]
+                    indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+                    pitch = pitch[:indmin]
+                    voicing = voicing[:indmin]
+                    rmse = rmse[:indmin]
+                    voicing = voicing*(rmse > 0.05*np.max(rmse))
+                    if "mic_F" in audio_file:
+                        idx_correct = np.where(pitch < 125)
+                        voicing[idx_correct] = 0
+
+                    cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+                    num_frames = min(cent.shape[0],model_cents.shape[0])
+                    pitch = pitch[:num_frames]
+                    cent = cent[:num_frames]
+                    voicing = voicing[:num_frames]
+                    model_cents = model_cents[:num_frames]
+
+                    voicing_all = np.copy(voicing)
+                    # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model
+                    force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+                    voicing_all[force_out_of_pitch] = 0
+                    C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+                    C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
+                list_mean.append(C_correct/C_all)
+        dict_models[fname] = {}
+        dict_models[fname]['list_SNR'] = list_mean[:-1]
+        dict_models[fname]['inf'] = list_mean[-1]
+
+    return dict_models
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/experiments.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/experiments.py
@@ -0,0 +1,38 @@
+"""
+Running the experiments;
+    1. RCA vs SNR for our models, CREPE, LPCNet
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
+parser.add_argument('output', type=str, help='Output dump file name')
+parser.add_argument('method', type=str, help='Output Directory to save experiment dumps',choices=['model','lpcnet','crepe'])
+parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Datset',default = './',required=False)
+parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
+parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
+parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
+parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+
+args = parser.parse_args()
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+import json
+from evaluation import cycle_eval
+
+if args.method == 'model':
+    dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
+else:
+    dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
+
+dict_store["method"] = args.method
+if args.method == 'model':
+    dict_store['pth'] = args.pth_file
+
+with open(args.output, 'w') as fp:
+    json.dump(dict_store, fp)
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/export_neuralpitch_weights.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/export_neuralpitch_weights.py
@@ -0,0 +1,109 @@
+"""
+/* Copyright (c) 2022 Amazon
+   Written by Jan Buethe */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+import sys
+
+# Make the sibling `../weight-exchange` directory (relative to this file)
+# importable; presumably this is where the `wexchange` package imported
+# below lives -- TODO confirm.
+sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
+
+
+# Command line: a checkpoint to read and a folder to write the C files to.
+parser = argparse.ArgumentParser()
+
+parser.add_argument('checkpoint', type=str, help='model checkpoint')
+parser.add_argument('output_dir', type=str, help='output folder')
+
+# NOTE: arguments are parsed at import time, before the heavy imports below.
+args = parser.parse_args()
+
+import torch
+import numpy as np
+
+from models import PitchDNN
+from wexchange.torch import dump_torch_weights
+from wexchange.c_export import CWriter, print_vector
+
+def c_export(args, model):
+    """Dump the layers of `model` through a CWriter.
+
+    The writer targets `<args.output_dir>/pitchdnn_data`. Dense layers and the
+    GRU are dumped with `quantize=True`; the two conv layers are dumped with
+    the default options. The largest value returned by the GRU dumps is
+    written to the header as PITCH_DNN_MAX_RNN_UNITS.
+    """
+    checkpoint_name = os.path.basename(args.checkpoint)
+    message = f"Auto generated from checkpoint {checkpoint_name}"
+    target = os.path.join(args.output_dir, "pitchdnn_data")
+
+    writer = CWriter(target, message=message, model_struct_name='PitchDNN')
+    writer.header.write('\n#include "opus_types.h"\n')
+
+    # Submodule name in the torch model -> name used in the generated C code.
+    dense_exports = {
+        'if_upsample.0': "dense_if_upsampler_1",
+        'if_upsample.2': "dense_if_upsampler_2",
+        'downsample.0': "dense_downsampler",
+        "upsample.0": "dense_final_upsampler",
+    }
+    for submodule_name, c_name in dense_exports.items():
+        dense = model.get_submodule(submodule_name)
+        dump_torch_weights(writer, dense, name=c_name, verbose=True, quantize=True, scale=None)
+
+    conv_exports = {
+        'conv.1': "conv2d_1",
+        'conv.4': "conv2d_2",
+    }
+    for submodule_name, c_name in conv_exports.items():
+        conv = model.get_submodule(submodule_name)
+        dump_torch_weights(writer, conv, name=c_name, verbose=True)
+
+    gru_exports = {
+        "GRU": "gru_1",
+    }
+    rnn_units = []
+    for submodule_name, c_name in gru_exports.items():
+        gru = model.get_submodule(submodule_name)
+        rnn_units.append(
+            dump_torch_weights(writer, gru, c_name, verbose=True, input_sparse=False, quantize=True, scale=None, recurrent_scale=None)
+        )
+    max_rnn_units = max(rnn_units)
+
+    writer.header.write(f"\n\n#define PITCH_DNN_MAX_RNN_UNITS {max_rnn_units}\n\n")
+
+    writer.close()
+
+
+if __name__ == "__main__":
+    # Create the output folder, restore the PitchDNN weights on the CPU and
+    # export them.
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    model = PitchDNN()
+    state_dict = torch.load(args.checkpoint, map_location='cpu')['state_dict']
+    model.load_state_dict(state_dict)
+
+    c_export(args, model)
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/models.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/models.py
@@ -0,0 +1,178 @@
+"""
+Pitch Estimation Models and dataloaders
+    - Classification Based (Input features, output logits)
+"""
+
+import torch
+import numpy as np
+
+class PitchDNNIF(torch.nn.Module):
+    """Pitch classifier that consumes the IF features only.
+
+    Two tanh dense layers feed a single GRU, followed by a tanh output layer.
+    `forward` maps (batch, time, input_dim) to (batch, output_dim, time).
+    """
+
+    def __init__(self, input_dim=88, gru_dim=64, output_dim=192):
+        super().__init__()
+
+        nn = torch.nn
+        self.activation = nn.Tanh()
+        # Attribute names and creation order are kept so that state-dict keys
+        # and seeded initialisation stay the same.
+        self.initial = nn.Linear(input_dim, gru_dim)
+        self.hidden = nn.Linear(gru_dim, gru_dim)
+        self.gru = nn.GRU(input_size=gru_dim, hidden_size=gru_dim, batch_first=True)
+        self.upsample = nn.Linear(gru_dim, output_dim)
+
+    def forward(self, x):
+        embedded = self.activation(self.initial(x))
+        embedded = self.activation(self.hidden(embedded))
+        recurrent, _ = self.gru(embedded)
+        scores = self.activation(self.upsample(recurrent))
+
+        # Move the class axis in front of the time axis.
+        return scores.permute(0,2,1)
+
+class PitchDNNXcorr(torch.nn.Module):
+    """Pitch classifier that consumes the xcorr features only.
+
+    Three padded 3x3 conv stages (1 -> 8 -> 8 -> 1 channels) run over the
+    (feature, time) plane, followed by a dense layer, a GRU and a tanh output
+    layer. `forward` maps (batch, time, input_dim) to
+    (batch, output_dim, time).
+    """
+
+    def __init__(self, input_dim=90, gru_dim=64, output_dim=192):
+        super().__init__()
+
+        self.activation = torch.nn.Tanh()
+
+        def conv_stage(channels_in, channels_out):
+            # Pad 2 on the left of the last axis and 1 on each side of the
+            # other one, so a 3x3 kernel keeps the spatial size.
+            return [
+                torch.nn.ZeroPad2d((2, 0, 1, 1)),
+                torch.nn.Conv2d(channels_in, channels_out, 3, bias=True),
+                self.activation,
+            ]
+
+        self.conv = torch.nn.Sequential(
+            *conv_stage(1, 8),
+            *conv_stage(8, 8),
+            *conv_stage(8, 1),
+        )
+
+        self.downsample = torch.nn.Sequential(torch.nn.Linear(input_dim, gru_dim), self.activation)
+        self.GRU = torch.nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
+        self.upsample = torch.nn.Sequential(torch.nn.Linear(gru_dim,output_dim), self.activation)
+
+    def forward(self, x):
+        # (batch, time, feat) -> (batch, 1, feat, time) for the 2-D convs.
+        plane = x.transpose(1, 2).unsqueeze(1)
+        filtered = self.conv(plane).squeeze(1)
+
+        # Back to (batch, time, feat) for the dense layer and the GRU.
+        recurrent, _ = self.GRU(self.downsample(filtered.transpose(1, 2)))
+
+        return self.upsample(recurrent).permute(0,2,1)
+
+class PitchDNN(torch.nn.Module):
+    """
+    Joint IF-xcorr pitch classifier.
+
+    The IF features go through two tanh dense layers (64 units each), the
+    xcorr features through two padded 3x3 conv stages (1 -> 4 -> 1 channels).
+    Both branches are concatenated, reduced to `gru_dim`, passed through a GRU
+    and projected to `output_dim` values per frame (no activation on the
+    output).
+
+    `forward` expects (batch, time, input_xcorr_dim + input_IF_dim) with the
+    xcorr features first, and returns (batch, output_dim, time).
+    """
+
+    def __init__(self,input_IF_dim=88, input_xcorr_dim=224, gru_dim=64, output_dim=192):
+        super().__init__()
+
+        # Remembered so that forward() splits the input at the configured
+        # boundary instead of a fixed column.
+        self.input_xcorr_dim = input_xcorr_dim
+
+        self.activation = torch.nn.Tanh()
+
+        self.if_upsample = torch.nn.Sequential(
+            torch.nn.Linear(input_IF_dim,64),
+            self.activation,
+            torch.nn.Linear(64,64),
+            self.activation,
+        )
+
+        self.conv = torch.nn.Sequential(
+            torch.nn.ZeroPad2d((2,0,1,1)),
+            torch.nn.Conv2d(1, 4, 3, bias=True),
+            self.activation,
+            torch.nn.ZeroPad2d((2,0,1,1)),
+            torch.nn.Conv2d(4, 1, 3, bias=True),
+            self.activation,
+        )
+
+        self.downsample = torch.nn.Sequential(
+            torch.nn.Linear(64 + input_xcorr_dim, gru_dim),
+            self.activation
+        )
+        self.GRU = torch.nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
+        self.upsample = torch.nn.Sequential(
+            torch.nn.Linear(gru_dim, output_dim)
+        )
+
+    def forward(self, x):
+        # The first input_xcorr_dim columns are xcorr, the rest are IF.
+        xcorr_feat = x[:,:,:self.input_xcorr_dim]
+        if_feat = x[:,:,self.input_xcorr_dim:]
+
+        # (batch, time, feat) -> (batch, 1, feat, time) for the 2-D convs,
+        # then back to (batch, time, feat).
+        xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0,3,2,1)).squeeze(1).permute(0,2,1)
+        if_feat = self.if_upsample(if_feat)
+
+        x = torch.cat([xcorr_feat,if_feat], dim=-1)
+        x,_ = self.GRU(self.downsample(x))
+        x = self.upsample(x).permute(0,2,1)
+
+        return x
+
+
+# Dataloaders
+class Loader(torch.utils.data.Dataset):
+    """Dataset of fixed-length sequences of IF features with pitch labels.
+
+    `features_if` is memory-mapped as float32 rows of width 3*dimension_if.
+    `file_pitch` is loaded with np.load; row 0 is divided by 20 and rounded
+    to give the class index (clipped to [0, 179]), row 1 is the confidence.
+    Confidences below `confidence_threshold` are set to 0. Everything is cut
+    into sequences of `context` frames; a trailing partial sequence is
+    dropped.
+    """
+
+    def __init__(self, features_if, file_pitch, confidence_threshold=0.4, dimension_if=30, context=100):
+        feature_width = 3*dimension_if
+        if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,feature_width)
+
+        pitch_data = np.load(file_pitch)
+
+        # Resolution of 20 cents
+        cents = np.clip(np.rint(pitch_data[0,:]/20),0,179)
+
+        # Filter confidence for CREPE
+        confidence = pitch_data[1,:]
+        confidence[confidence < confidence_threshold] = 0
+
+        # Keep only whole sequences that both inputs can provide.
+        size_common = min(if_feat.shape[0], cents.shape[0])
+        frame_max = size_common//context
+        usable = frame_max*context
+
+        self.context = context
+        self.if_feat = np.reshape(if_feat[:usable, :],(frame_max, context, feature_width))
+        self.cents = np.reshape(cents[:usable],(frame_max, context))
+        self.confidence = np.reshape(confidence[:usable],(frame_max, context))
+
+    def __len__(self):
+        return self.if_feat.shape[0]
+
+    def __getitem__(self, index):
+        features = torch.from_numpy(self.if_feat[index,:,:])
+        labels = torch.from_numpy(self.cents[index])
+        weights = torch.from_numpy(self.confidence[index])
+        return features, labels, weights
+
+class PitchDNNDataloader(torch.utils.data.Dataset):
+    """Dataset of fixed-length sequences of int8 features with pitch labels.
+
+    `features` is memory-mapped as int8 rows of width 312: the first 224
+    columns are the xcorr part, the remaining 88 the IF part. `file_pitch` is
+    memory-mapped as float32 (pitch, confidence) pairs. The class index is
+    rint(60*log2(pitch/62.5)) clipped to [0, 179]; the confidence is zeroed
+    where the unclipped index is outside [0, 180] or where it is below
+    `confidence_threshold`. `choice_data` selects what __getitem__ returns:
+    'both', 'if', or anything else for xcorr only.
+    """
+
+    def __init__(self, features, file_pitch, confidence_threshold=0.4, context=100, choice_data='both'):
+        self.feat = np.memmap(features, mode='r', dtype=np.int8).reshape(-1,312)
+        xcorr = self.feat[:,:224]
+        if_feat = self.feat[:,224:]
+
+        ground_truth = np.memmap(file_pitch, mode='r', dtype=np.float32).reshape(-1,2)
+        cents = np.rint(60*np.log2(ground_truth[:,0]/62.5))
+        in_range = (cents>=0).astype('float32') * (cents<=180).astype('float32')
+        cents = np.clip(cents,0,179)
+
+        confidence = ground_truth[:,1] * in_range
+        # Filter confidence for CREPE
+        confidence[confidence < confidence_threshold] = 0
+
+        self.context = context
+        self.choice_data = choice_data
+
+        # Keep whole sequences only.
+        frame_max = if_feat.shape[0]//context
+        usable = frame_max*context
+        self.if_feat = np.reshape(if_feat[:usable,:], (frame_max, context, 88))
+        self.cents = np.reshape(cents[:usable], (frame_max,context))
+        self.xcorr = np.reshape(xcorr[:usable,:], (frame_max,context, 224))
+        self.confidence = np.reshape(confidence[:usable], (frame_max, context))
+
+    def __len__(self):
+        return self.if_feat.shape[0]
+
+    def __getitem__(self, index):
+        labels = torch.from_numpy(self.cents[index])
+        weights = torch.from_numpy(self.confidence[index])
+
+        # int8 features are rescaled by 1/127 on access.
+        if self.choice_data == 'if':
+            return torch.from_numpy((1./127)*self.if_feat[index,:,:]), labels, weights
+
+        xcorr = torch.from_numpy((1./127)*self.xcorr[index,:,:])
+        if self.choice_data == 'both':
+            if_feat = torch.from_numpy((1./127)*self.if_feat[index,:,:])
+            return torch.cat([xcorr, if_feat], dim=-1), labels, weights
+
+        return xcorr, labels, weights
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/neural_pitch_update.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/neural_pitch_update.py
@@ -0,0 +1,179 @@
+import argparse
+
+
+def _str2bool(value):
+    """Convert a command-line string to a bool.
+
+    `type=bool` would turn every non-empty string (including "False") into
+    True, so the usual spellings are matched explicitly instead.
+
+    Raises argparse.ArgumentTypeError for anything that is not recognised.
+    """
+    if isinstance(value, bool):
+        return value
+    text = value.strip().lower()
+    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
+        return True
+    if text in ('', '0', 'false', 'f', 'no', 'n', 'off'):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('features', type=str, help='Features generated from dump_data')
+parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)')
+parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch')
+parser.add_argument('checkpoint', type=str, help='model checkpoint file')
+parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
+parser.add_argument('--device', type=str, help='compute device',default = None,required = False)
+parser.add_argument('--replace_xcorr', type = _str2bool, default = False, help='Replace LPCNet xcorr with updated one')
+
+args = parser.parse_args()
+
+import os
+
+from utils import stft, random_filter
+import subprocess
+import numpy as np
+import json
+import torch
+import tqdm
+
+from models import PitchDNNIF, PitchDNNXcorr, PitchDNN
+
+# An explicit --device wins; otherwise use CUDA when it is available.
+# (--device defaults to None, which torch.device() does not accept.)
+if args.device is not None:
+    device = torch.device(args.device)
+else:
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Loading the appropriate model
+checkpoint = torch.load(args.checkpoint, map_location='cpu')
+dict_params = checkpoint['config']
+
+# NOTE(review): training.py in this commit builds the IF input size as
+# `3 * freq_keep - 2` and passes 224 as the xcorr size for PitchDNN, while the
+# sizes here are `freq_keep*3` and `xcorr_dim` -- confirm that checkpoints
+# written by that script load with these dimensions.
+if dict_params['data_format'] == 'if':
+    pitch_nn = PitchDNNIF(dict_params['freq_keep']*3, dict_params['gru_dim'], dict_params['output_dim'])
+elif dict_params['data_format'] == 'xcorr':
+    pitch_nn = PitchDNNXcorr(dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])
+else:
+    pitch_nn = PitchDNN(dict_params['freq_keep']*3, dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])
+
+pitch_nn.load_state_dict(checkpoint['state_dict'])
+pitch_nn = pitch_nn.to(device)
+
+# STFT settings the model was trained with.
+N = dict_params['window_size']
+H = dict_params['hop_factor']
+freq_keep = dict_params['freq_keep']
+
+os.environ["OMP_NUM_THREADS"] = "16"
+
+
+def run_lpc(signal, lpcs, frame_length=160):
+    """Apply frame-wise LPC prediction to `signal`.
+
+    `lpcs` is a 2-D array with one row of coefficients per frame. For frame
+    i, the window signal[i*frame_length : (i+1)*frame_length + order - 1] is
+    convolved (mode 'valid') with row i and negated.
+
+    Returns (prediction, error), where error is signal[order:] - prediction.
+    """
+    num_frames, lpc_order = lpcs.shape
+    window_length = frame_length + lpc_order - 1
+
+    frame_predictions = []
+    for frame_idx in range(num_frames):
+        begin = frame_idx * frame_length
+        window = signal[begin : begin + window_length]
+        frame_predictions.append(- np.convolve(window, lpcs[frame_idx], mode='valid'))
+
+    prediction = np.concatenate(frame_predictions)
+    error = signal[lpc_order :] - prediction
+
+    return prediction, error
+
+
+if __name__ == "__main__":
+    # Rewrites the pitch and pitch-correlation columns of a feature file using
+    # the neural pitch model, processing the signal in chunks of frames.
+    # NOTE: the arguments were already parsed at module level; this parses
+    # them a second time.
+    args = parser.parse_args()
+
+    # Features are read as float32 rows of 36 values, the data file as int16
+    # pairs.
+    features = np.memmap(args.features, dtype=np.float32,mode = 'r').reshape((-1, 36))
+    data     = np.memmap(args.data, dtype=np.int16,mode = 'r').reshape((-1, 2))
+
+    num_frames = features.shape[0]
+    feature_dim = features.shape[1]
+
+    assert feature_dim == 36
+
+    # The output starts as a copy of the input features; only the pitch and
+    # xcorr columns are overwritten below.
+    output  = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
+    output[:, :36] = features
+
+    # lpc coefficients and signal
+    lpcs = features[:, 20:36]
+    sig = data[:, 1]
+
+    # parameters
+
+    # constants
+    pitch_min = 32
+    pitch_max = 256
+    lpc_order = 16
+    fs = 16000
+    frame_length = 160
+    overlap_frames = 100
+    chunk_size = 10000
+    # Every chunk is preceded by `overlap_frames` frames of history (zeros for
+    # the first chunk).
+    history_length = frame_length * overlap_frames
+    history = np.zeros(history_length, dtype=np.int16)
+    pitch_position=18
+    xcorr_position=19
+    # NOTE: conf_position is not used anywhere in this script.
+    conf_position=36
+
+    # From here on num_frames is derived from the signal length, not from the
+    # feature file.
+    num_frames = len(sig) // 160 - 1
+
+    frame_start = 0
+    frame_stop = min(frame_start + chunk_size, num_frames)
+    signal_start = 0
+    signal_stop = frame_stop * frame_length
+
+    # NOTE(review): this is a floor division, so a trailing partial chunk (and
+    # any input of at most chunk_size frames) gets no iteration -- confirm
+    # whether that is intended.
+    niters = (num_frames - 1)//chunk_size
+    for i in tqdm.trange(niters):
+        if (frame_start > num_frames - 1):
+            break
+        chunk = np.concatenate((history, sig[signal_start:signal_stop]))
+        # Same chunk with 80 extra samples of look-ahead.
+        chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))
+
+        # Feature computation
+        # spec is transposed so that its last axis indexes frames.
+        spec = stft(x = np.concatenate([np.zeros(80),chunk_la/(2**15 - 1)]), w = 'boxcar', N = N, H = H).T
+        # Unit-magnitude product of each frame with the conjugate of the
+        # previous one (np.roll wraps the last frame around to the first).
+        phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+        # Keep the first freq_keep bins of each of the three stacked groups
+        # (log magnitude, real part, imaginary part).
+        idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+        feature_if = feature[:,idx_save]
+
+        # Write the chunk to a temporary raw file in the working directory and
+        # run the external extractor on it.
+        data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
+        data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)
+
+        subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw'])
+        # The extractor output is read as float32 rows of 256 values, reversed
+        # along the second axis, and a column of ones is prepended.
+        feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+        feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+        os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw')
+        os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw')
+
+        if dict_params['data_format'] == 'if':
+            feature = feature_if
+        elif dict_params['data_format'] == 'xcorr':
+            feature = feature_xcorr
+        else:
+            # Truncate both to the shorter one and put xcorr first.
+            indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+            feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+        # Compute pitch with my model
+        model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+        # The winning class index is scaled by 20 and converted to a frequency
+        # relative to 62.5 (1200 units per octave).
+        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+        frequency = 62.5*2**(model_cents/1200)
+
+        # Drop the frames that belong to the history prefix.
+        frequency  = frequency[overlap_frames : overlap_frames + frame_stop - frame_start]
+
+        # convert frequencies to periods
+        periods    = np.round(fs / frequency)
+
+        periods = np.clip(periods, pitch_min, pitch_max)
+
+        output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50
+
+        # Number of whole frames needed to cover the largest pitch lag.
+        frame_offset = (pitch_max + frame_length - 1) // frame_length
+        offset = frame_offset * frame_length
+        padding = lpc_order
+
+
+        # For the first chunk there are no earlier LPC rows, so zeros are
+        # used for the frames before the start.
+        if frame_start < frame_offset:
+            lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
+        else:
+            lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop]
+
+        pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length)
+
+        # Normalised correlation between each frame of `error` and the same
+        # frame delayed by the estimated period.
+        # NOTE: this loop reuses the name `i` of the enclosing loop.
+        xcorr = np.zeros(frame_stop - frame_start)
+        for i, p in enumerate(periods.astype(np.int16)):
+            if p > 0:
+                f1 = error[offset + i * frame_length : offset + (i + 1) * frame_length]
+                f2 = error[offset + i * frame_length - p : offset + (i + 1) * frame_length - p]
+                xcorr[i] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)
+
+        output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5
+
+        # update buffers and indices
+        history = chunk[-history_length :]
+
+        frame_start += chunk_size
+        frame_stop += chunk_size
+        frame_stop = min(frame_stop, num_frames)
+
+        signal_start = frame_start * frame_length
+        signal_stop  = frame_stop  * frame_length
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/ptdb_process.sh
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/ptdb_process.sh
@@ -0,0 +1,34 @@
+# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories
+
+# Make folder for combined audio
+mkdir -p './combined_mic_16k/'
+# Make folder for combined pitch reference
+mkdir -p './combined_reference_f0/'
+
+# NOTE: unless bash's globstar option is enabled, `**` behaves like `*` and
+# matches exactly one directory level below MIC/ and REF/.
+
+# Resample male, then female, audio from 48 kHz to 16 kHz raw 16-bit PCM
+for gender in MALE FEMALE; do
+for i in ./"$gender"/MIC/**/*.wav; do
+j="$(basename "$i" .wav)"
+echo "$j"
+sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer "./combined_mic_16k/$j.raw"
+done
+done
+
+# Copy male, then female, reference pitch files
+for gender in MALE FEMALE; do
+for i in ./"$gender"/REF/**/*.f0; do
+j="$(basename "$i" .f0)"
+echo "$j"
+cp "$i" ./combined_reference_f0/
+done
+done
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/run_crepe.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/run_crepe.py
@@ -0,0 +1,72 @@
+"""
+Compute reference pitch and confidence with CREPE for a raw audio file
+1. Read the int16 input in chunks of `--chunk-size-frames` frames of 160 samples
+2. Run CREPE (Viterbi decoding, 16 kHz) on each chunk
+3. Zero the confidence where the pitch is outside [16000/256, 16000/32] Hz
+4. Write all (pitch, confidence) pairs to the output file as float32
+
+Notes: To ensure consistency with the discovered CREPE offset, we do the following
+- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('data', type=str, help='input raw audio data')
+parser.add_argument('output', type=str, help='output file (float32 pitch/confidence pairs)')
+parser.add_argument('--gpu-index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+parser.add_argument('--chunk-size-frames', type=int, help='Number of frames to process at a time',default = 100000,required = False)
+
+args = parser.parse_args()
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+import numpy as np
+import tqdm
+import crepe
+
+data = np.memmap(args.data, dtype=np.int16,mode = 'r')
+
+min_period = 32
+max_period = 256
+chunk_size_frames = args.chunk_size_frames
+chunk_size = chunk_size_frames*160
+
+nb_chunks = (data.shape[0]+79)//chunk_size+1
+
+# Per-chunk results are collected here and concatenated once at the end,
+# instead of re-copying the growing output array for every chunk.
+chunk_outputs = [np.zeros((0,2),dtype='float32')]
+
+for i in tqdm.trange(nb_chunks):
+    # Every chunk starts 80 samples early; the first one is padded with
+    # zeros instead.
+    if i==0:
+        chunk = np.concatenate([np.zeros(80),data[:chunk_size-80]])
+    elif i==nb_chunks-1:
+        chunk = data[i*chunk_size-80:]
+    else:
+        chunk = data[i*chunk_size-80:(i+1)*chunk_size-80]
+    chunk = chunk/np.array(32767.,dtype='float32')
+
+    # Clean Pitch/Confidence Estimate
+    # Padding input to CREPE by 80 samples to ensure it aligns
+    _, pitch, confidence, _ = crepe.predict(chunk, 16000, center=True, viterbi=True,verbose=0)
+    pitch = pitch[:chunk_size_frames]
+    confidence = confidence[:chunk_size_frames]
+
+
+    # Filter out of range pitches/confidences
+    confidence[pitch < 16000/max_period] = 0
+    confidence[pitch > 16000/min_period] = 0
+    pitch = np.reshape(pitch, (-1, 1))
+    confidence = np.reshape(confidence, (-1, 1))
+    chunk_outputs.append(np.concatenate([pitch, confidence], axis=-1, dtype='float32'))
+
+
+np.concatenate(chunk_outputs, axis=0).tofile(args.output)
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/training.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/training.py
@@ -0,0 +1,162 @@
+"""
+Training the neural pitch estimator
+
+"""
+
+import os
+import argparse
+
+# Command line of the training script. Positional arguments select the data,
+# the output folder and the model variant; the options set hyper-parameters.
+parser = argparse.ArgumentParser()
+
+# NOTE(review): the help texts mention .f32 / .npy files, but
+# PitchDNNDataloader (models.py) memory-maps the features as int8 and the
+# pitch file as raw float32 -- confirm which description is current.
+parser.add_argument('features', type=str, help='.f32 IF Features for training (generated by augmentation script)')
+parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
+parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
+parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
+# NOTE: --gpu_index is only referenced by commented-out code in this script.
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
+parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
+# --N and --H are only stored in the saved config by this script.
+parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
+parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
+parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
+parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
+parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
+parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
+parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
+parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
+parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
+# Used to build the checkpoint file name: <prefix>_<data_format>.pth
+parser.add_argument('--prefix', type=str, help="prefix for model export, default: model", default='model')
+parser.add_argument('--initial-checkpoint', type=str, help='initial checkpoint to start training from, default: None', default=None)
+
+
+args = parser.parse_args()
+
+# import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+# Seed both RNGs from the wall clock. The seeds are stored in the saved
+# config, so a run can be repeated with the same values afterwards.
+import time
+np_seed = int(time.time())
+torch_seed = int(time.time())
+
+import torch
+torch.manual_seed(torch_seed)
+import numpy as np
+np.random.seed(np_seed)
+from utils import count_parameters
+import tqdm
+from models import PitchDNN, PitchDNNIF, PitchDNNXcorr, PitchDNNDataloader
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+# Build the model variant that matches the chosen input data.
+if args.data_format == 'if':
+    pitch_nn = PitchDNNIF(3 * args.freq_keep - 2, args.gru_dim, args.output_dim)
+elif args.data_format == 'xcorr':
+    pitch_nn = PitchDNNXcorr(args.xcorr_dimension, args.gru_dim, args.output_dim)
+else:
+    pitch_nn = PitchDNN(3 * args.freq_keep - 2, 224, args.gru_dim, args.output_dim)
+
+# Optionally start from existing weights; strict=False tolerates missing or
+# unexpected keys in the checkpoint.
+if args.initial_checkpoint is not None:
+    checkpoint = torch.load(args.initial_checkpoint, map_location='cpu')
+    pitch_nn.load_state_dict(checkpoint['state_dict'], strict=False)
+
+
+dataset_training = PitchDNNDataloader(args.features,args.features_pitch,args.confidence_threshold,args.context,args.data_format)
+
+def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
+    """Confidence-weighted classification loss.
+
+    `logits` has the class axis at dim 1 (softmax is taken over it); `labels`
+    holds class indices below `nmax` and is one-hot encoded. With
+    choice == 'default' the result is the mean of
+    confidence * -log(p_label + 1e-6); otherwise it is the sum of
+    confidence * (1/q) * (1 - sum_k (p_k * onehot_k + 1e-7)**q).
+    """
+    probabilities = torch.softmax(logits, dim = 1).permute(0,2,1)
+    one_hot = torch.nn.functional.one_hot(labels.long(),nmax)
+    selected = probabilities*one_hot
+
+    if choice != 'default':
+        # Robust Cross Entropy
+        robust = (1.0/q)*(1 - torch.pow(selected + 1.0e-7,q).sum(dim=-1))
+        return torch.sum(confidence*robust)
+
+    # Categorical Cross Entropy
+    cross_entropy = -(torch.log(selected + 1.0e-6)*one_hot).sum(dim=-1)
+    return torch.mean(confidence*cross_entropy)
+
+def accuracy(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
+    """Return 1 - mean(confidence * mismatch).
+
+    `mismatch` is 1 where the arg-max class (class axis at dim 1 of `logits`)
+    differs from `labels`. `choice`, `nmax` and `q` are accepted but not used.
+    """
+    predicted = torch.softmax(logits, dim = 1).permute(0,2,1).argmax(dim = 2)
+    mismatch = (predicted != labels.long())*1.
+    weighted_error = torch.mean(confidence*mismatch)
+    return 1.-weighted_error
+
+# 95% / 5% train / test split, seeded with the torch seed.
+train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05], generator=torch.Generator().manual_seed(torch_seed))
+
+batch_size = 256
+train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
+test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
+
+pitch_nn = pitch_nn.to(device)
+num_params = count_parameters(pitch_nn)
+learning_rate = args.learning_rate
+model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate)
+
+num_epochs = args.epochs
+
+for epoch in range(num_epochs):
+    losses = []
+    accs = []
+    pitch_nn.train()
+    with tqdm.tqdm(train_dataloader) as train_epoch:
+        for i, (xi, yi, ci) in enumerate(train_epoch):
+            yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
+            pi = pitch_nn(xi.float())
+            loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
+            acc = accuracy(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
+            acc = acc.detach()
+
+            model_opt.zero_grad()
+            loss.backward()
+            model_opt.step()
+
+            # The progress bar shows running means over the current epoch.
+            losses.append(loss.item())
+            accs.append(acc.item())
+            avg_loss = np.mean(losses)
+            avg_acc = np.mean(accs)
+            train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss, "acc" : avg_acc.item()})
+
+    # Evaluate on the held-out split every fifth epoch.
+    if epoch % 5 == 0:
+        pitch_nn.eval()
+        losses = []
+        # No gradients are needed here; skipping them avoids building the
+        # autograd graph for every test batch.
+        with torch.no_grad(), tqdm.tqdm(test_dataloader) as test_epoch:
+            for i, (xi, yi, ci) in enumerate(test_epoch):
+                yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
+                pi = pitch_nn(xi.float())
+                loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
+                losses.append(loss.item())
+                avg_loss = np.mean(losses)
+                test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss})
+
+pitch_nn.eval()
+
+# Everything needed to rebuild the model and repeat the run is stored next to
+# the weights.
+config = dict(
+    data_format=args.data_format,
+    epochs=num_epochs,
+    window_size= args.N,
+    hop_factor= args.H,
+    freq_keep=args.freq_keep,
+    batch_size=batch_size,
+    learning_rate=learning_rate,
+    confidence_threshold=args.confidence_threshold,
+    model_parameters=num_params,
+    np_seed=np_seed,
+    torch_seed=torch_seed,
+    xcorr_dim=args.xcorr_dimension,
+    dim_input=3*args.freq_keep - 2,
+    gru_dim=args.gru_dim,
+    output_dim=args.output_dim,
+    choice_cel=args.choice_cel,
+    context=args.context,
+)
+
+# Create the output folder if needed so that the save at the end of a full
+# training run cannot fail on a missing directory.
+os.makedirs(args.output_folder, exist_ok=True)
+model_save_path = os.path.join(args.output_folder, f"{args.prefix}_{args.data_format}.pth")
+checkpoint = {
+    'state_dict': pitch_nn.state_dict(),
+    'config': config
+}
+torch.save(checkpoint, model_save_path)
--- a/managed_components/78__esp-opus/dnn/torch/neural-pitch/utils.py
+++ b/managed_components/78__esp-opus/dnn/torch/neural-pitch/utils.py
@@ -0,0 +1,59 @@
+"""
+Utility functions that are commonly used
+"""
+
+import numpy as np
+from scipy.signal import windows, lfilter
+from prettytable import PrettyTable
+
+
+# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
+def count_parameters(model):
+    """
+    Print a table of the trainable parameters of `model` and return their total size.
+
+    Parameters whose `requires_grad` flag is off are neither listed nor counted.
+    The table has one row per named parameter with its element count.
+    """
+    trainable = [
+        (name, tensor.numel())
+        for name, tensor in model.named_parameters()
+        if tensor.requires_grad
+    ]
+
+    table = PrettyTable(["Modules", "Parameters"])
+    for name, size in trainable:
+        table.add_row([name, size])
+    total_params = sum(size for _, size in trainable)
+
+    print(table)
+    print(f"Total Trainable Params: {total_params}")
+    return total_params
+
+def stft(x, w = 'boxcar', N = 320, H = 160):
+    """
+    Short-time Fourier transform of a 1-D signal.
+
+    Args:
+        x: 1-D sequence of samples.
+        w: window specification passed to scipy.signal.windows.get_window.
+        N: frame length in samples.
+        H: hop between consecutive frame starts, in samples.
+
+    Returns:
+        Complex array with one row per frame and N // 2 + 1 columns (the rfft
+        of each windowed frame). Frames start at 0, H, 2H, ... for every start
+        index below len(x). An empty input gives a (0, N // 2 + 1) array.
+    """
+    n_samples = len(x)
+    # Pad with N zeros so that frames starting near the end of x are full length.
+    x = np.concatenate([x, np.zeros(N)])
+    # The window is identical for every frame: build it once, not per frame.
+    window = windows.get_window(w, N)
+    frames = [np.fft.rfft(x[start:start + N] * window) for start in range(0, n_samples, H)]
+    if not frames:
+        # np.stack rejects an empty list; return an explicitly empty result instead.
+        return np.zeros((0, N // 2 + 1), dtype=complex)
+    return np.stack(frames)
+
+def random_filter(x):
+    """
+    Filter x with a randomly drawn second-order IIR filter.
+
+    Both the numerator and the denominator have a leading coefficient of 1;
+    their remaining two taps are sampled uniformly between -3/8 and 3/8 from
+    NumPy's global random state (a single draw of four values).
+    """
+    bound = 3.0 / 8
+    b1, b2, a1, a2 = np.random.uniform(low = -bound, high = bound, size = 4)
+    numerator = [1, b1, b2]
+    denominator = [1, a1, a2]
+    return lfilter(numerator, denominator, x)
+
+def feature_xform(feature):
+    """
+    Stack the xcorr features with half- and double-resolution copies of themselves.
+
+    `feature` is a 2-D array indexed (frame, lag); the original notes describe it
+    as the (N * 256) xcorr output of LPCNet. Resampling is done along the lag axis.
+
+    Returns an array of shape feature.shape + (3,) whose last axis holds, in order:
+    1. the features downsampled by 2, smoothed with [0.5, 0.5] and zero-padded
+       on the right back to the original width,
+    2. the unmodified features,
+    3. the features upsampled by 2, smoothed with [0.25, 0.5, 0.25] and truncated
+       to the original width.
+
+    No positional embedding is appended.
+    """
+
+    from scipy.signal import lfilter, resample_poly
+
+    n_frames, n_lags = feature.shape[0], feature.shape[1]
+
+    # Double resolution: keep only as many columns as the input has.
+    upsampled = resample_poly(feature, 2, 1, axis = 1)
+    upsampled = lfilter([0.25, 0.5, 0.25], [1], upsampled, axis = 1)[:, :n_lags]
+
+    # Half resolution: fill the missing columns with zeros.
+    downsampled = resample_poly(feature, 1, 2, axis = 1)
+    downsampled = lfilter([0.5, 0.5], [1], downsampled, axis = 1)
+    zero_tail = np.zeros((n_frames, n_lags - downsampled.shape[1]))
+    downsampled = np.concatenate([downsampled, zero_tail], axis = -1)
+
+    return np.stack((downsampled, feature, upsampled), axis = -1)