add some code
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
## Neural Pitch Estimation
|
||||
|
||||
- Dataset Installation
|
||||
1. Download and unzip PTDB Dataset:
|
||||
wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
|
||||
unzip SPEECH_DATA_ZIPPED.zip
|
||||
|
||||
2. Inside the "SPEECH DATA" directory extracted above, run ptdb_process.sh to combine the male and female data
|
||||
|
||||
3. To download and combine the DEMAND noise dataset, simply run download_demand.sh
|
||||
|
||||
- LPCNet preparation
|
||||
1. To extract xcorr features, add lpcnet_extractor.c, add the relevant functions to lpcnet_enc.c, list the new header/C source files in Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor executable
|
||||
|
||||
- Dataset Augmentation and training (check out arguments to each of the following)
|
||||
1. Run data_augmentation.py
|
||||
2. Run training.py using augmented data
|
||||
3. Run experiments.py
|
||||
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
|
||||
1. Read in chunks and compute clean pitch first
|
||||
2. Then add in augmentation (Noise/Level/Response)
|
||||
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
|
||||
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
|
||||
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
|
||||
|
||||
Notes: To ensure consistency with the discovered CREPE offset, we do the following
|
||||
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
|
||||
- We pad the input audio to our feature computation with 160 zeros to center them
|
||||
"""
|
||||
|
||||
import argparse
|
||||
def _str2bool(value):
    """Convert a command-line token such as 'True' / 'False' into a bool.

    ``type=bool`` cannot be used with argparse for this purpose: ``bool('False')``
    is True because every non-empty string is truthy, so the flag could never be
    switched off explicitly.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with the former ``type=bool``.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line interface of the augmentation script.
parser = argparse.ArgumentParser()

# Required positional arguments.
parser.add_argument('data', type=str, help='input raw audio data')
parser.add_argument('output', type=str, help='output directory')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('noise_dataset', type=str, help='Location of the Demand Datset')

# Optional switches; every one of them has a default.
parser.add_argument('--flag_xcorr', type=_str2bool, help='Flag to additionally dump xcorr features',choices=[True,False],default = False,required = False)
parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False)
parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (that is not augment with anything)',default = 0.2,required = False)
parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False)
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)

# Parsed before the heavy imports so CUDA_VISIBLE_DEVICES can be set from --gpu_index first.
args = parser.parse_args()
|
||||
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
|
||||
|
||||
from utils import stft, random_filter
|
||||
|
||||
import numpy as np
|
||||
import tqdm
|
||||
import crepe
|
||||
import random
|
||||
import glob
|
||||
import subprocess
|
||||
|
||||
# Memory-map the raw 16-bit PCM input and keep only the requested leading fraction.
data_full = np.memmap(args.data, dtype=np.int16,mode = 'r')
data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])]

list_cents = []
list_confidences = []

N = args.N
H = args.H
freq_keep = args.freq_keep
# Minimum/Maximum periods, decided by LPCNet
min_period = 32
max_period = 256
f_ref = 16000/max_period
chunk_size = args.chunk_size
num_frames_chunk = chunk_size//H
# Column indices of the first `freq_keep` bins in each of the three stacked
# feature groups (log-magnitude, real and imaginary part of the phase difference).
list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])

# Number of chunks processed; one chunk fewer than fit into the data.
num_chunks = (data.shape[0]//chunk_size - 1)//1

output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(num_chunks*num_frames_chunk,list_indices_keep.shape[0]), mode='w+')
if args.flag_xcorr:
    output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(num_chunks*num_frames_chunk,257), mode='w+')

fraction_clean = args.fraction_clean

noise_dataset = args.noise_dataset

# List the noise recordings once instead of re-scanning the directory for every chunk.
noise_files = []
if args.choice_augment != 'synthetic':
    noise_files = glob.glob(noise_dataset + '*.wav')

# Candidate lengths (all primes up to 541) of the noise tail that gets zeroed.
LIST_PRIMES = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541]

for i in tqdm.trange(num_chunks):
    chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)

    # Clean Pitch/Confidence Estimate
    # Padding input to CREPE by 80 samples to ensure it aligns
    _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0)
    cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)

    # Filter out of range pitches/confidences
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0

    # Keep fraction of data clean, augment only 1 minus the fraction
    if (np.random.rand() > fraction_clean):
        # Response, generate controlled/random 2nd order IIR filter and filter chunk
        chunk = random_filter(chunk)

        # Level/Gain response {scale by random gain between 1.0e-3 and 10}
        # Generate random gain in dB and then convert to scale
        g_dB = np.random.uniform(low = -60, high = 20, size = 1)
        g = 10**(g_dB/20)

        # Noise Addition {Add random SNR 2nd order randomly colored noise}
        # Generate noise SNR value and add corresponding noise
        snr_dB = np.random.uniform(low = -20, high = 30, size = 1)

        if args.choice_augment == 'synthetic':
            n = np.random.randn(chunk_size)
        else:
            if not noise_files:
                raise FileNotFoundError("no noise files match '" + noise_dataset + "*.wav'")
            noise_file = random.choice(noise_files)
            n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
            # The last minute (16000*60 samples) of every noise file is reserved for testing
            rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0]))
            n = n[rand_range:rand_range + chunk.shape[0]]

        # Randomly filter the sampled noise as well
        n = random_filter(n)
        # Zero a random prime number (2..541) of trailing noise samples
        # (to prevent GRU from picking up temporal patterns)
        Nprime = random.choice(LIST_PRIMES)
        n[chunk_size - Nprime:] = np.zeros(Nprime)
        snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))

        chunk = g*(chunk + snr_multiplier*n)

    # Zero pad input audio by 160 to center the frames
    spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T
    phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
    feature = feature[:,list_indices_keep]

    frame_counts = [cent.shape[0],feature.shape[0],num_frames_chunk]

    if args.flag_xcorr:
        # Dump noisy audio into temp file
        data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16)
        # Make sure the samples are on disk before the external extractor reads them
        data_temp.flush()

        subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
        # Prepend a column of ones as the zero-lag value, giving 257 columns
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
        feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)

        os.remove('./temp_augment.raw')
        os.remove('./temp_augment_xcorr.f32')
        frame_counts.append(feature_xcorr.shape[0])

    # Trim every stream to the common number of frames
    num_frames = min(frame_counts)
    feature = feature[:num_frames,:]
    cent = cent[:num_frames]
    confidence = confidence[:num_frames]
    output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature
    # xcorr features exist only when they were requested; touching them
    # unconditionally raised NameError with --flag_xcorr disabled
    if args.flag_xcorr:
        feature_xcorr = feature_xcorr[:num_frames]
        output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr
    list_cents.append(cent)
    list_confidences.append(confidence)

# Push the memory-mapped feature files to disk explicitly
output_IF.flush()
if args.flag_xcorr:
    output_xcorr.flush()

list_cents = np.hstack(list_cents)
list_confidences = np.hstack(list_confidences)

# Row 0 holds the pitch in cents, row 1 the CREPE confidence
np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences]))
|
||||
@@ -0,0 +1,43 @@
|
||||
# Download the 16 kHz archive of every DEMAND noise environment from Zenodo record 1227121.
for noise_env in DKITCHEN DLIVING DWASHING NFIELD NPARK NRIVER OHALLWAY OMEETING OOFFICE PCAFETER PRESTO PSTATION TMETRO TCAR TBUS STRAFFIC SPSQUARE; do
    wget "https://zenodo.org/record/1227121/files/${noise_env}_16k.zip"
done

unzip '*.zip'

# Flatten all channels into one directory, naming each copy <environment>+<file>.
mkdir -p ./combined_demand_channels/
for file in */*.wav; do
    # Skip the literal pattern when nothing matched.
    [ -e "$file" ] || continue
    parentdir="$(dirname "$file")"
    # Do not re-copy files already collected by a previous run.
    [ "$parentdir" = "combined_demand_channels" ] && continue
    echo "$parentdir"
    fname="$(basename "$file")"
    # Quoted so that unusual file names are not split or glob-expanded.
    cp "$file" "./combined_demand_channels/$parentdir+$fname"
done
|
||||
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Evaluation script to compute the Raw Pitch Accuracy
|
||||
Procedure:
|
||||
- Look at all voiced frames in file
|
||||
- Compute number of pitches in those frames that lie within a 50 cent threshold
|
||||
RPA = (Total number of pitches within threshold summed across all files)/(Total number of voiced frames summed across all files)
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
|
||||
from prettytable import PrettyTable
|
||||
import numpy as np
|
||||
import glob
|
||||
import random
|
||||
import tqdm
|
||||
import torch
|
||||
import librosa
|
||||
import json
|
||||
from utils import stft, random_filter, feature_xform
|
||||
import subprocess
|
||||
import crepe
|
||||
|
||||
from models import PitchDNN, PitchDNNIF, PitchDNNXcorr
|
||||
|
||||
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
def rca(reference,input,voicing,thresh = 25):
    """
    Count the voiced frames whose estimate lies within `thresh` of the reference.

    reference, input : arrays of equal length holding the reference and the estimated values
    voicing          : array of the same length; a frame counts as voiced when its entry is non-zero
    thresh           : maximum absolute deviation (exclusive) for a frame to count as correct

    Returns the number of matching voiced frames as an int.
    """
    voiced_frames = np.nonzero(voicing != 0)[0]
    deviation = np.abs(reference - input)
    within_threshold = deviation[voiced_frames] < thresh
    return int(np.count_nonzero(within_threshold))
|
||||
|
||||
def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)):
    """
    Return the best rca() count obtained over a set of circular shifts of `input`.

    Each entry of `ind_arr` is a shift (in frames) applied to `input` with np.roll
    before it is scored against `reference`; the maximum count is returned.
    """
    counts_per_shift = np.array([rca(reference,np.roll(input,shift),voicing,thresh) for shift in ind_arr])

    return np.max(counts_per_shift)
|
||||
|
||||
def rpa(model,device = 'cpu',data_format = 'if'):
    """
    Print the Raw Pitch Accuracy of `model` on up to 1000 randomly chosen recordings.

    model       : callable network; it receives a float tensor of shape (1, frames, features)
                  and its output is reduced with argmax over dim 1, each class being 20 cents wide
    device      : device the input features are moved to before calling `model`
    data_format : 'if' feeds the instantaneous-frequency features, 'xcorr' the
                  cross-correlation features, any other value the concatenation [xcorr, if]

    The recordings and reference .f0 files are read from the hard-coded directories below.
    A table with the accuracy over all, male and female recordings is printed; the
    function returns None.
    """
    list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
    dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
    random.shuffle(list_files)
    list_files = list_files[:1000]

    # Voiced-frame totals (all / male / female) and per-file correct-frame counts
    C_all = 0
    C_all_m = 0
    C_all_f = 0
    list_rca_model_all = []
    list_rca_male_all = []
    list_rca_female_all = []

    thresh = 50
    N = 320
    H = 160
    freq_keep = 30
    # Loop invariant: columns of the first `freq_keep` bins of each of the three feature groups
    idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])

    for audio_file in tqdm.tqdm(list_files):
        # Open read-only: the default memmap mode is 'r+', which needs write access to the dataset
        audio = np.memmap(audio_file, dtype=np.int16, mode='r')/(2**15 - 1)
        offset = 432
        audio = audio[offset:]
        rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))

        # Instantaneous-frequency features (input zero padded by 160 samples)
        spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
        phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
        feature_if = feature[:,idx_save]

        # Cross-correlation features from the external extractor, exchanged through temp files
        data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
        data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
        # Make sure the samples are on disk before the extractor reads them
        data_temp.flush()

        subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
        feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
        feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)

        os.remove('./temp.raw')
        os.remove('./temp_xcorr.f32')

        if data_format == 'if':
            feature = feature_if
        elif data_format == 'xcorr':
            feature = feature_xcorr
        else:
            indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
            feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)

        # Reference file: column 0 is the pitch, column 1 the voicing decision (parsed once)
        pitch_file_name = dir_f0 + "ref" + os.path.basename(audio_file)[3:-4] + ".f0"
        f0_table = np.loadtxt(pitch_file_name)
        pitch = f0_table[:,0]
        voicing = f0_table[:,1]
        indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
        pitch = pitch[:indmin]
        voicing = voicing[:indmin]
        rmse = rmse[:indmin]
        # Frames quieter than 5% of the loudest frame are treated as unvoiced
        voicing = voicing*(rmse > 0.05*np.max(rmse))
        if "mic_F" in audio_file:
            idx_correct = np.where(pitch < 125)
            voicing[idx_correct] = 0

        # Reference pitch in cents relative to 16000/256 Hz
        cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')

        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()

        num_frames = min(cent.shape[0],model_cents.shape[0])
        pitch = pitch[:num_frames]
        cent = cent[:num_frames]
        voicing = voicing[:num_frames]
        model_cents = model_cents[:num_frames]

        voicing_all = np.copy(voicing)
        # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model
        force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
        voicing_all[force_out_of_pitch] = 0
        num_voiced = np.where(voicing_all != 0)[0].shape[0]
        num_correct = rca(cent,model_cents,voicing_all,thresh)

        C_all = C_all + num_voiced
        list_rca_model_all.append(num_correct)

        # Any file that is not a male recording is accounted as female
        if "mic_M" in audio_file:
            list_rca_male_all.append(num_correct)
            C_all_m = C_all_m + num_voiced
        else:
            list_rca_female_all.append(num_correct)
            C_all_f = C_all_f + num_voiced

    list_rca_model_all = np.array(list_rca_model_all)
    list_rca_male_all = np.array(list_rca_male_all)
    list_rca_female_all = np.array(list_rca_female_all)

    x = PrettyTable()

    x.field_names = ["Experiment", "Mean RPA"]
    x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])

    x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])

    x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])

    print(x)

    return None
|
||||
|
||||
def _add_noise_at_snr(audio, snr_dB, noise_type, noise_dataset):
    """Return `audio` plus noise scaled so that the mixture has the requested SNR in dB."""
    if noise_type != 'synthetic':
        list_noisefiles = noise_dataset + '*.wav'
        noise_file = random.choice(glob.glob(list_noisefiles))
        n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
        # NOTE(review): the original comment says "Last 1 minute of noise used for testing",
        # yet the start index is drawn from the first five minutes -- confirm the intended range.
        rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0]))
        n = n[rand_range:rand_range + audio.shape[0]]
    else:
        # NOTE(review): the filter is applied to synthetic noise only; the indentation
        # of this statement was reconstructed -- confirm.
        n = np.random.randn(audio.shape[0])
        n = random_filter(n)

    snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
    return audio + snr_multiplier*n


def _dump_temp_audio(audio):
    """Write `audio` as 16-bit PCM to ./temp.raw for the external extractor."""
    data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
    data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
    # Make sure the samples are on disk before another process reads them
    data_temp.flush()


def _if_features(audio, N, H, freq_keep):
    """Compute the instantaneous-frequency features (3*freq_keep columns per frame)."""
    spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
    phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
    phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
    idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
    feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
    return feature[:,idx_save]


def _xcorr_features(audio):
    """Run the external xcorr extractor on `audio` and return its 257-column feature matrix."""
    _dump_temp_audio(audio)
    subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
    feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
    ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
    feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
    os.remove('./temp.raw')
    os.remove('./temp_xcorr.f32')
    return feature_xcorr


def _lpcnet_cents(audio):
    """Return the pitch (in cents re 16000/256 Hz) derived from the extractor's period output."""
    _dump_temp_audio(audio)
    subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
    feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
    model_cents = 1200*np.log2((256/feature_xcorr + 1.0e-8) + 1.0e-8)
    os.remove('./temp.raw')
    os.remove('./temp_xcorr.f32')
    os.remove('./temp_period.f32')
    return model_cents


def _load_reference(pitch_file_name, rmse, audio_file):
    """Read a reference .f0 file and return (pitch, voicing, cent) trimmed to a common length."""
    # Column 0 is the pitch, column 1 the voicing decision; the file is parsed once
    f0_table = np.loadtxt(pitch_file_name)
    pitch = f0_table[:,0]
    voicing = f0_table[:,1]
    indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
    pitch = pitch[:indmin]
    voicing = voicing[:indmin]
    rmse = rmse[:indmin]
    # Frames quieter than 5% of the loudest frame are treated as unvoiced
    voicing = voicing*(rmse > 0.05*np.max(rmse))
    if "mic_F" in audio_file:
        idx_correct = np.where(pitch < 125)
        voicing[idx_correct] = 0

    cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
    return pitch, voicing, cent


def _count_voiced_and_correct(pitch, cent, voicing, model_cents, thresh):
    """Return (number of scored voiced frames, number of them within `thresh` cents)."""
    num_frames = min(cent.shape[0],model_cents.shape[0])
    pitch = pitch[:num_frames]
    cent = cent[:num_frames]
    voicing = voicing[:num_frames]
    model_cents = model_cents[:num_frames]

    voicing_all = np.copy(voicing)
    # Forcefully make regions where pitch is <65 or greater than 500 unvoiced for relevant accurate pitch comparisons for our model
    force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500))
    voicing_all[force_out_of_pitch] = 0
    return np.where(voicing_all != 0)[0].shape[0], rca(cent,model_cents,voicing_all,thresh)


def cycle_eval(checkpoint_list, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50):
    """
    Cycle through SNR evaluation for list of checkpoints

    checkpoint_list   : entries are either a path to a .pth checkpoint (holding 'config' and
                        'state_dict') or one of the baseline names 'crepe' / 'lpcnet'
    noise_type        : 'synthetic' adds filtered Gaussian noise, anything else samples a
                        *.wav file found under `noise_dataset`
    list_snr          : SNR values in dB to evaluate; the clean condition (infinite SNR) is
                        always evaluated in addition. The list is not modified.
    ptdb_dataset_path : root holding 'combined_mic_16k/' and 'combined_reference_f0/'
    fraction          : fraction of the (shuffled) recordings to evaluate
    thresh            : cent threshold passed to rca()

    Returns a dict mapping every entry of `checkpoint_list` to
    {'list_SNR': [accuracy per entry of list_snr], 'inf': accuracy without noise}.
    """
    list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
    dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
    random.shuffle(list_files)
    list_files = list_files[:(int)(fraction*len(list_files))]

    dict_models = {}
    # Work on a copy: appending to `list_snr` itself would grow the shared default
    # (and the caller's list) by one `inf` on every call.
    snr_values = list(list_snr) + [np.inf]

    for f in checkpoint_list:
        is_checkpoint = (f!='crepe') and (f!='lpcnet')
        if is_checkpoint:
            checkpoint = torch.load(f, map_location='cpu')
            dict_params = checkpoint['config']
            data_format = dict_params['data_format']
            if data_format == 'if':
                pitch_nn = PitchDNNIF(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim'])
            elif data_format == 'xcorr':
                pitch_nn = PitchDNNXcorr(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim'])
            else:
                pitch_nn = PitchDNN(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim'])

            pitch_nn.load_state_dict(checkpoint['state_dict'])
            # The inputs are moved to `device`, so the network has to live there too
            pitch_nn = pitch_nn.to(device)
            pitch_nn.eval()

            N = dict_params['window_size']
            H = dict_params['hop_factor']
            freq_keep = dict_params['freq_keep']
        else:
            # The baselines use a fixed 320-sample window and 160-sample hop for the energy gate
            N = 320
            H = 160

        list_mean = []
        for snr_dB in snr_values:
            C_all = 0
            C_correct = 0
            for audio_file in tqdm.tqdm(list_files):
                # Read-only mapping; the default memmap mode 'r+' needs write access to the dataset
                audio = np.memmap(audio_file, dtype=np.int16, mode='r')/(2**15 - 1)
                offset = 432
                audio = audio[offset:]
                # Frame energy of the clean signal, used later to gate the voicing decision
                rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = N,hop_length = H))

                audio = _add_noise_at_snr(audio, snr_dB, noise_type, noise_dataset)

                if is_checkpoint:
                    # Only compute the feature streams the network actually consumes
                    if data_format == 'if':
                        feature = _if_features(audio, N, H, freq_keep)
                    elif data_format == 'xcorr':
                        feature = _xcorr_features(audio)
                    else:
                        feature_if = _if_features(audio, N, H, freq_keep)
                        feature_xcorr = _xcorr_features(audio)
                        indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
                        feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)

                    with torch.no_grad():
                        model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
                    model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
                elif (f == 'crepe'):
                    _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
                    model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
                else:
                    model_cents = _lpcnet_cents(audio)

                pitch_file_name = dir_f0 + "ref" + os.path.basename(audio_file)[3:-4] + ".f0"
                pitch, voicing, cent = _load_reference(pitch_file_name, rmse, audio_file)

                num_voiced, num_correct = _count_voiced_and_correct(pitch, cent, voicing, model_cents, thresh)
                C_all = C_all + num_voiced
                C_correct = C_correct + num_correct
            # Without any voiced frame the accuracy is undefined
            list_mean.append(C_correct/C_all if C_all else float('nan'))

        # Results are keyed by the checkpoint path or baseline name
        # (the key variable was previously undefined for checkpoints)
        fname = f
        dict_models[fname] = {}
        dict_models[fname]['list_SNR'] = list_mean[:-1]
        dict_models[fname]['inf'] = list_mean[-1]

    return dict_models
|
||||
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
Running the experiments;
|
||||
1. RCA vs SNR for our models, CREPE, LPCNet
|
||||
"""
|
||||
|
||||
import argparse
parser = argparse.ArgumentParser()

parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
parser.add_argument('output', type=str, help='Output dump file name')
# The former help text of `method` was copied from an unrelated argument.
parser.add_argument('method', type=str, help='Pitch estimator to evaluate (model: checkpoint given via --pth_file)',choices=['model','lpcnet','crepe'])
parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Datset',default = './',required=False)
parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)

args = parser.parse_args()

# The GPU selection has to be in the environment before `evaluation` is imported.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)

# Fail early with a usage message instead of deep inside the checkpoint loader.
if args.method == 'model' and not os.path.isfile(args.pth_file):
    parser.error("--pth_file must point to an existing checkpoint when method is 'model'")

import json
from evaluation import cycle_eval

# 'model' evaluates the given checkpoint; the baselines are selected by their name.
target = args.pth_file if args.method == 'model' else args.method
dict_store = cycle_eval([target], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)

# Record how the results were produced next to the results themselves.
dict_store["method"] = args.method
if args.method == 'model':
    dict_store['pth'] = args.pth_file

with open(args.output, 'w') as fp:
    json.dump(dict_store, fp)
|
||||
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
/* Copyright (c) 2022 Amazon
|
||||
Written by Jan Buethe */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
|
||||
|
||||
|
||||
# Command-line interface: the checkpoint to export and the folder that receives the output.
parser = argparse.ArgumentParser()

parser.add_argument('checkpoint', type=str, help='model checkpoint')
parser.add_argument('output_dir', type=str, help='output folder')

# Parsed at module level; the resulting namespace is later passed to c_export().
args = parser.parse_args()
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from models import PitchDNN
|
||||
from wexchange.torch import dump_torch_weights
|
||||
from wexchange.c_export import CWriter, print_vector
|
||||
|
||||
def c_export(args, model):
    """
    Dump the layers of `model` through a CWriter rooted at <args.output_dir>/pitchdnn_data.

    args  : namespace providing `checkpoint` (only its base name is used, in the
            generated-file message) and `output_dir`
    model : module exposing the submodules named below via get_submodule()

    Dense and GRU layers are dumped with quantize=True, the convolution layers with
    the default options. The header additionally receives an include of
    "opus_types.h" and the PITCH_DNN_MAX_RNN_UNITS define.
    """
    checkpoint_name = os.path.basename(args.checkpoint)
    message = f"Auto generated from checkpoint {checkpoint_name}"

    target = os.path.join(args.output_dir, "pitchdnn_data")
    writer = CWriter(target, message=message, model_struct_name='PitchDNN')
    writer.header.write('\n#include "opus_types.h"\n')

    # Submodule path -> exported name (dicts keep insertion order, so the dump order is fixed)
    dense_exports = {
        'if_upsample.0': "dense_if_upsampler_1",
        'if_upsample.2': "dense_if_upsampler_2",
        'downsample.0': "dense_downsampler",
        "upsample.0": "dense_final_upsampler",
    }
    for module_path, c_name in dense_exports.items():
        dump_torch_weights(writer, model.get_submodule(module_path), name=c_name, verbose=True, quantize=True, scale=None)

    conv_exports = {
        'conv.1': "conv2d_1",
        'conv.4': "conv2d_2",
    }
    for module_path, c_name in conv_exports.items():
        dump_torch_weights(writer, model.get_submodule(module_path), name=c_name, verbose=True)

    gru_exports = {
        "GRU": "gru_1",
    }
    # The largest value returned for a recurrent layer becomes PITCH_DNN_MAX_RNN_UNITS
    rnn_units = []
    for module_path, c_name in gru_exports.items():
        units = dump_torch_weights(writer, model.get_submodule(module_path), c_name, verbose=True, input_sparse=False, quantize=True, scale=None, recurrent_scale=None)
        rnn_units.append(units)
    max_rnn_units = max(rnn_units)

    writer.header.write(f"\n\n#define PITCH_DNN_MAX_RNN_UNITS {max_rnn_units}\n\n")

    writer.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Make sure the destination exists before the writer opens files in it.
    os.makedirs(args.output_dir, exist_ok=True)

    # Restore the trained weights on the CPU and export them.
    state = torch.load(args.checkpoint, map_location='cpu')['state_dict']
    model = PitchDNN()
    model.load_state_dict(state)

    c_export(args, model)
|
||||
178
managed_components/78__esp-opus/dnn/torch/neural-pitch/models.py
Normal file
178
managed_components/78__esp-opus/dnn/torch/neural-pitch/models.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
Pitch Estimation Models and dataloaders
|
||||
- Classification Based (Input features, output logits)
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
class PitchDNNIF(torch.nn.Module):
    """Pitch classifier operating on a single feature stream.

    Two tanh dense layers feed a GRU, followed by a tanh output layer.
    """

    def __init__(self, input_dim=88, gru_dim=64, output_dim=192):
        """
        Args:
            input_dim: size of the last axis of the input features.
            gru_dim: width of the dense layers and of the GRU state.
            output_dim: number of output classes per frame.
        """
        super().__init__()

        nn = torch.nn
        self.activation = nn.Tanh()
        self.initial = nn.Linear(input_dim, gru_dim)
        self.hidden = nn.Linear(gru_dim, gru_dim)
        self.gru = nn.GRU(input_size=gru_dim, hidden_size=gru_dim, batch_first=True)
        self.upsample = nn.Linear(gru_dim, output_dim)

    def forward(self, x):
        """Map (batch, frames, input_dim) features to (batch, output_dim, frames) scores."""
        squash = self.activation

        embedded = squash(self.hidden(squash(self.initial(x))))
        sequence, _ = self.gru(embedded)
        scores = squash(self.upsample(sequence))

        # Move the class axis in front of the frame axis.
        return scores.permute(0, 2, 1)
|
||||
|
||||
class PitchDNNXcorr(torch.nn.Module):
    """Pitch classifier operating on a single 2-D feature map.

    Three 3x3 convolutions (1 -> 8 -> 8 -> 1 channels) are followed by a tanh
    dense layer, a GRU and a tanh output layer.
    """

    def __init__(self, input_dim=90, gru_dim=64, output_dim=192):
        """
        Args:
            input_dim: size of the last axis of the input features.
            gru_dim: width of the dense layer and of the GRU state.
            output_dim: number of output classes per frame.
        """
        super().__init__()

        nn = torch.nn
        self.activation = nn.Tanh()

        # ZeroPad2d takes (left, right, top, bottom). In forward() the last
        # axis is the frame axis, so it only receives two leading zeros, while
        # the feature axis is padded by one on each side. Each 3x3 convolution
        # therefore keeps the map size unchanged.
        pad = (2, 0, 1, 1)
        self.conv = nn.Sequential(
            nn.ZeroPad2d(pad),
            nn.Conv2d(1, 8, 3, bias=True),
            self.activation,
            nn.ZeroPad2d(pad),
            nn.Conv2d(8, 8, 3, bias=True),
            self.activation,
            nn.ZeroPad2d(pad),
            nn.Conv2d(8, 1, 3, bias=True),
            self.activation,
        )

        self.downsample = nn.Sequential(
            nn.Linear(input_dim, gru_dim),
            self.activation,
        )
        self.GRU = nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
        self.upsample = nn.Sequential(
            nn.Linear(gru_dim, output_dim),
            self.activation,
        )

    def forward(self, x):
        """Map (batch, frames, input_dim) features to (batch, output_dim, frames) scores."""
        # (batch, frames, dim) -> (batch, 1, dim, frames): a one-channel image.
        image = x.unsqueeze(-1).permute(0, 3, 2, 1)
        filtered = self.conv(image).squeeze(1)

        # Back to (batch, frames, dim) for the dense and recurrent layers.
        compact = self.downsample(filtered.permute(0, 2, 1))
        sequence, _ = self.GRU(compact)

        return self.upsample(sequence).permute(0, 2, 1)
|
||||
|
||||
class PitchDNN(torch.nn.Module):
    """
    Joint IF-xcorr
    1D CNN on IF, merge with xcorr, 2D CNN on merged + GRU
    """

    def __init__(self, input_IF_dim=88, input_xcorr_dim=224, gru_dim=64, output_dim=192):
        """
        Args:
            input_IF_dim: number of IF features per frame.
            input_xcorr_dim: number of xcorr features per frame; they occupy
                the first ``input_xcorr_dim`` entries of each input frame.
            gru_dim: width of the merged dense layer and of the GRU state.
            output_dim: number of output classes per frame.
        """
        super().__init__()

        # Remembered so that forward() splits the input at the configured
        # position instead of a hard-coded 224. A plain int attribute does not
        # show up in the state_dict, so existing checkpoints still load.
        self.input_xcorr_dim = input_xcorr_dim

        self.activation = torch.nn.Tanh()

        # IF branch: two tanh dense layers with a fixed width of 64.
        self.if_upsample = torch.nn.Sequential(
            torch.nn.Linear(input_IF_dim, 64),
            self.activation,
            torch.nn.Linear(64, 64),
            self.activation,
        )

        # xcorr branch: two 3x3 convolutions (1 -> 4 -> 1 channels). The frame
        # axis only receives two leading zeros; the feature axis is padded by
        # one on each side, so the map size is preserved.
        self.conv = torch.nn.Sequential(
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(1, 4, 3, bias=True),
            self.activation,
            torch.nn.ZeroPad2d((2, 0, 1, 1)),
            torch.nn.Conv2d(4, 1, 3, bias=True),
            self.activation,
        )

        self.downsample = torch.nn.Sequential(
            torch.nn.Linear(64 + input_xcorr_dim, gru_dim),
            self.activation
        )
        self.GRU = torch.nn.GRU(input_size=gru_dim, hidden_size=gru_dim, num_layers=1, batch_first=True)
        # No activation here: the output layer produces raw scores.
        self.upsample = torch.nn.Sequential(
            torch.nn.Linear(gru_dim, output_dim)
        )

    def forward(self, x):
        """Map (batch, frames, input_xcorr_dim + input_IF_dim) to (batch, output_dim, frames)."""
        split = self.input_xcorr_dim
        xcorr_feat = x[:, :, :split]
        if_feat = x[:, :, split:]

        # (batch, frames, dim) -> (batch, 1, dim, frames) for the 2-D convolutions,
        # then back to (batch, frames, dim).
        xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0, 3, 2, 1)).squeeze(1).permute(0, 2, 1)
        if_feat = self.if_upsample(if_feat)

        x = torch.cat([xcorr_feat, if_feat], dim=-1)
        x, _ = self.GRU(self.downsample(x))
        x = self.upsample(x).permute(0, 2, 1)

        return x
|
||||
|
||||
|
||||
# Dataloaders
|
||||
class Loader(torch.utils.data.Dataset):
    """Dataset of fixed-length sequences of IF features with pitch labels.

    Each item is ``(features, cents, confidence)`` for ``context`` consecutive
    frames.
    """

    def __init__(self, features_if, file_pitch, confidence_threshold=0.4, dimension_if=30, context=100):
        """
        Args:
            features_if: path to a raw float32 file holding ``3*dimension_if``
                values per frame.
            file_pitch: path to a ``.npy`` array whose row 0 is the pitch and
                row 1 the confidence (row 0 is presumably in cents, given the
                division by 20 below -- confirm against the producer).
            confidence_threshold: confidences below this value are set to 0.
            dimension_if: a frame holds ``3*dimension_if`` features.
            context: number of frames per returned sequence.
        """
        feat_dim = 3 * dimension_if
        self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1, feat_dim)

        # Read the pitch file once and take both rows from the same array.
        pitch_data = np.load(file_pitch)

        # Resolution of 20 cents
        self.cents = np.rint(pitch_data[0, :] / 20)
        self.cents = np.clip(self.cents, 0, 179)
        self.confidence = pitch_data[1, :]

        # Filter confidence for CREPE
        self.confidence[self.confidence < confidence_threshold] = 0
        self.context = context

        # Clip both to same size
        size_common = min(self.if_feat.shape[0], self.cents.shape[0])
        self.if_feat = self.if_feat[:size_common, :]
        self.cents = self.cents[:size_common]
        self.confidence = self.confidence[:size_common]

        # Drop the trailing frames that do not fill a whole sequence.
        frame_max = self.if_feat.shape[0] // context
        used = frame_max * context
        self.if_feat = np.reshape(self.if_feat[:used, :], (frame_max, context, feat_dim))
        self.cents = np.reshape(self.cents[:used], (frame_max, context))
        self.confidence = np.reshape(self.confidence[:used], (frame_max, context))

    def __len__(self):
        """Number of complete sequences."""
        return self.if_feat.shape[0]

    def __getitem__(self, index):
        """Return ``(features, cents, confidence)`` tensors for sequence ``index``."""
        return (
            torch.from_numpy(self.if_feat[index, :, :]),
            torch.from_numpy(self.cents[index]),
            torch.from_numpy(self.confidence[index]),
        )
|
||||
|
||||
class PitchDNNDataloader(torch.utils.data.Dataset):
    """Dataset of fixed-length sequences of int8 xcorr/IF features with pitch labels.

    Every frame of the feature file holds 312 int8 values: 224 xcorr values
    followed by 88 IF values. Items are ``(features, cents, confidence)`` for
    ``context`` consecutive frames, with the features scaled by 1/127.
    """

    def __init__(self, features, file_pitch, confidence_threshold=0.4, context=100, choice_data='both'):
        """
        Args:
            features: path to the raw int8 feature file (312 values per frame).
            file_pitch: path to a raw float32 file with two values per frame;
                column 0 is the pitch (presumably a frequency in Hz, since it
                is divided by 62.5 before the log -- confirm), column 1 the
                confidence.
            confidence_threshold: confidences below this value are set to 0.
            context: number of frames per returned sequence.
            choice_data: ``'both'``, ``'if'``, or anything else for xcorr only.
        """
        self.context = context
        self.choice_data = choice_data

        self.feat = np.memmap(features, mode='r', dtype=np.int8).reshape(-1, 312)
        self.xcorr = self.feat[:, :224]
        self.if_feat = self.feat[:, 224:]

        ground_truth = np.memmap(file_pitch, mode='r', dtype=np.float32).reshape(-1, 2)

        # 60 steps per octave above 62.5, i.e. a resolution of 20 cents.
        raw_cents = np.rint(60 * np.log2(ground_truth[:, 0] / 62.5))
        # Frames whose label falls outside [0, 180] get zero confidence.
        mask = (raw_cents >= 0).astype('float32') * (raw_cents <= 180).astype('float32')
        self.cents = np.clip(raw_cents, 0, 179)
        self.confidence = ground_truth[:, 1] * mask

        # Filter confidence for CREPE
        self.confidence[self.confidence < confidence_threshold] = 0

        # Drop the trailing frames that do not fill a whole sequence.
        frame_max = self.if_feat.shape[0] // context
        used = frame_max * context
        self.if_feat = np.reshape(self.if_feat[:used, :], (frame_max, context, 88))
        self.cents = np.reshape(self.cents[:used], (frame_max, context))
        self.xcorr = np.reshape(self.xcorr[:used, :], (frame_max, context, 224))
        self.confidence = np.reshape(self.confidence[:used], (frame_max, context))

    def __len__(self):
        """Number of complete sequences."""
        return self.if_feat.shape[0]

    def __getitem__(self, index):
        """Return ``(features, cents, confidence)`` tensors for sequence ``index``."""
        labels = torch.from_numpy(self.cents[index])
        weights = torch.from_numpy(self.confidence[index])

        if self.choice_data == 'if':
            inputs = torch.from_numpy((1. / 127) * self.if_feat[index, :, :])
        elif self.choice_data == 'both':
            # xcorr first, IF second: the same order as in the feature file.
            inputs = torch.cat(
                [
                    torch.from_numpy((1. / 127) * self.xcorr[index, :, :]),
                    torch.from_numpy((1. / 127) * self.if_feat[index, :, :]),
                ],
                dim=-1,
            )
        else:
            inputs = torch.from_numpy((1. / 127) * self.xcorr[index, :, :])

        return inputs, labels, weights
|
||||
@@ -0,0 +1,179 @@
|
||||
import argparse


def _str2bool(value):
    """Interpret a command-line string as a boolean.

    ``type=bool`` would turn every non-empty string, including "False", into
    True. Here the empty string and the usual negative spellings give False;
    any other value gives True, as it did before.
    """
    return value.strip().lower() not in {'', '0', 'false', 'f', 'no', 'n', 'off'}


parser = argparse.ArgumentParser()

parser.add_argument('features', type=str, help='Features generated from dump_data')
parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)')
parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch')
parser.add_argument('checkpoint', type=str, help='model checkpoint file')
parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
parser.add_argument('--device', type=str, help='compute device', default=None, required=False)
parser.add_argument('--replace_xcorr', type=_str2bool, default=False, help='Replace LPCNet xcorr with updated one')

# Parsed at import time, before the heavier imports below.
args = parser.parse_args()
|
||||
|
||||
import os
|
||||
|
||||
from utils import stft, random_filter
|
||||
import subprocess
|
||||
import numpy as np
|
||||
import json
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from models import PitchDNNIF, PitchDNNXcorr, PitchDNN
|
||||
|
||||
# Honour --device when it is given; otherwise prefer CUDA when it is available.
# (The previous check tested `device`, which is never None, so omitting
# --device ended up calling torch.device(None).)
if args.device is not None:
    device = torch.device(args.device)
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# Restore the checkpoint on the CPU and rebuild the matching architecture from
# the configuration stored alongside the weights.
checkpoint = torch.load(args.checkpoint, map_location='cpu')
dict_params = checkpoint['config']

# STFT settings and number of retained frequency bins used at training time.
N = dict_params['window_size']
H = dict_params['hop_factor']
freq_keep = dict_params['freq_keep']

# NOTE(review): the dimensions passed below are 3*freq_keep and xcorr_dim;
# confirm they match the layer sizes the checkpoint was trained with,
# otherwise load_state_dict() will reject the weights.
data_format = dict_params['data_format']
if data_format == 'if':
    pitch_nn = PitchDNNIF(freq_keep*3, dict_params['gru_dim'], dict_params['output_dim'])
elif data_format == 'xcorr':
    pitch_nn = PitchDNNXcorr(dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])
else:
    pitch_nn = PitchDNN(freq_keep*3, dict_params['xcorr_dim'], dict_params['gru_dim'], dict_params['output_dim'])

pitch_nn.load_state_dict(checkpoint['state_dict'])
pitch_nn = pitch_nn.to(device)

os.environ["OMP_NUM_THREADS"] = "16"
|
||||
|
||||
|
||||
def run_lpc(signal, lpcs, frame_length=160):
    """Apply frame-wise linear prediction and return prediction and residual.

    Args:
        signal: 1-D array holding ``lpc_order`` samples of history followed by
            ``num_frames * frame_length`` samples.
        lpcs: array of shape ``(num_frames, lpc_order)``, one coefficient set
            per frame. ``np.convolve`` flips its kernel, so ``lpcs[i][0]``
            weights the sample right before the predicted one.
        frame_length: number of samples predicted per frame.

    Returns:
        ``(prediction, error)`` where ``error = signal[lpc_order:] - prediction``.
    """
    num_frames, lpc_order = lpcs.shape

    # Each frame needs lpc_order - 1 extra samples so that the 'valid'
    # convolution yields exactly frame_length outputs.
    span = frame_length + lpc_order - 1

    pieces = []
    for frame in range(num_frames):
        first = frame * frame_length
        pieces.append(-np.convolve(signal[first:first + span], lpcs[frame], mode='valid'))

    prediction = np.concatenate(pieces)
    error = signal[lpc_order:] - prediction

    return prediction, error
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # `args` was already parsed at import time at the top of the file.

    features = np.memmap(args.features, dtype=np.float32, mode='r').reshape((-1, 36))
    data = np.memmap(args.data, dtype=np.int16, mode='r').reshape((-1, 2))

    num_frames = features.shape[0]
    feature_dim = features.shape[1]

    assert feature_dim == 36

    # The output starts as a copy of the input features; only the pitch and
    # xcorr columns are overwritten below.
    output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
    output[:, :36] = features

    # lpc coefficients and signal
    lpcs = features[:, 20:36]
    sig = data[:, 1]

    # constants
    pitch_min = 32
    pitch_max = 256
    lpc_order = 16
    fs = 16000
    frame_length = 160
    overlap_frames = 100
    chunk_size = 10000
    history_length = frame_length * overlap_frames
    history = np.zeros(history_length, dtype=np.int16)
    pitch_position = 18
    xcorr_position = 19

    # Scratch files exchanged with the external xcorr extractor.
    data_format = dict_params['data_format']
    path_audio_tmp = './temp_featcompute_' + data_format + '_.raw'
    path_xcorr_tmp = './temp_featcompute_xcorr_' + data_format + '_.raw'

    # Columns kept from the stacked [log-magnitude, real, imag] feature rows.
    n_bins = N//2 + 1
    idx_save = np.concatenate([np.arange(freq_keep), n_bins + np.arange(freq_keep), 2*n_bins + np.arange(freq_keep)])

    # Whole frames of residual history needed to look back pitch_max samples.
    frame_offset = (pitch_max + frame_length - 1) // frame_length
    offset = frame_offset * frame_length
    padding = lpc_order

    num_frames = len(sig) // frame_length - 1

    frame_start = 0
    frame_stop = min(frame_start + chunk_size, num_frames)
    signal_start = 0
    signal_stop = frame_stop * frame_length

    # Ceiling division: the last, possibly shorter, chunk must be processed
    # too. Flooring here skipped it, and skipped inputs shorter than one
    # chunk entirely.
    niters = (num_frames - 1) // chunk_size + 1
    for _ in tqdm.trange(niters):
        if frame_start > num_frames - 1:
            break
        chunk = np.concatenate((history, sig[signal_start:signal_stop]))
        # Same chunk with 80 samples of look-ahead.
        chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))

        # Feature computation
        spec = stft(x=np.concatenate([np.zeros(80), chunk_la/(2**15 - 1)]), w='boxcar', N=N, H=H).T
        phase_diff = spec*np.conj(np.roll(spec, 1, axis=-1))
        phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
        feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8), np.real(phase_diff), np.imag(phase_diff)], axis=0).T
        feature_if = feature[:, idx_save]

        # Hand the audio to the external extractor through a scratch file.
        data_temp = np.memmap(path_audio_tmp, dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
        data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)
        data_temp.flush()

        subprocess.run([args.path_lpcnet_extractor, path_audio_tmp, path_xcorr_tmp])
        # 256 values per frame, reversed along the lag axis, with a column of
        # ones prepended.
        feature_xcorr = np.flip(np.fromfile(path_xcorr_tmp, dtype='float32').reshape((-1, 256), order='C'), axis=1)
        ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]), -1)
        feature_xcorr = np.concatenate([ones_zero_lag, feature_xcorr], axis=-1)

        os.remove(path_audio_tmp)
        os.remove(path_xcorr_tmp)

        if data_format == 'if':
            feature = feature_if
        elif data_format == 'xcorr':
            feature = feature_xcorr
        else:
            indmin = min(feature_if.shape[0], feature_xcorr.shape[0])
            feature = np.concatenate([feature_xcorr[:indmin, :], feature_if[:indmin, :]], -1)

        # Compute pitch with my model
        with torch.no_grad():
            model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature, 0))).float().to(device))
        model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
        frequency = 62.5*2**(model_cents/1200)

        # Discard the frames that belong to the history prefix.
        frequency = frequency[overlap_frames: overlap_frames + frame_stop - frame_start]

        # convert frequencies to periods
        periods = np.round(fs / frequency)
        periods = np.clip(periods, pitch_min, pitch_max)

        output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50

        if frame_start < frame_offset:
            lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
        else:
            lpc_coeffs = lpcs[frame_start - frame_offset: frame_stop]

        pred, error = run_lpc(chunk[history_length - offset - padding:], lpc_coeffs, frame_length=frame_length)

        # Normalised correlation of each residual frame with itself one pitch
        # period earlier.
        xcorr = np.zeros(frame_stop - frame_start)
        for k, p in enumerate(periods.astype(np.int16)):
            # Plain Python int: mixing an np.int16 scalar with large Python
            # ints in the slice bounds can overflow under NumPy 2 promotion.
            p = int(p)
            if p > 0:
                lo = offset + k * frame_length
                hi = lo + frame_length
                f1 = error[lo: hi]
                f2 = error[lo - p: hi - p]
                xcorr[k] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)

        output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5

        # update buffers and indices
        history = chunk[-history_length:]

        frame_start += chunk_size
        frame_stop += chunk_size
        frame_stop = min(frame_stop, num_frames)

        signal_start = frame_start * frame_length
        signal_stop = frame_stop * frame_length

    # Make sure everything reaches the output file.
    output.flush()
|
||||
@@ -0,0 +1,34 @@
|
||||
# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories

# Make folder for combined audio
mkdir -p './combined_mic_16k/'
# Make folder for combined pitch reference
mkdir -p './combined_reference_f0/'

# NOTE: `**` only recurses when bash's globstar option is enabled; otherwise it
# matches a single directory level.

# Resample male, then female, audio from 48 kHz WAV to 16 kHz 16-bit signed raw PCM
for i in ./MALE/MIC/**/*.wav ./FEMALE/MIC/**/*.wav; do
    j="$(basename "$i" .wav)"
    echo "$j"
    sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer "./combined_mic_16k/$j.raw"
done

# Copy male, then female, reference pitch files
for i in ./MALE/REF/**/*.f0 ./FEMALE/REF/**/*.f0; do
    # Strip the .f0 suffix (the old `.wav` suffix never matched these files).
    j="$(basename "$i" .f0)"
    echo "$j"
    cp "$i" ./combined_reference_f0/
done
|
||||
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
|
||||
1. Read in chunks and compute clean pitch first
|
||||
2. Then add in augmentation (Noise/Level/Response)
|
||||
- Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
|
||||
- When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
|
||||
3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
|
||||
|
||||
Notes: To ensure consistency with the discovered CREPE offset, we do the following
|
||||
- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
|
||||
- We pad the input audio to our feature computation with 160 zeros to center them
|
||||
"""
|
||||
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('data', type=str, help='input raw audio data')
|
||||
parser.add_argument('output', type=str, help='output directory')
|
||||
parser.add_argument('--gpu-index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
|
||||
parser.add_argument('--chunk-size-frames', type=int, help='Number of frames to process at a time',default = 100000,required = False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
|
||||
|
||||
import numpy as np
|
||||
import tqdm
|
||||
import crepe
|
||||
|
||||
data = np.memmap(args.data, dtype=np.int16, mode='r')

min_period = 32
max_period = 256
chunk_size_frames = args.chunk_size_frames
chunk_size = chunk_size_frames*160

# The 80 zeros prepended to the first chunk shift every later chunk boundary
# by 80 samples, hence the +79 when counting chunks.
nb_chunks = (data.shape[0]+79)//chunk_size+1

# Per-chunk (pitch, confidence) arrays, joined once after the loop instead of
# re-copying the whole output on every iteration.
chunk_outputs = [np.zeros((0, 2), dtype='float32')]

for i in tqdm.trange(nb_chunks):
    if i == 0:
        chunk = np.concatenate([np.zeros(80), data[:chunk_size-80]])
    elif i == nb_chunks-1:
        chunk = data[i*chunk_size-80:]
    else:
        chunk = data[i*chunk_size-80:(i+1)*chunk_size-80]
    chunk = chunk/np.array(32767., dtype='float32')

    # Clean Pitch/Confidence Estimate
    # Padding input to CREPE by 80 samples to ensure it aligns
    _, pitch, confidence, _ = crepe.predict(chunk, 16000, center=True, viterbi=True, verbose=0)
    pitch = pitch[:chunk_size_frames]
    confidence = confidence[:chunk_size_frames]

    # Filter out of range pitches/confidences
    confidence[pitch < 16000/max_period] = 0
    confidence[pitch > 16000/min_period] = 0

    # One row per frame: column 0 is the pitch, column 1 the confidence.
    chunk_outputs.append(np.stack([pitch, confidence], axis=-1).astype('float32'))

output_data = np.concatenate(chunk_outputs, axis=0)
output_data.tofile(args.output)
|
||||
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
Training the neural pitch estimator
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('features', type=str, help='.f32 IF Features for training (generated by augmentation script)')
|
||||
parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
|
||||
parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
|
||||
parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
|
||||
parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
|
||||
parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
|
||||
parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
|
||||
parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
|
||||
parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
|
||||
parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
|
||||
parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
|
||||
parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
|
||||
parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
|
||||
parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
|
||||
parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
|
||||
parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
|
||||
parser.add_argument('--prefix', type=str, help="prefix for model export, default: model", default='model')
|
||||
parser.add_argument('--initial-checkpoint', type=str, help='initial checkpoint to start training from, default: None', default=None)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# import os
|
||||
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
|
||||
|
||||
# Seed the RNGs from the current time; the seeds are saved in the checkpoint config for reproducibility
|
||||
import time
|
||||
np_seed = int(time.time())
|
||||
torch_seed = int(time.time())
|
||||
|
||||
import torch
|
||||
torch.manual_seed(torch_seed)
|
||||
import numpy as np
|
||||
np.random.seed(np_seed)
|
||||
from utils import count_parameters
|
||||
import tqdm
|
||||
from models import PitchDNN, PitchDNNIF, PitchDNNXcorr, PitchDNNDataloader
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
# Build the network matching the requested input format. The IF branch takes
# 3*freq_keep - 2 features per frame.
if_dim = 3 * args.freq_keep - 2
if args.data_format == 'xcorr':
    pitch_nn = PitchDNNXcorr(args.xcorr_dimension, args.gru_dim, args.output_dim)
elif args.data_format == 'if':
    pitch_nn = PitchDNNIF(if_dim, args.gru_dim, args.output_dim)
else:
    # NOTE(review): the joint model uses a fixed xcorr width of 224 here,
    # not --xcorr_dimension -- confirm this is intended.
    pitch_nn = PitchDNN(if_dim, 224, args.gru_dim, args.output_dim)

# Optionally warm-start from an earlier checkpoint; strict=False tolerates
# missing or unexpected keys.
if args.initial_checkpoint is not None:
    checkpoint = torch.load(args.initial_checkpoint, map_location='cpu')
    pitch_nn.load_state_dict(checkpoint['state_dict'], strict=False)

dataset_training = PitchDNNDataloader(
    args.features,
    args.features_pitch,
    args.confidence_threshold,
    args.context,
    args.data_format,
)
|
||||
|
||||
def loss_custom(logits, labels, confidence, choice='default', nmax=192, q=0.7):
    """Confidence-weighted classification loss.

    Args:
        logits: scores of shape (batch, nmax, frames).
        labels: class indices of shape (batch, frames); cast to long here.
        confidence: per-frame weights of shape (batch, frames).
        choice: ``'default'`` for cross entropy, anything else for the robust
            variant.
        nmax: number of classes.
        q: exponent of the robust variant.

    Returns:
        A scalar tensor. The default loss is the mean over all frames; the
        robust loss is the sum over all frames.
    """
    probs = torch.softmax(logits, dim=1).permute(0, 2, 1)
    target = torch.nn.functional.one_hot(labels.long(), nmax)

    # Probability of the labelled class; zero everywhere else.
    selected = probs * target

    if choice != 'default':
        # Robust Cross Entropy
        per_frame = (1.0/q) * (1 - torch.sum(torch.pow(selected + 1.0e-7, q), dim=-1))
        return torch.sum(confidence * per_frame)

    # Categorical Cross Entropy
    per_frame = -torch.sum(torch.log(selected + 1.0e-6) * target, dim=-1)
    return torch.mean(confidence * per_frame)
|
||||
|
||||
def accuracy(logits, labels, confidence, choice='default', nmax=192, q=0.7):
    """Return one minus the mean confidence-weighted error rate.

    Args:
        logits: scores of shape (batch, classes, frames).
        labels: class indices of shape (batch, frames); cast to long here.
        confidence: per-frame weights of shape (batch, frames).
        choice, nmax, q: unused; accepted so the call signature mirrors
            ``loss_custom``.
    """
    predicted = torch.softmax(logits, dim=1).argmax(dim=1)
    mismatch = 1. * (predicted != labels.long())
    return 1. - torch.mean(confidence * mismatch)
|
||||
|
||||
# Hold out 5% of the sequences for validation; the split is driven by the
# recorded torch seed.
split_rng = torch.Generator().manual_seed(torch_seed)
train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95, 0.05], generator=split_rng)

batch_size = 256
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, **loader_options)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, **loader_options)

pitch_nn = pitch_nn.to(device)
# Prints a per-module table and returns the number of trainable parameters.
num_params = count_parameters(pitch_nn)

learning_rate = args.learning_rate
num_epochs = args.epochs
model_opt = torch.optim.Adam(pitch_nn.parameters(), lr=learning_rate)
|
||||
|
||||
for epoch in range(num_epochs):
    losses = []
    accs = []
    pitch_nn.train()
    with tqdm.tqdm(train_dataloader) as train_epoch:
        for xi, yi, ci in train_epoch:
            yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
            pi = pitch_nn(xi.float())
            loss = loss_custom(logits=pi, labels=yi, confidence=ci, choice=args.choice_cel, nmax=args.output_dim)
            acc = accuracy(logits=pi, labels=yi, confidence=ci, choice=args.choice_cel, nmax=args.output_dim)
            acc = acc.detach()

            model_opt.zero_grad()
            loss.backward()
            model_opt.step()

            # Running averages over the batches seen so far in this epoch.
            losses.append(loss.item())
            accs.append(acc.item())
            avg_loss = np.mean(losses)
            avg_acc = np.mean(accs)
            train_epoch.set_postfix({"Train Epoch": epoch, "Train Loss": avg_loss, "acc": avg_acc.item()})

    # Validate every fifth epoch.
    if epoch % 5 == 0:
        pitch_nn.eval()
        losses = []
        # No gradients are needed for validation, so skip building the graph.
        with torch.no_grad(), tqdm.tqdm(test_dataloader) as test_epoch:
            for xi, yi, ci in test_epoch:
                yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
                pi = pitch_nn(xi.float())
                loss = loss_custom(logits=pi, labels=yi, confidence=ci, choice=args.choice_cel, nmax=args.output_dim)
                losses.append(loss.item())
                avg_loss = np.mean(losses)
                test_epoch.set_postfix({"Epoch": epoch, "Test Loss": avg_loss})

pitch_nn.eval()
|
||||
|
||||
# Everything needed to rebuild the model and reproduce the run is stored next
# to the weights.
config = dict(
    data_format=args.data_format,
    epochs=num_epochs,
    window_size=args.N,
    hop_factor=args.H,
    freq_keep=args.freq_keep,
    batch_size=batch_size,
    learning_rate=learning_rate,
    confidence_threshold=args.confidence_threshold,
    model_parameters=num_params,
    np_seed=np_seed,
    torch_seed=torch_seed,
    xcorr_dim=args.xcorr_dimension,
    dim_input=3*args.freq_keep - 2,
    gru_dim=args.gru_dim,
    output_dim=args.output_dim,
    choice_cel=args.choice_cel,
    context=args.context,
)

# Create the output folder if needed so that a finished training run is not
# lost to a missing directory.
os.makedirs(args.output_folder, exist_ok=True)

model_save_path = os.path.join(args.output_folder, f"{args.prefix}_{args.data_format}.pth")
checkpoint = {
    'state_dict': pitch_nn.state_dict(),
    'config': config
}
torch.save(checkpoint, model_save_path)
|
||||
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Utility functions that are commonly used
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy.signal import windows, lfilter
|
||||
from prettytable import PrettyTable
|
||||
|
||||
|
||||
# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
|
||||
def count_parameters(model):
    """Print a table of trainable parameters per tensor and return their total.

    Parameters with ``requires_grad`` set to False are left out of both the
    table and the total.
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0

    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            size = parameter.numel()
            table.add_row([name, size])
            total_params += size

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
|
||||
|
||||
def stft(x, w='boxcar', N=320, H=160):
    """Short-time Fourier transform of a real 1-D signal.

    The signal is extended with ``N`` trailing zeros, then cut into frames of
    length ``N`` starting every ``H`` samples.

    Args:
        x: 1-D real signal.
        w: window specification accepted by ``scipy.signal.windows.get_window``.
        N: frame (and FFT) length.
        H: hop size between frame starts.

    Returns:
        Complex array of shape ``(num_frames, N//2 + 1)``.
    """
    x = np.concatenate([x, np.zeros(N)])
    # The window does not depend on the frame, so build it once.
    window = windows.get_window(w, N)
    return np.stack([np.fft.rfft(x[start:start + N] * window) for start in range(0, x.shape[0] - N, H)])
|
||||
|
||||
def random_filter(x):
    """Filter ``x`` with a random second-order IIR filter.

    The two free numerator and two free denominator coefficients are drawn
    uniformly from [-3/8, 3/8); the leading coefficient of each is 1.
    """
    bound = 3.0/8
    draws = np.random.uniform(low=-bound, high=bound, size=4)
    numerator = [1, draws[0], draws[1]]
    denominator = [1, draws[2], draws[3]]
    return lfilter(numerator, denominator, x)
|
||||
|
||||
def feature_xform(feature):
    """
    Take as input the (N * L) xcorr features output by LPCNet and stack three
    views of them along a new last axis:
    0. downsampled by 2 and smoothed, zero-padded back to L columns
    1. the unchanged input
    2. upsampled by 2 and smoothed, truncated to the first L columns

    Returns an array of shape (N, L, 3). No positional embedding is appended.
    """
    from scipy.signal import resample_poly, lfilter

    n_rows, n_cols = feature.shape[0], feature.shape[1]

    # Upsampled view: 3-tap smoothing, then keep the first n_cols columns.
    upsampled = resample_poly(feature, 2, 1, axis=1)
    feature_US = lfilter([0.25, 0.5, 0.25], [1], upsampled, axis=1)[:, :n_cols]

    # Downsampled view: 2-tap smoothing, then zero-pad to n_cols columns.
    downsampled = resample_poly(feature, 1, 2, axis=1)
    feature_DS = lfilter([0.5, 0.5], [1], downsampled, axis=1)
    filler = np.zeros((n_rows, n_cols - feature_DS.shape[1]))
    feature_DS = np.concatenate([feature_DS, filler], axis=-1)

    return np.stack((feature_DS, feature, feature_US), axis=-1)
|
||||
Reference in New Issue
Block a user