Spaces:
Runtime error
Runtime error
| import os | |
| import sys | |
| import math | |
| import torch | |
| import parselmouth | |
| import numba as nb | |
| import numpy as np | |
| from librosa import yin, pyin | |
| from scipy.signal import medfilt | |
| sys.path.append(os.getcwd()) | |
| from modules.rmvpe import RMVPE | |
| from modules.utils import Autotune | |
| from modules.torchfcpe import FCPE | |
| from modules.pyworld import PYWORLD | |
| from modules.swipe import swipe, stonemask | |
| from modules.torchcrepe import CREPE, mean, median | |
def post_process(f0, f0_up_key, f0_mel_min, f0_mel_max):
    """Transpose an f0 contour and quantise it onto a coarse 1..255 mel grid.

    Args:
        f0: Array of fundamental-frequency values (0 is treated as unvoiced,
            since its mel value is not positive and ends up in bin 1).
        f0_up_key: Transposition in semitones; f0 is scaled by 2 ** (key / 12).
        f0_mel_min: Mel value mapped to coarse bin 1.
        f0_mel_max: Mel value mapped to coarse bin 255.

    Returns:
        A tuple ``(coarse, shifted)`` where ``coarse`` is the rounded int32
        bin index per frame and ``shifted`` is the transposed f0 contour.
    """
    shifted = np.multiply(f0, pow(2, f0_up_key / 12))
    mel = 1127 * np.log(1 + shifted / 700)

    # Only frames with a positive mel value are rescaled; the rest fall
    # through to the lower clamp below.
    voiced = mel > 0
    mel[voiced] = (mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1

    # Clamp into [1, 255] in place.
    np.clip(mel, 1, 255, out=mel)

    return np.rint(mel).astype(np.int32), shifted
class Generator:
    """Fundamental-frequency (f0) extractor with several selectable backends.

    ``calculator`` is the main entry point: it picks a backend by name,
    optionally autotunes the contour, and returns the coarse/continuous f0
    pair produced by ``post_process``. Backend models are created lazily on
    first use and cached on the instance.
    """

    def __init__(self, sample_rate=16000, hop_length=160, f0_min=50, f0_max=1100, is_half=False, device="cpu"):
        """Store extraction settings and build the autotune helper.

        Args:
            sample_rate: Sample rate of the audio passed to the backends.
            hop_length: Hop size handed to the CREPE, FCPE and YIN backends.
            f0_min: Lowest f0 to search for / map to the first coarse bin.
            f0_max: Highest f0 to search for / map to the last coarse bin.
            is_half: Forwarded to RMVPE as ``is_half``.
            device: Device string forwarded to the torch-based backends.
        """
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.is_half = is_half
        self.device = device
        # Fixed frame size in samples; used for the default p_len and for the
        # frame period of the parselmouth / pyworld / swipe backends. It does
        # not follow hop_length.
        self.window = 160
        # Reference pitches handed to Autotune. NOTE(review): these look like
        # equal-tempered semitone frequencies (the list contains A4 = 440 Hz);
        # confirm against Autotune.
        self.ref_freqs = [49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, 92.50, 98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00, 196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99, 783.99, 830.61, 880.00, 932.33, 987.77, 1046.50]
        self.autotune = Autotune(self.ref_freqs)
        self.note_dict = self.autotune.note_dict

    def calculator(self, f0_method, x, f0_up_key=0, p_len=None, filter_radius=3, f0_autotune=False, f0_autotune_strength=1):
        """Extract f0 with the named method and convert it to coarse bins.

        Args:
            f0_method: Backend name; see ``compute_f0`` for the accepted keys.
            x: Audio samples as a numpy array.
            f0_up_key: Transposition in semitones applied by ``post_process``.
            p_len: Target number of frames; defaults to
                ``x.shape[0] // self.window``.
            filter_radius: Median-filter kernel size for the harvest backend.
                Even values are bumped to the next odd number because the
                value is passed to ``scipy.signal.medfilt`` as kernel size.
            f0_autotune: When true, run the contour through Autotune.
            f0_autotune_strength: Strength argument forwarded to Autotune.

        Returns:
            The ``(coarse_f0, f0)`` tuple returned by ``post_process``.

        Raises:
            KeyError: If ``f0_method`` is not a known backend name.
        """
        if p_len is None:
            p_len = x.shape[0] // self.window

        kernel_size = filter_radius if filter_radius % 2 != 0 else filter_radius + 1
        f0 = self.compute_f0(f0_method, x, p_len, kernel_size)

        # Some backends may hand back a tuple; only the first item is the f0.
        if isinstance(f0, tuple):
            f0 = f0[0]

        if f0_autotune:
            # NOTE(review): this calls Autotune.autotune_f0 unbound with the
            # Generator as `self`; presumably it only reads `note_dict`, which
            # __init__ mirrors onto this instance. Confirm before switching to
            # self.autotune.autotune_f0(...).
            f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)

        return post_process(
            f0,
            f0_up_key,
            1127 * math.log(1 + self.f0_min / 700),
            1127 * math.log(1 + self.f0_max / 700),
        )

    def _resize_f0(self, x, target_len):
        """Linearly resample an f0 contour to ``target_len`` frames.

        Values below 0.001 are treated as unvoiced: they are turned into NaN
        before interpolation and any NaN in the result is mapped back to 0.
        An empty contour yields ``target_len`` zeros instead of raising.
        """
        source = np.array(x)
        if source.size == 0:
            # No frames were produced; report every target frame as unvoiced.
            return np.zeros(target_len)

        source[source < 0.001] = np.nan
        n_source = len(source)
        # Sample positions expressed in source-frame units.
        positions = np.arange(0, n_source * target_len, n_source) / target_len
        resampled = np.interp(positions, np.arange(0, n_source), source)
        return np.nan_to_num(resampled)

    def compute_f0(self, f0_method, x, p_len, filter_radius):
        """Run the backend registered under ``f0_method`` and return its f0.

        Raises:
            KeyError: If ``f0_method`` is not one of the registered names.
        """
        extractors = {
            "pm": lambda: self.get_f0_pm(x, p_len),
            "dio": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "dio"),
            "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, "tiny"),
            "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, "small"),
            "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, "medium"),
            "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, "large"),
            "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, "full"),
            "crepe-tiny": lambda: self.get_f0_crepe(x, p_len, "tiny"),
            "crepe-small": lambda: self.get_f0_crepe(x, p_len, "small"),
            "crepe-medium": lambda: self.get_f0_crepe(x, p_len, "medium"),
            "crepe-large": lambda: self.get_f0_crepe(x, p_len, "large"),
            "crepe-full": lambda: self.get_f0_crepe(x, p_len, "full"),
            "fcpe": lambda: self.get_f0_fcpe(x, p_len),
            "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, legacy=True),
            "rmvpe": lambda: self.get_f0_rmvpe(x, p_len),
            "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, p_len, legacy=True),
            "harvest": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "harvest"),
            "yin": lambda: self.get_f0_yin(x, p_len, mode="yin"),
            "pyin": lambda: self.get_f0_yin(x, p_len, mode="pyin"),
            "swipe": lambda: self.get_f0_swipe(x, p_len),
        }
        if f0_method not in extractors:
            # Same exception type as a plain dict lookup, but with a message
            # that tells the caller which names are valid.
            raise KeyError(f"Unknown f0 method {f0_method!r}; expected one of: {', '.join(extractors)}")
        return extractors[f0_method]()

    def get_f0_pm(self, x, p_len):
        """Estimate f0 with parselmouth's autocorrelation pitch tracker.

        The result is zero-padded on both sides up to ``p_len`` frames. A
        contour longer than ``p_len`` is returned as is (not truncated).
        """
        f0 = (
            parselmouth.Sound(
                x,
                self.sample_rate
            ).to_pitch_ac(
                # Frame step in seconds. The `* 1000 / 1000` is kept so the
                # floating-point value stays identical to the previous code.
                time_step=self.window / self.sample_rate * 1000 / 1000,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max
            ).selected_array["frequency"]
        )

        missing = p_len - len(f0)
        pad_left = (missing + 1) // 2
        pad_right = missing - pad_left
        if pad_left > 0 or pad_right > 0:
            f0 = np.pad(f0, [[pad_left, pad_right]], mode="constant")
        return f0

    def get_f0_mangio_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the CREPE wrapper in its "mangio" configuration.

        The audio is normalised by its 99.9th-percentile absolute amplitude
        before inference, and the result is resampled to ``p_len`` frames.
        ``x`` is expected to be a 1-D (mono) array.
        """
        # Rebuild the model when a different size is requested; a plain
        # hasattr check would keep serving the first size that was loaded.
        if not hasattr(self, "mangio_crepe") or getattr(self, "_mangio_crepe_model", None) != model:
            self.mangio_crepe = CREPE(
                os.path.join(
                    "models",
                    f"crepe_{model}.pth"
                ),
                model_size=model,
                hop_length=self.hop_length,
                batch_size=self.hop_length * 2,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                return_periodicity=False
            )
            self._mangio_crepe_model = model

        x = x.astype(np.float32)  # astype copies, so the caller's array is untouched
        peak = np.quantile(np.abs(x), 0.999)
        # Silent input has a zero peak; dividing would fill x with NaN/inf.
        if peak > 0:
            x /= peak

        audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
        f0 = self.mangio_crepe.compute_f0(audio, pad=True)
        return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)

    def get_f0_crepe(self, x, p_len, model="full"):
        """Estimate f0 with the CREPE wrapper, gated by periodicity.

        f0 and periodicity are passed through the project's ``mean`` and
        ``median`` helpers with argument 3 (presumably a window size — TODO
        confirm). Frames whose periodicity is below 0.1 are zeroed before the
        contour is resampled to ``p_len`` frames.
        """
        # Rebuild the model when a different size is requested (see
        # get_f0_mangio_crepe for the rationale).
        if not hasattr(self, "crepe") or getattr(self, "_crepe_model", None) != model:
            self.crepe = CREPE(
                os.path.join(
                    "models",
                    f"crepe_{model}.pth"
                ),
                model_size=model,
                hop_length=self.hop_length,
                batch_size=512,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                return_periodicity=True
            )
            self._crepe_model = model

        f0, pd = self.crepe.compute_f0(torch.tensor(np.copy(x))[None].float(), pad=True)
        f0, pd = mean(f0, 3), median(pd, 3)
        f0[pd < 0.1] = 0
        return self._resize_f0(f0[0].cpu().numpy(), p_len)

    def get_f0_fcpe(self, x, p_len, legacy=False):
        """Estimate f0 with FCPE (or its legacy checkpoint).

        Returns whatever ``FCPE.compute_f0(x, p_len)`` returns; no resampling
        is done here.
        """
        # The legacy and current variants use different checkpoints and
        # thresholds, so the cached model must match the requested variant.
        if not hasattr(self, "fcpe") or getattr(self, "_fcpe_legacy", None) != legacy:
            self.fcpe = FCPE(
                os.path.join(
                    "models",
                    ("fcpe_legacy" if legacy else "fcpe") + ".pt"
                ),
                hop_length=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03 if legacy else 0.006,
                legacy=legacy
            )
            self._fcpe_legacy = legacy

        return self.fcpe.compute_f0(x, p_len)

    def get_f0_rmvpe(self, x, p_len, legacy=False):
        """Estimate f0 with RMVPE and resample it to ``p_len`` frames.

        With ``legacy`` set, the pitch-bounded inference entry point is used
        and ``f0_min`` / ``f0_max`` are forwarded to it.
        """
        # Both variants share the same checkpoint, so one cached model suffices.
        if not hasattr(self, "rmvpe"):
            self.rmvpe = RMVPE(
                os.path.join(
                    "models",
                    "rmvpe.pt"
                ),
                is_half=self.is_half,
                device=self.device,
            )

        if legacy:
            f0 = self.rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max)
        else:
            f0 = self.rmvpe.infer_from_audio(x, thred=0.03)
        return self._resize_f0(f0, p_len)

    def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"):
        """Estimate f0 with the PYWORLD wrapper (harvest or dio) plus stonemask.

        Harvest output is median-filtered when ``filter_radius`` is above 2;
        dio output is rounded to one decimal place. The result is resampled
        to ``p_len`` frames.
        """
        if not hasattr(self, "pw"):
            self.pw = PYWORLD()

        x = x.astype(np.double)
        estimator = self.pw.harvest if model == "harvest" else self.pw.dio
        f0, t = estimator(
            x,
            fs=self.sample_rate,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.window / self.sample_rate
        )
        f0 = self.pw.stonemask(
            x,
            self.sample_rate,
            t,
            f0
        )

        if filter_radius > 2 and model == "harvest":
            f0 = medfilt(f0, filter_radius)
        elif model == "dio":
            # Vectorised replacement for a per-element round(pitch, 1) loop.
            f0 = np.round(f0, 1)
        return self._resize_f0(f0, p_len)

    def get_f0_swipe(self, x, p_len):
        """Estimate f0 with swipe, refine with stonemask, resample to ``p_len``."""
        f0, t = swipe(
            x.astype(np.float32),
            self.sample_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.window / self.sample_rate
        )
        refined = stonemask(
            x,
            self.sample_rate,
            t,
            f0
        )
        return self._resize_f0(refined, p_len)

    def get_f0_yin(self, x, p_len, mode="yin"):
        """Estimate f0 with librosa's ``yin`` or ``pyin`` and resample it.

        Any ``mode`` other than ``"yin"`` selects ``pyin``, whose result is
        indexed with ``[0]`` to obtain the f0 track.
        """
        # The selected mode and function are kept on the instance, as before.
        self.if_yin = mode == "yin"
        self.yin = yin if self.if_yin else pyin

        f0 = self.yin(
            x.astype(np.float32),
            sr=self.sample_rate,
            fmin=self.f0_min,
            fmax=self.f0_max,
            hop_length=self.hop_length
        )
        if not self.if_yin:
            f0 = f0[0]
        return self._resize_f0(f0, p_len)