# Soprano-RVC/RVC/modules/generator.py
import os
import sys
import math
import torch
import parselmouth
import numba as nb
import numpy as np
from librosa import yin, pyin
from scipy.signal import medfilt
sys.path.append(os.getcwd())
from modules.rmvpe import RMVPE
from modules.utils import Autotune
from modules.torchfcpe import FCPE
from modules.pyworld import PYWORLD
from modules.swipe import swipe, stonemask
from modules.torchcrepe import CREPE, mean, median


@nb.jit(nopython=True)
def post_process(f0, f0_up_key, f0_mel_min, f0_mel_max):
    f0 = np.multiply(f0, pow(2, f0_up_key / 12))
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    return np.rint(f0_mel).astype(np.int32), f0
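
# Illustrative sanity check (approximate values, not part of the original file):
# with f0_min=50 and f0_max=1100, f0_mel_min ≈ 77.8 and f0_mel_max ≈ 1064.4, so an A4
# at 440 Hz with f0_up_key=0 maps to 1127 * ln(1 + 440/700) ≈ 549.6 mel, which
# post_process scales to coarse bucket round((549.6 - 77.8) * 254 / (1064.4 - 77.8) + 1) ≈ 122.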


class Generator:
    def __init__(self, sample_rate=16000, hop_length=160, f0_min=50, f0_max=1100, is_half=False, device="cpu"):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.is_half = is_half
        self.device = device
        self.window = 160
        self.ref_freqs = [
            49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42, 77.78, 82.41, 87.31, 92.50,
            98.00, 103.83, 110.00, 116.54, 123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00,
            196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99,
            392.00, 415.30, 440.00, 466.16, 493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99,
            783.99, 830.61, 880.00, 932.33, 987.77, 1046.50
        ]
        self.autotune = Autotune(self.ref_freqs)
        self.note_dict = self.autotune.note_dict

    def calculator(self, f0_method, x, f0_up_key=0, p_len=None, filter_radius=3, f0_autotune=False, f0_autotune_strength=1):
        if p_len is None:
            p_len = x.shape[0] // self.window

        f0 = self.compute_f0(f0_method, x, p_len, filter_radius if filter_radius % 2 != 0 else filter_radius + 1)
        if isinstance(f0, tuple):
            f0 = f0[0]

        if f0_autotune:
            f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)

        return post_process(
            f0,
            f0_up_key,
            1127 * math.log(1 + self.f0_min / 700),
            1127 * math.log(1 + self.f0_max / 700),
        )
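
    # Illustrative note (not in the original file): calculator returns a pair of arrays,
    # the quantized coarse f0 (int32 buckets in 1..255) and the pitch-shifted continuous
    # f0 in Hz, which is how RVC-style pipelines typically consume pitch features.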

    def _resize_f0(self, x, target_len):
        source = np.array(x)
        source[source < 0.001] = np.nan

        return np.nan_to_num(
            np.interp(
                np.arange(0, len(source) * target_len, len(source)) / target_len,
                np.arange(0, len(source)),
                source
            )
        )
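
    # Illustrative note (not in the original file): _resize_f0 linearly stretches or
    # compresses an f0 contour onto target_len evenly spaced frames; near-zero (unvoiced)
    # frames are turned into NaN before interpolation and mapped back to 0 by np.nan_to_num.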

    def compute_f0(self, f0_method, x, p_len, filter_radius):
        return {
            "pm": lambda: self.get_f0_pm(x, p_len),
            "dio": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "dio"),
            "mangio-crepe-tiny": lambda: self.get_f0_mangio_crepe(x, p_len, "tiny"),
            "mangio-crepe-small": lambda: self.get_f0_mangio_crepe(x, p_len, "small"),
            "mangio-crepe-medium": lambda: self.get_f0_mangio_crepe(x, p_len, "medium"),
            "mangio-crepe-large": lambda: self.get_f0_mangio_crepe(x, p_len, "large"),
            "mangio-crepe-full": lambda: self.get_f0_mangio_crepe(x, p_len, "full"),
            "crepe-tiny": lambda: self.get_f0_crepe(x, p_len, "tiny"),
            "crepe-small": lambda: self.get_f0_crepe(x, p_len, "small"),
            "crepe-medium": lambda: self.get_f0_crepe(x, p_len, "medium"),
            "crepe-large": lambda: self.get_f0_crepe(x, p_len, "large"),
            "crepe-full": lambda: self.get_f0_crepe(x, p_len, "full"),
            "fcpe": lambda: self.get_f0_fcpe(x, p_len),
            "fcpe-legacy": lambda: self.get_f0_fcpe(x, p_len, legacy=True),
            "rmvpe": lambda: self.get_f0_rmvpe(x, p_len),
            "rmvpe-legacy": lambda: self.get_f0_rmvpe(x, p_len, legacy=True),
            "harvest": lambda: self.get_f0_pyworld(x, p_len, filter_radius, "harvest"),
            "yin": lambda: self.get_f0_yin(x, p_len, mode="yin"),
            "pyin": lambda: self.get_f0_yin(x, p_len, mode="pyin"),
            "swipe": lambda: self.get_f0_swipe(x, p_len)
        }[f0_method]()
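
    # Illustrative note (not in the original file): an f0_method outside the keys above
    # raises KeyError, e.g. compute_f0("rmvpe", audio, p_len, 3) dispatches to
    # get_f0_rmvpe while compute_f0("unknown", audio, p_len, 3) fails.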

    def get_f0_pm(self, x, p_len):
        f0 = (
            parselmouth.Sound(
                x,
                self.sample_rate
            ).to_pitch_ac(
                time_step=self.window / self.sample_rate,
                voicing_threshold=0.6,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max
            ).selected_array["frequency"]
        )

        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")

        return f0

    def get_f0_mangio_crepe(self, x, p_len, model="full"):
        if not hasattr(self, "mangio_crepe"):
            self.mangio_crepe = CREPE(
                os.path.join(
                    "models",
                    f"crepe_{model}.pth"
                ),
                model_size=model,
                hop_length=self.hop_length,
                batch_size=self.hop_length * 2,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                return_periodicity=False
            )

        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)

        audio = torch.unsqueeze(torch.from_numpy(x).to(self.device, copy=True), dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()

        f0 = self.mangio_crepe.compute_f0(audio.detach(), pad=True)
        return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)

    def get_f0_crepe(self, x, p_len, model="full"):
        if not hasattr(self, "crepe"):
            self.crepe = CREPE(
                os.path.join(
                    "models",
                    f"crepe_{model}.pth"
                ),
                model_size=model,
                hop_length=self.hop_length,
                batch_size=512,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                return_periodicity=True
            )

        f0, pd = self.crepe.compute_f0(torch.tensor(np.copy(x))[None].float(), pad=True)
        f0, pd = mean(f0, 3), median(pd, 3)
        f0[pd < 0.1] = 0

        return self._resize_f0(f0[0].cpu().numpy(), p_len)

    def get_f0_fcpe(self, x, p_len, legacy=False):
        if not hasattr(self, "fcpe"):
            self.fcpe = FCPE(
                os.path.join(
                    "models",
                    ("fcpe_legacy" if legacy else "fcpe") + ".pt"
                ),
                hop_length=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=0.03 if legacy else 0.006,
                legacy=legacy
            )

        return self.fcpe.compute_f0(x, p_len)

    def get_f0_rmvpe(self, x, p_len, legacy=False):
        if not hasattr(self, "rmvpe"):
            self.rmvpe = RMVPE(
                os.path.join(
                    "models",
                    "rmvpe.pt"
                ),
                is_half=self.is_half,
                device=self.device,
            )

        f0 = (
            self.rmvpe.infer_from_audio_with_pitch(x, thred=0.03, f0_min=self.f0_min, f0_max=self.f0_max)
            if legacy
            else self.rmvpe.infer_from_audio(x, thred=0.03)
        )
        return self._resize_f0(f0, p_len)

    def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest"):
        if not hasattr(self, "pw"):
            self.pw = PYWORLD()

        x = x.astype(np.double)
        pw = self.pw.harvest if model == "harvest" else self.pw.dio

        f0, t = pw(
            x,
            fs=self.sample_rate,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.window / self.sample_rate
        )
        f0 = self.pw.stonemask(
            x,
            self.sample_rate,
            t,
            f0
        )

        if filter_radius > 2 and model == "harvest":
            f0 = medfilt(f0, filter_radius)
        elif model == "dio":
            for index, pitch in enumerate(f0):
                f0[index] = round(pitch, 1)

        return self._resize_f0(f0, p_len)

    def get_f0_swipe(self, x, p_len):
        f0, t = swipe(
            x.astype(np.float32),
            self.sample_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.window / self.sample_rate
        )

        return self._resize_f0(
            stonemask(
                x,
                self.sample_rate,
                t,
                f0
            ),
            p_len
        )

    def get_f0_yin(self, x, p_len, mode="yin"):
        is_yin = mode == "yin"
        yin_func = yin if is_yin else pyin

        f0 = yin_func(
            x.astype(np.float32),
            sr=self.sample_rate,
            fmin=self.f0_min,
            fmax=self.f0_max,
            hop_length=self.hop_length
        )
        if not is_yin:
            f0 = f0[0]

        return self._resize_f0(f0, p_len)
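

# Minimal usage sketch (illustrative only, not part of the original file); it uses the
# "pm" (parselmouth) estimator so no external model checkpoints are needed, and a
# synthetic 220 Hz sine stands in for real speech.
if __name__ == "__main__":
    sr = 16000
    t = np.arange(sr * 2) / sr  # two seconds of audio
    audio = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

    generator = Generator(sample_rate=sr, hop_length=160, device="cpu")
    coarse_f0, f0 = generator.calculator("pm", audio, f0_up_key=0)
    print(coarse_f0.shape, f0.shape)  # roughly len(audio) // 160 frames each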