# Soprano-RVC / RVC / inference.py
import os
import sys
import torch
import librosa
import logging
import warnings
import numpy as np
import soundfile as sf
warnings.filterwarnings("ignore")
sys.path.append(os.getcwd())
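# Project-local modules, resolved via the sys.path entry added above.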
from modules import fairseq
from modules.config import Config
from modules.cut import cut, restore
from modules.pipeline import Pipeline
from modules.utils import clear_gpu_cache, check_predictors, check_embedders, load_audio
from modules.synthesizers import Synthesizer
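# Silence noisy third-party loggers so only this script's messages are shown.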
for l in ["torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
logging.getLogger(l).setLevel(logging.ERROR)
def run_inference_script(
is_half=False,
cpu_mode=False,
pitch=0,
filter_radius=3,
index_rate=0.5,
volume_envelope=1,
protect=0.5,
hop_length=64,
f0_method="rmvpe",
input_path=None,
output_path="./output.wav",
pth_path=None,
index_path=None,
export_format="wav",
embedder_model="contentvec_base",
resample_sr=0,
f0_autotune=False,
f0_autotune_strength=1,
split_audio=False,
clean_audio=False,
clean_strength=0.7
):
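    """Run RVC voice conversion.

    `input_path` may be a single audio file or a directory; for a directory,
    every supported audio file in it is converted and written next to the
    original as `<name>_output.<export_format>`.
    """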
    check_predictors(f0_method)
    check_embedders(embedder_model)

    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith(".pth"):
        print("[WARNING] Please provide a valid .pth model file.")
        return
config = Config(is_half=is_half, cpu_mode=cpu_mode)
cvt = VoiceConverter(config, pth_path, 0)
    if input_path and os.path.isdir(input_path):
        print("[INFO] Using batch conversion...")
        audio_files = [f for f in os.listdir(input_path) if f.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"))]
if not audio_files:
print("[WARNING] No audio files found.")
return
print(f"[INFO] Found {len(audio_files)} audio files for conversion.")
for audio in audio_files:
audio_path = os.path.join(input_path, audio)
output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")
print(f"[INFO] Conversion '{audio_path}'...")
if os.path.exists(output_audio): os.remove(output_audio)
cvt.convert_audio(
audio_input_path=audio_path,
audio_output_path=output_audio,
index_path=index_path,
embedder_model=embedder_model,
pitch=pitch,
f0_method=f0_method,
index_rate=index_rate,
volume_envelope=volume_envelope,
protect=protect,
hop_length=hop_length,
filter_radius=filter_radius,
export_format=export_format,
resample_sr=resample_sr,
f0_autotune=f0_autotune,
f0_autotune_strength=f0_autotune_strength,
split_audio=split_audio,
clean_audio=clean_audio,
clean_strength=clean_strength
)
print("[INFO] Conversion complete.")
else:
        if not input_path or not os.path.exists(input_path):
            print("[WARNING] Input audio file not found.")
            return

        print(f"[INFO] Converting '{input_path}'...")
if os.path.exists(output_path): os.remove(output_path)
cvt.convert_audio(
audio_input_path=input_path,
audio_output_path=output_path,
index_path=index_path,
embedder_model=embedder_model,
pitch=pitch,
f0_method=f0_method,
index_rate=index_rate,
volume_envelope=volume_envelope,
protect=protect,
hop_length=hop_length,
filter_radius=filter_radius,
export_format=export_format,
resample_sr=resample_sr,
f0_autotune=f0_autotune,
f0_autotune_strength=f0_autotune_strength,
split_audio=split_audio,
clean_audio=clean_audio,
clean_strength=clean_strength
)
print("[INFO] Conversion complete.")
class VoiceConverter:
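    """Loads a single RVC .pth voice model and converts audio through its pipeline."""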
    def __init__(self, config, model_path, sid=0):
self.config = config
self.device = config.device
self.hubert_model = None
self.tgt_sr = None
self.net_g = None
self.vc = None
self.cpt = None
self.version = None
self.n_spk = None
self.use_f0 = None
self.loaded_model = None
self.vocoder = "Default"
self.sample_rate = 16000
self.sid = sid
self.get_vc(model_path, sid)
def convert_audio(
self,
audio_input_path,
audio_output_path,
index_path,
embedder_model,
pitch,
f0_method,
index_rate,
volume_envelope,
protect,
hop_length,
filter_radius,
export_format,
resample_sr = 0,
f0_autotune=False,
f0_autotune_strength=1,
split_audio=False,
clean_audio=False,
clean_strength=0.5
):
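        """Convert one input file through the RVC pipeline and write it to `audio_output_path`."""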
try:
audio = load_audio(audio_input_path, self.sample_rate)
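            # Normalize so peaks stay below ~0.95 to avoid clipping.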
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1: audio /= audio_max
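            # Load the HuBERT-style embedder lazily, on the first conversion.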
if not self.hubert_model:
embedder_model_path = os.path.join("models", embedder_model + ".pt")
                if not os.path.exists(embedder_model_path): raise FileNotFoundError(f"[ERROR] Embedder model not found: {embedder_model}")
models = fairseq.load_model(embedder_model_path).to(self.device).eval()
self.hubert_model = models.half() if self.config.is_half else models.float()
if split_audio:
chunks = cut(
audio,
self.sample_rate,
db_thresh=-60,
min_interval=500
)
print(f"Split Total: {len(chunks)}")
else: chunks = [(audio, 0, 0)]
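            # Run each chunk through the conversion pipeline, keeping the
            # (start, end) offsets so split audio can be stitched back together.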
converted_chunks = [
(
start,
end,
self.vc.pipeline(
model=self.hubert_model,
net_g=self.net_g,
sid=self.sid,
audio=waveform,
f0_up_key=pitch,
f0_method=f0_method,
                        file_index=(
                            index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added") if index_path else ""
                        ),
index_rate=index_rate,
pitch_guidance=self.use_f0,
filter_radius=filter_radius,
volume_envelope=volume_envelope,
version=self.version,
protect=protect,
hop_length=hop_length,
energy_use=self.energy,
f0_autotune=f0_autotune,
f0_autotune_strength=f0_autotune_strength
)
) for waveform, start, end in chunks
]
audio_output = restore(
converted_chunks,
total_len=len(audio),
dtype=converted_chunks[0][2].dtype
) if split_audio else converted_chunks[0][2]
if self.tgt_sr != resample_sr and resample_sr > 0:
audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=resample_sr, res_type="soxr_vhq")
self.tgt_sr = resample_sr
if clean_audio:
from modules.noisereduce import reduce_noise
audio_output = reduce_noise(
y=audio_output,
sr=self.tgt_sr,
prop_decrease=clean_strength,
device=self.device
)
sf.write(audio_output_path, audio_output, self.tgt_sr, format=export_format)
except Exception as e:
import traceback
print(traceback.format_exc())
print(f"[ERROR] An error has occurred: {e}")
def get_vc(self, weight_root, sid):
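        """Load (or reload) the weights at `weight_root` and set up the pipeline; reset state when sid is cleared."""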
if sid == "" or sid == []:
self.cleanup()
clear_gpu_cache()
if not self.loaded_model or self.loaded_model != weight_root:
self.loaded_model = weight_root
self.load_model()
if self.cpt is not None: self.setup()
    def cleanup(self):
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()
        del self.net_g, self.cpt
        # Re-bind after deletion so later attribute access does not raise AttributeError.
        self.net_g = self.cpt = None
        clear_gpu_cache()
def load_model(self):
if os.path.isfile(self.loaded_model): self.cpt = torch.load(self.loaded_model, map_location="cpu")
else: self.cpt = None
def setup(self):
if self.cpt is not None:
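            # config[-1] is the target sample rate; config[-3] is the speaker
            # count, patched here from the embedding table's actual shape.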
self.tgt_sr = self.cpt["config"][-1]
self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
self.use_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
self.vocoder = self.cpt.get("vocoder", "Default")
self.energy = self.cpt.get("energy", False)
if self.vocoder != "Default": self.config.is_half = False
            self.net_g = Synthesizer(
                *self.cpt["config"],
                use_f0=self.use_f0,
                text_enc_hidden_dim=768 if self.version == "v2" else 256,
                vocoder=self.vocoder,
                energy=self.energy
            )
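            # enc_q (the training-time posterior encoder) is not needed for inference.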
del self.net_g.enc_q
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
self.net_g.eval().to(self.device)
self.net_g = (self.net_g.half() if self.config.is_half else self.net_g.float())
self.n_spk = self.cpt["config"][-3]
self.vc = Pipeline(self.tgt_sr, self.config)
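
# Example usage (a minimal sketch; the paths below are hypothetical
# placeholders, not files shipped with this repository):
#
#   run_inference_script(
#       input_path="./vocals.wav",
#       output_path="./converted.wav",
#       pth_path="./weights/voice.pth",
#       index_path="./weights/voice.index",
#       pitch=0,
#       f0_method="rmvpe",
#   )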