import os
import sys
import torch
import librosa
import logging
import warnings

import numpy as np
import soundfile as sf

warnings.filterwarnings("ignore")
sys.path.append(os.getcwd())

from modules import fairseq
from modules.config import Config
from modules.cut import cut, restore
from modules.pipeline import Pipeline
from modules.synthesizers import Synthesizer
from modules.utils import check_predictors, check_embedders, clear_gpu_cache, load_audio

# Silence noisy third-party loggers.
for l in ["torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
    logging.getLogger(l).setLevel(logging.ERROR)

def run_inference_script(
    is_half=False,
    cpu_mode=False,
    pitch=0,
    filter_radius=3,
    index_rate=0.5,
    volume_envelope=1,
    protect=0.5,
    hop_length=64,
    f0_method="rmvpe",
    input_path=None,
    output_path="./output.wav",
    pth_path=None,
    index_path=None,
    export_format="wav",
    embedder_model="contentvec_base",
    resample_sr=0,
    f0_autotune=False,
    f0_autotune_strength=1,
    split_audio=False,
    clean_audio=False,
    clean_strength=0.7
):
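    """Run voice conversion on a single file or a directory of files.

    `input_path` may point to one audio file or to a directory; in the latter
    case every supported audio file in the directory is converted and written
    next to its source as `<name>_output.<export_format>`.
    """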
    check_predictors(f0_method)
    check_embedders(embedder_model)

    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith(".pth"):
        print("[WARNING] Please provide a valid .pth model file.")
        return

    if not input_path:
        print("[WARNING] Please provide an input audio file or folder.")
        return

    config = Config(is_half=is_half, cpu_mode=cpu_mode)
    cvt = VoiceConverter(config, pth_path, 0)

    if os.path.isdir(input_path):
        print("[INFO] Using batch conversion...")
        audio_files = [f for f in os.listdir(input_path) if f.lower().endswith((".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"))]

        if not audio_files:
            print("[WARNING] No audio files found.")
            return

        print(f"[INFO] Found {len(audio_files)} audio files for conversion.")

        for audio in audio_files:
            audio_path = os.path.join(input_path, audio)
            output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")

            print(f"[INFO] Converting '{audio_path}'...")
            if os.path.exists(output_audio): os.remove(output_audio)
            cvt.convert_audio(
                audio_input_path=audio_path,
                audio_output_path=output_audio,
                index_path=index_path,
                embedder_model=embedder_model,
                pitch=pitch,
                f0_method=f0_method,
                index_rate=index_rate,
                volume_envelope=volume_envelope,
                protect=protect,
                hop_length=hop_length,
                filter_radius=filter_radius,
                export_format=export_format,
                resample_sr=resample_sr,
                f0_autotune=f0_autotune,
                f0_autotune_strength=f0_autotune_strength,
                split_audio=split_audio,
                clean_audio=clean_audio,
                clean_strength=clean_strength
            )
        print("[INFO] Conversion complete.")
    else:
        if not os.path.exists(input_path):
            print("[WARNING] Input audio file not found.")
            return

        print(f"[INFO] Converting '{input_path}'...")
        if os.path.exists(output_path): os.remove(output_path)

        cvt.convert_audio(
            audio_input_path=input_path,
            audio_output_path=output_path,
            index_path=index_path,
            embedder_model=embedder_model,
            pitch=pitch,
            f0_method=f0_method,
            index_rate=index_rate,
            volume_envelope=volume_envelope,
            protect=protect,
            hop_length=hop_length,
            filter_radius=filter_radius,
            export_format=export_format,
            resample_sr=resample_sr,
            f0_autotune=f0_autotune,
            f0_autotune_strength=f0_autotune_strength,
            split_audio=split_audio,
            clean_audio=clean_audio,
            clean_strength=clean_strength
        )
        print("[INFO] Conversion complete.")

class VoiceConverter:
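    """Loads an RVC checkpoint and converts input audio to the target voice."""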
    def __init__(self, config, model_path, sid=0):
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None
        self.net_g = None
        self.vc = None
        self.cpt = None
        self.version = None
        self.n_spk = None
        self.use_f0 = None
        self.loaded_model = None
        self.vocoder = "Default"
        self.energy = False  # default; setup() overrides this from the checkpoint
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

    def convert_audio(
        self,
        audio_input_path,
        audio_output_path,
        index_path,
        embedder_model,
        pitch,
        f0_method,
        index_rate,
        volume_envelope,
        protect,
        hop_length,
        filter_radius,
        export_format,
        resample_sr=0,
        f0_autotune=False,
        f0_autotune_strength=1,
        split_audio=False,
        clean_audio=False,
        clean_strength=0.5
    ):
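        """Convert one audio file: load and normalize it, run the conversion
        pipeline (optionally on silence-split chunks), then resample, denoise,
        and write the result to `audio_output_path`."""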
        try:
            # Load at 16 kHz (the embedder's expected rate) and peak-normalize.
            audio = load_audio(audio_input_path, self.sample_rate)
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1: audio /= audio_max

            # Lazily load the speech embedder (e.g. ContentVec) on first use.
            if not self.hubert_model:
                embedder_model_path = os.path.join("models", embedder_model + ".pt")
                if not os.path.exists(embedder_model_path): raise FileNotFoundError(f"[ERROR] Embedder not found: {embedder_model}")

                models = fairseq.load_model(embedder_model_path).to(self.device).eval()
                self.hubert_model = models.half() if self.config.is_half else models.float()

            if split_audio:
                # Split on silence so long inputs are converted chunk by chunk.
                chunks = cut(
                    audio,
                    self.sample_rate,
                    db_thresh=-60,
                    min_interval=500
                )
                print(f"[INFO] Total chunks after split: {len(chunks)}")
            else: chunks = [(audio, 0, 0)]
            # Clean up the index path and point it at the "added" index file.
            file_index = index_path.strip(' "\n').replace("trained", "added") if index_path else ""

            converted_chunks = [
                (
                    start,
                    end,
                    self.vc.pipeline(
                        model=self.hubert_model,
                        net_g=self.net_g,
                        sid=self.sid,
                        audio=waveform,
                        f0_up_key=pitch,
                        f0_method=f0_method,
                        file_index=file_index,
                        index_rate=index_rate,
                        pitch_guidance=self.use_f0,
                        filter_radius=filter_radius,
                        volume_envelope=volume_envelope,
                        version=self.version,
                        protect=protect,
                        hop_length=hop_length,
                        energy_use=self.energy,
                        f0_autotune=f0_autotune,
                        f0_autotune_strength=f0_autotune_strength
                    )
                ) for waveform, start, end in chunks
            ]

            # Stitch converted chunks back onto the original timeline.
            audio_output = restore(
                converted_chunks,
                total_len=len(audio),
                dtype=converted_chunks[0][2].dtype
            ) if split_audio else converted_chunks[0][2]
            # Optionally resample the output to the requested rate.
            if self.tgt_sr != resample_sr and resample_sr > 0:
                audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=resample_sr, res_type="soxr_vhq")
                self.tgt_sr = resample_sr

            # Optionally apply noise reduction to the converted audio.
            if clean_audio:
                from modules.noisereduce import reduce_noise
                audio_output = reduce_noise(
                    y=audio_output,
                    sr=self.tgt_sr,
                    prop_decrease=clean_strength,
                    device=self.device
                )

            sf.write(audio_output_path, audio_output, self.tgt_sr, format=export_format)
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(f"[ERROR] An error occurred: {e}")

    def get_vc(self, weight_root, sid):
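        """Load the checkpoint at `weight_root` (unless it is already loaded)
        and build the synthesizer and pipeline for it."""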
        if sid in ("", []):
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.load_model()
            if self.cpt is not None: self.setup()

    def cleanup(self):
        # Release model references and free GPU memory.
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        # Reassign instead of `del` so the attributes keep existing as None.
        self.net_g = None
        self.cpt = None
        clear_gpu_cache()

    def load_model(self):
        self.cpt = torch.load(self.loaded_model, map_location="cpu") if os.path.isfile(self.loaded_model) else None

    def setup(self):
        if self.cpt is not None:
            self.tgt_sr = self.cpt["config"][-1]
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]  # speaker count from the embedding table

            self.use_f0 = self.cpt.get("f0", 1)
            self.version = self.cpt.get("version", "v1")
            self.vocoder = self.cpt.get("vocoder", "Default")
            self.energy = self.cpt.get("energy", False)

            # Non-default vocoders are not run in half precision.
            if self.vocoder != "Default": self.config.is_half = False

            self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, energy=self.energy)
            del self.net_g.enc_q  # the posterior encoder is only needed for training

            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.device)
            self.net_g = self.net_g.half() if self.config.is_half else self.net_g.float()

            self.n_spk = self.cpt["config"][-3]
            self.vc = Pipeline(self.tgt_sr, self.config)
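

# Example usage: a minimal sketch. The checkpoint path "models/voice.pth" and
# index path "models/voice.index" below are hypothetical placeholders, not
# files shipped with this repo; point them at your own trained model.
if __name__ == "__main__":
    run_inference_script(
        pitch=0,
        f0_method="rmvpe",
        input_path="./input.wav",
        output_path="./output.wav",
        pth_path="models/voice.pth",
        index_path="models/voice.index",
        clean_audio=True
    )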