Soprano-RVC

Runtime error

File size: 10,219 Bytes

8150bbe

import os
import sys
import torch
import librosa
import logging
import warnings

import numpy as np
import soundfile as sf

warnings.filterwarnings("ignore")
sys.path.append(os.getcwd())

from modules import fairseq
from modules.config import Config
from modules.cut import cut, restore
from modules.pipeline import Pipeline
from modules.utils import clear_gpu_cache
from modules.synthesizers import Synthesizer
from modules.utils import check_predictors, check_embedders, load_audio

for l in ["torch", "faiss", "omegaconf", "httpx", "httpcore", "faiss.loader", "numba.core", "urllib3", "transformers", "matplotlib"]:
    logging.getLogger(l).setLevel(logging.ERROR)

def run_inference_script(
    is_half=False, 
    cpu_mode=False,
    pitch=0, 
    filter_radius=3, 
    index_rate=0.5, 
    volume_envelope=1, 
    protect=0.5, 
    hop_length=64, 
    f0_method="rmvpe", 
    input_path=None, 
    output_path="./output.wav", 
    pth_path=None, 
    index_path=None, 
    export_format="wav", 
    embedder_model="contentvec_base", 
    resample_sr=0,  
    f0_autotune=False, 
    f0_autotune_strength=1, 
    split_audio=False,
    clean_audio=False, 
    clean_strength=0.7
):
    check_predictors(f0_method); check_embedders(embedder_model)
    
    if not pth_path or not os.path.exists(pth_path) or os.path.isdir(pth_path) or not pth_path.endswith(".pth"):
        print("[WARNING] Please enter a valid model.")
        return

    config = Config(is_half=is_half, cpu_mode=cpu_mode)
    cvt = VoiceConverter(config, pth_path, 0)

    if os.path.isdir(input_path):
        print("[INFO] Use batch conversion...")
        audio_files = [f for f in os.listdir(input_path) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]

        if not audio_files: 
            print("[WARNING] No audio files found.")
            return

        print(f"[INFO] Found {len(audio_files)} audio files for conversion.")

        for audio in audio_files:
            audio_path = os.path.join(input_path, audio)
            output_audio = os.path.join(input_path, os.path.splitext(audio)[0] + f"_output.{export_format}")

            print(f"[INFO] Conversion '{audio_path}'...")
            if os.path.exists(output_audio): os.remove(output_audio)

            cvt.convert_audio(
                audio_input_path=audio_path, 
                audio_output_path=output_audio, 
                index_path=index_path, 
                embedder_model=embedder_model, 
                pitch=pitch, 
                f0_method=f0_method, 
                index_rate=index_rate, 
                volume_envelope=volume_envelope, 
                protect=protect, 
                hop_length=hop_length, 
                filter_radius=filter_radius, 
                export_format=export_format, 
                resample_sr=resample_sr, 
                f0_autotune=f0_autotune, 
                f0_autotune_strength=f0_autotune_strength,
                split_audio=split_audio,
                clean_audio=clean_audio,
                clean_strength=clean_strength
            )

        print("[INFO] Conversion complete.")
    else:
        if not os.path.exists(input_path):
            print("[WARNING] No audio files found.")
            return

        print(f"[INFO] Conversion '{input_path}'...")
        if os.path.exists(output_path): os.remove(output_path)

        cvt.convert_audio(
            audio_input_path=input_path, 
            audio_output_path=output_path, 
            index_path=index_path, 
            embedder_model=embedder_model, 
            pitch=pitch, 
            f0_method=f0_method, 
            index_rate=index_rate, 
            volume_envelope=volume_envelope, 
            protect=protect, 
            hop_length=hop_length, 
            filter_radius=filter_radius,  
            export_format=export_format, 
            resample_sr=resample_sr, 
            f0_autotune=f0_autotune, 
            f0_autotune_strength=f0_autotune_strength,
            split_audio=split_audio,
            clean_audio=clean_audio,
            clean_strength=clean_strength
        )

        print("[INFO] Conversion complete.")

class VoiceConverter:
    def __init__(self, config, model_path, sid = 0):
        self.config = config
        self.device = config.device
        self.hubert_model = None
        self.tgt_sr = None 
        self.net_g = None 
        self.vc = None
        self.cpt = None  
        self.version = None 
        self.n_spk = None  
        self.use_f0 = None  
        self.loaded_model = None
        self.vocoder = "Default"
        self.sample_rate = 16000
        self.sid = sid
        self.get_vc(model_path, sid)

    def convert_audio(
        self, 
        audio_input_path, 
        audio_output_path, 
        index_path, 
        embedder_model, 
        pitch, 
        f0_method, 
        index_rate, 
        volume_envelope, 
        protect, 
        hop_length, 
        filter_radius, 
        export_format, 
        resample_sr = 0, 
        f0_autotune=False, 
        f0_autotune_strength=1,
        split_audio=False,
        clean_audio=False,
        clean_strength=0.5
    ):
        try:
            audio = load_audio(audio_input_path, self.sample_rate)
            audio_max = np.abs(audio).max() / 0.95
            if audio_max > 1: audio /= audio_max

            if not self.hubert_model:
                embedder_model_path = os.path.join("models", embedder_model + ".pt")
                if not os.path.exists(embedder_model_path): raise FileNotFoundError(f"[ERROR] Not found embeddeder: {embedder_model}")

                models = fairseq.load_model(embedder_model_path).to(self.device).eval()
                self.hubert_model = models.half() if self.config.is_half else models.float()

            if split_audio:
                chunks = cut(
                    audio, 
                    self.sample_rate, 
                    db_thresh=-60, 
                    min_interval=500
                )  
                print(f"Split Total: {len(chunks)}")
            else: chunks = [(audio, 0, 0)]

            converted_chunks = [
                (
                    start, 
                    end, 
                    self.vc.pipeline(
                        model=self.hubert_model, 
                        net_g=self.net_g, 
                        sid=self.sid, 
                        audio=waveform, 
                        f0_up_key=pitch, 
                        f0_method=f0_method, 
                        file_index=(
                            index_path.strip().strip('"').strip("\n").strip('"').strip().replace("trained", "added")
                        ), 
                        index_rate=index_rate, 
                        pitch_guidance=self.use_f0, 
                        filter_radius=filter_radius, 
                        volume_envelope=volume_envelope, 
                        version=self.version, 
                        protect=protect, 
                        hop_length=hop_length, 
                        energy_use=self.energy,
                        f0_autotune=f0_autotune, 
                        f0_autotune_strength=f0_autotune_strength
                    )
                ) for waveform, start, end in chunks
            ]

            audio_output = restore(
                converted_chunks, 
                total_len=len(audio), 
                dtype=converted_chunks[0][2].dtype
            ) if split_audio else converted_chunks[0][2]

            if self.tgt_sr != resample_sr and resample_sr > 0: 
                audio_output = librosa.resample(audio_output, orig_sr=self.tgt_sr, target_sr=resample_sr, res_type="soxr_vhq")
                self.tgt_sr = resample_sr

            if clean_audio:
                from modules.noisereduce import reduce_noise
                audio_output = reduce_noise(
                    y=audio_output, 
                    sr=self.tgt_sr, 
                    prop_decrease=clean_strength, 
                    device=self.device
                ) 

            sf.write(audio_output_path, audio_output, self.tgt_sr, format=export_format)
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(f"[ERROR] An error has occurred: {e}")

    def get_vc(self, weight_root, sid):
        if sid == "" or sid == []:
            self.cleanup()
            clear_gpu_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.loaded_model = weight_root
            self.load_model()
            if self.cpt is not None: self.setup()

    def cleanup(self):
        if self.hubert_model is not None:
            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
            clear_gpu_cache()

        del self.net_g, self.cpt
        clear_gpu_cache()
        self.cpt = None

    def load_model(self):
        if os.path.isfile(self.loaded_model): self.cpt = torch.load(self.loaded_model, map_location="cpu")  
        else: self.cpt = None

    def setup(self):
        if self.cpt is not None:
            self.tgt_sr = self.cpt["config"][-1]
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]

            self.use_f0 = self.cpt.get("f0", 1)
            self.version = self.cpt.get("version", "v1")
            self.vocoder = self.cpt.get("vocoder", "Default")
            self.energy = self.cpt.get("energy", False)

            if self.vocoder != "Default": self.config.is_half = False
            self.net_g = Synthesizer(*self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=768 if self.version == "v2" else 256, vocoder=self.vocoder, energy=self.energy)
            del self.net_g.enc_q

            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g.eval().to(self.device)
            self.net_g = (self.net_g.half() if self.config.is_half else self.net_g.float())
            self.n_spk = self.cpt["config"][-3]

            self.vc = Pipeline(self.tgt_sr, self.config)