E2-F5-TTS

Running

App Files Files Community

kevinwang676 committed on May 13

Commit

a35e94c

verified ·

1 Parent(s): d4069e8

Update speech_edit.py

Browse files

Files changed (1) hide show

speech_edit.py +255 -181

speech_edit.py CHANGED Viewed

@@ -1,183 +1,257 @@
-import os
 import torch
-import torch.nn.functional as F
-import torchaudio
-from vocos import Vocos
-from model import CFM, UNetT, DiT, MMDiT
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
-device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
-# --------------------- Dataset Settings -------------------- #
-target_sample_rate = 24000
-n_mel_channels = 100
-hop_length = 256
-target_rms = 0.1
-tokenizer = "pinyin"
-dataset_name = "Emilia_ZH_EN"
-# ---------------------- infer setting ---------------------- #
-seed = None  # int | None
-exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
-ckpt_step = 1200000
-nfe_step = 32  # 16, 32
-cfg_strength = 2.
-ode_method = 'euler'  # euler | midpoint
-sway_sampling_coef = -1.
-speed = 1.
-if exp_name == "F5TTS_Base":
-    model_cls = DiT
-    model_cfg = dict(dim = 1024, depth = 22, heads = 16, ff_mult = 2, text_dim = 512, conv_layers = 4)
-elif exp_name == "E2TTS_Base":
-    model_cls = UNetT
-    model_cfg = dict(dim = 1024, depth = 24, heads = 16, ff_mult = 4)
-ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.safetensors"
-output_dir = "tests"
-# [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char level alignment]
-# pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
-# [write the origin_text into a file, e.g. tests/test_edit.txt]
-# ctc-forced-aligner --audio_path "tests/ref_audio/test_en_1_ref_short.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
-# [result will be saved at same path of audio file]
-# [--language "zho" for Chinese, "eng" for English]
-# [if local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]
-audio_to_edit = "tests/ref_audio/test_en_1_ref_short.wav"
-origin_text = "Some call me nature, others call me mother nature."
-target_text = "Some call me optimist, others call me realist."
-parts_to_edit = [[1.42, 2.44], [4.04, 4.9], ]  # stard_ends of "nature" & "mother nature", in seconds
-fix_duration = [1.2, 1, ]  # fix duration for "optimist" & "realist", in seconds
-# audio_to_edit = "tests/ref_audio/test_zh_1_ref_short.wav"
-# origin_text = "对，这就是我，万人敬仰的太乙真人。"
-# target_text = "对，那就是你，万人敬仰的太白金星。"
-# parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26], ]
-# fix_duration = None  # use origin text duration
-# -------------------------------------------------#
-use_ema = True
-if not os.path.exists(output_dir):
-    os.makedirs(output_dir)
-# Vocoder model
-local = False
-if local:
-    vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
-    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
-    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", weights_only=True, map_location=device)
-    vocos.load_state_dict(state_dict)
-    vocos.eval()
-else:
-    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
-# Tokenizer
-vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
-# Model
-model = CFM(
-    transformer = model_cls(
-        **model_cfg,
-        text_num_embeds = vocab_size,
-        mel_dim = n_mel_channels
-    ),
-    mel_spec_kwargs = dict(
-        target_sample_rate = target_sample_rate,
-        n_mel_channels = n_mel_channels,
-        hop_length = hop_length,
-    ),
-    odeint_kwargs = dict(
-        method = ode_method,
-    ),
-    vocab_char_map = vocab_char_map,
-).to(device)
-model = load_checkpoint(model, ckpt_path, device, use_ema = use_ema)
-# Audio
-audio, sr = torchaudio.load(audio_to_edit)
-if audio.shape[0] > 1:
-    audio = torch.mean(audio, dim=0, keepdim=True)
-rms = torch.sqrt(torch.mean(torch.square(audio)))
-if rms < target_rms:
-    audio = audio * target_rms / rms
-if sr != target_sample_rate:
-    resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-    audio = resampler(audio)
-offset = 0
-audio_ = torch.zeros(1, 0)
-edit_mask = torch.zeros(1, 0, dtype=torch.bool)
-for part in parts_to_edit:
-    start, end = part
-    part_dur = end - start if fix_duration is None else fix_duration.pop(0)
-    part_dur = part_dur * target_sample_rate
-    start = start * target_sample_rate
-    audio_ = torch.cat((audio_, audio[:, round(offset):round(start)], torch.zeros(1, round(part_dur))), dim = -1)
-    edit_mask = torch.cat((edit_mask,
-                           torch.ones(1, round((start - offset) / hop_length), dtype = torch.bool),
-                           torch.zeros(1, round(part_dur / hop_length), dtype = torch.bool)
-                           ), dim = -1)
-    offset = end * target_sample_rate
-# audio = torch.cat((audio_, audio[:, round(offset):]), dim = -1)
-edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value = True)
-audio = audio.to(device)
-edit_mask = edit_mask.to(device)
-# Text
-text_list = [target_text]
-if tokenizer == "pinyin":
-    final_text_list = convert_char_to_pinyin(text_list)
-else:
-    final_text_list = [text_list]
-print(f"text  : {text_list}")
-print(f"pinyin: {final_text_list}")
-# Duration
-ref_audio_len = 0
-duration = audio.shape[-1] // hop_length
-# Inference
-with torch.inference_mode():
-    generated, trajectory = model.sample(
-        cond = audio,
-        text = final_text_list,
-        duration = duration,
-        steps = nfe_step,
-        cfg_strength = cfg_strength,
-        sway_sampling_coef = sway_sampling_coef,
-        seed = seed,
-        edit_mask = edit_mask,
     )
-print(f"Generated mel: {generated.shape}")
-# Final result
-generated = generated.to(torch.float32)
-generated = generated[:, ref_audio_len:, :]
-generated_mel_spec = generated.permute(0, 2, 1)
-generated_wave = vocos.decode(generated_mel_spec.cpu())
-if rms < target_rms:
-    generated_wave = generated_wave * rms / target_rms
-save_spectrogram(generated_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
-torchaudio.save(f"{output_dir}/speech_edit_out.wav", generated_wave, target_sample_rate)
-print(f"Generated wav: {generated_wave.shape}")

+#!/usr/bin/env python3
+# coding: utf-8
+"""
+CosyVoice gRPC back‑end – updated to mirror the FastAPI logic
+*   loads CosyVoice2 with TRT / FP16 (no fallback: start-up fails with TypeError if loading fails)
+*   inference_zero_shot  ➜  adds   stream=False   +   speed
+*   inference_instruct   ➜  keeps original “speaker‑ID” path
+*   inference_instruct2  ➜  new:  prompt‑audio + speed (no speaker‑ID)
+"""
+import io, os, tempfile, requests, soundfile as sf, torchaudio
+import sys
+from concurrent import futures
+import argparse
+import logging
+import grpc
+import numpy as np
 import torch
+import cosyvoice_pb2
+import cosyvoice_pb2_grpc
+# ────────────────────────────────────────────────────────────────────────────────
+# set‑up
+# ────────────────────────────────────────────────────────────────────────────────
+# Raise the "matplotlib" logger to WARNING; everything else logs at INFO
+# with a timestamp / level / message layout.
+logging.getLogger("matplotlib").setLevel(logging.WARNING)
+logging.basicConfig(level=logging.INFO,
+                    format="%(asctime)s %(levelname)s %(message)s")
+# Directory containing this file; the extra import paths are built from it.
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Extend the import path three levels up (presumably the repository root -
+# confirm against the checkout layout) and to its third_party/Matcha-TTS
+# directory. This must happen before the cosyvoice import below.
+sys.path.extend([
+    f"{ROOT_DIR}/../../..",
+    f"{ROOT_DIR}/../../../third_party/Matcha-TTS",
+])
+# Imported late on purpose (hence the E402 suppression): it relies on the
+# sys.path entries added just above.
+from cosyvoice.cli.cosyvoice import CosyVoice2          # noqa: E402
+# ────────────────────────────────────────────────────────────────────────────────
+# helpers
+# ────────────────────────────────────────────────────────────────────────────────
+def _bytes_to_tensor(wav_bytes: bytes) -> torch.Tensor:
+    """
+    Convert int16 little‑endian PCM bytes → torch.FloatTensor in range [‑1,1]
+    """
+    speech = torch.from_numpy(
+        np.frombuffer(wav_bytes, dtype=np.int16)
+    ).unsqueeze(0).float() / (2 ** 15)
+    return speech                                                      # [1, T]
+def _yield_audio(model_output):
+    """
+    Generator that converts CosyVoice output → protobuf Response messages.
+    """
+    for seg in model_output:
+        pcm16 = (seg["tts_speech"].numpy() * (2 ** 15)).astype(np.int16)
+        resp = cosyvoice_pb2.Response(tts_audio=pcm16.tobytes())
+        yield resp
+# ────────────────────────────────────────────────────────────────────────────────
+# gRPC service
+# ────────────────────────────────────────────────────────────────────────────────
+class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
+    def __init__(self, args):
+        # try CosyVoice2 first (preferred runtime: TRT / FP16)
+        try:
+            self.cosyvoice = CosyVoice2(args.model_dir,
+                                        load_jit=False,
+                                        load_trt=True,
+                                        fp16=True)
+            logging.info("Loaded CosyVoice2 (TRT / FP16).")
+        except Exception:
+            raise TypeError("No valid CosyVoice model found!")
+    # ---------------------------------------------------------------------
+    # single bi‑di streaming RPC
+    # ---------------------------------------------------------------------
+    def Inference(self, request, context):
+        """Route to the correct model call based on the oneof field present."""
+        # 1. Supervised fine‑tuning
+        if request.HasField("sft_request"):
+            logging.info("Received SFT inference request")
+            mo = self.cosyvoice.inference_sft(
+                request.sft_request.tts_text,
+                request.sft_request.spk_id
+            )
+            yield from _yield_audio(mo)
+            return
+        # 2. Zero‑shot speaker cloning  (bytes OR S3 URL)
+        if request.HasField("zero_shot_request"):
+            logging.info("Received zero‑shot inference request")
+            zr = request.zero_shot_request
+            tmp_path = None  # initialise so we can delete later
+            try:
+                # ───── determine payload type ──────────────────────────────────────
+                if zr.prompt_audio.startswith(b'http'):
+                    # —— remote URL —— ---------------------------------------------
+                    url = zr.prompt_audio.decode('utf‑8')
+                    logging.info("Downloading prompt audio from %s", url)
+                    resp = requests.get(url, timeout=10)
+                    resp.raise_for_status()
+                    # save to a temp file
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+                        f.write(resp.content)
+                        tmp_path = f.name
+                    # load, mono‑ise, resample → tensor [1, T]
+                    wav, sr = sf.read(tmp_path, dtype="float32")
+                    if wav.ndim > 1:
+                        wav = wav.mean(axis=1)
+                    if sr != 16_000:
+                        wav = torchaudio.functional.resample(
+                            torch.from_numpy(wav).unsqueeze(0), sr, 16_000
+                        )[0].numpy()
+                    prompt = torch.from_numpy(wav).unsqueeze(0)
+                else:
+                    # —— legacy raw PCM bytes —— -----------------------------------
+                    prompt = _bytes_to_tensor(zr.prompt_audio)
+                # ───── call the model ──────────────────────────────────────────────
+                speed = getattr(zr, "speed", 1.0)
+                mo = self.cosyvoice.inference_zero_shot(
+                    zr.tts_text,
+                    zr.prompt_text,
+                    prompt,
+                    stream=False,
+                    speed=speed,
+                )
+            finally:
+                # clean up any temporary file we created
+                if tmp_path and os.path.exists(tmp_path):
+                    try:
+                        os.remove(tmp_path)
+                    except Exception as e:
+                        logging.warning("Could not remove temp file %s: %s", tmp_path, e)
+            yield from _yield_audio(mo)
+            return
+        # 3. Cross‑lingual
+        if request.HasField("cross_lingual_request"):
+            logging.info("Received cross‑lingual inference request")
+            cr = request.cross_lingual_request
+            prompt = _bytes_to_tensor(cr.prompt_audio)
+            mo = self.cosyvoice.inference_cross_lingual(
+                cr.tts_text,
+                prompt
+            )
+            yield from _yield_audio(mo)
+            return
+        # 4. Instruction‑TTS (two flavours)
+        if request.HasField("instruct_request"):
+            ir = request.instruct_request
+            # ──────────────────────────────────────────────────────────────────
+            # 4‑a) instruct‑2  (has prompt_audio  →  bytes OR S3 URL)
+            # ──────────────────────────────────────────────────────────────────
+            if ir.HasField("prompt_audio"):
+                logging.info("Received instruct‑2 inference request")
+                tmp_path = None
+                try:
+                    if ir.prompt_audio.startswith(b'http'):
+                        # treat as URL, download then load
+                        url = ir.prompt_audio.decode('utf‑8')
+                        logging.info("Downloading prompt audio from %s", url)
+                        resp = requests.get(url, timeout=10)
+                        resp.raise_for_status()
+                        with tempfile.NamedTemporaryFile(delete=False,
+                                                         suffix=".wav") as f:
+                            f.write(resp.content)
+                            tmp_path = f.name
+                        wav, sr = sf.read(tmp_path, dtype='float32')
+                        if wav.ndim > 1:
+                            wav = wav.mean(axis=1)
+                        if sr != 16_000:
+                            wav = torchaudio.functional.resample(
+                                torch.from_numpy(wav).unsqueeze(0), sr, 16_000
+                            )[0].numpy()
+                        prompt = torch.from_numpy(wav).unsqueeze(0)
+                    else:
+                        # legacy raw‑bytes payload
+                        prompt = _bytes_to_tensor(ir.prompt_audio)
+                    speed = getattr(ir, "speed", 1.0)
+                    mo = self.cosyvoice.inference_instruct2(
+                        ir.tts_text,
+                        ir.instruct_text,
+                        prompt,
+                        stream=False,
+                        speed=speed
+                    )
+                finally:
+                    if tmp_path and os.path.exists(tmp_path):
+                        try:
+                            os.remove(tmp_path)
+                        except Exception as e:
+                            logging.warning("Could not remove temp file %s: %s",
+                                            tmp_path, e)
+            # ──────────────────────────────────────────────────────────────────
+            # 4‑b) classic instruct (speaker‑ID, no prompt audio)
+            # ──────────────────────────────────────────────────────────────────
+            else:
+                logging.info("Received instruct inference request")
+                mo = self.cosyvoice.inference_instruct(
+                    ir.tts_text,
+                    ir.spk_id,
+                    ir.instruct_text
+                )
+            yield from _yield_audio(mo)
+            return
+        # unknown request type
+        context.abort(grpc.StatusCode.INVALID_ARGUMENT,
+                      "Unsupported request type in oneof field.")
+# ────────────────────────────────────────────────────────────────────────────────
+# entry‑point
+# ────────────────────────────────────────────────────────────────────────────────
+def serve(args):
+    server = grpc.server(
+        futures.ThreadPoolExecutor(max_workers=args.max_conc),
+        maximum_concurrent_rpcs=args.max_conc
+    )
+    cosyvoice_pb2_grpc.add_CosyVoiceServicer_to_server(
+        CosyVoiceServiceImpl(args), server
     )
+    server.add_insecure_port(f"0.0.0.0:{args.port}")
+    server.start()
+    logging.info("CosyVoice gRPC server listening on 0.0.0.0:%d", args.port)
+    server.wait_for_termination()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--max_conc", type=int, default=4,
+                        help="maximum concurrent requests / threads")
+    parser.add_argument("--model_dir", type=str,
+                        default="pretrained_models/CosyVoice2-0.5B",
+                        help="local path or ModelScope repo id")
+    serve(parser.parse_args())