import spaces
import gradio as gr
import torch
import torchaudio
import tempfile
import json
import os
from typing import Optional, Tuple

from generation_utils import load_model, process_batch


def load_examples_from_jsonl():
    """
    Load examples from examples/examples.jsonl and convert to format for both ROLE and SINGLE modes
    """
    role_examples = []
    single_examples = []

    jsonl_path = "examples/examples.jsonl"
    if not os.path.exists(jsonl_path):
        print(f"Warning: {jsonl_path} not found")
        return [], []

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)

            # Extract required fields
            text = data.get('text', '')
            base_path = data.get('base_path', 'examples')
            use_normalize = data.get('use_normalize', True)

            # Check if this is a role-based example (has speaker1 and speaker2 audio)
            if 'prompt_audio_speaker1' in data and 'prompt_audio_speaker2' in data:
                # Role mode example
                audio_mode = "Role"
                prompt_audio_1 = os.path.join(base_path, data['prompt_audio_speaker1'])
                prompt_text_1 = data.get('prompt_text_speaker1', '')
                prompt_audio_2 = os.path.join(base_path, data['prompt_audio_speaker2'])
                prompt_text_2 = data.get('prompt_text_speaker2', '')
                example = [text, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize]
                role_examples.append(example)

            # Check if this is a single audio example (has prompt_audio and prompt_text)
            elif 'prompt_audio' in data and 'prompt_text' in data:
                # Single mode example
                audio_mode = "Single"
                prompt_audio = os.path.join(base_path, data['prompt_audio'])
                prompt_text = data.get('prompt_text', '')
                example = [text, audio_mode, prompt_audio, prompt_text, use_normalize]
                single_examples.append(example)

    print(f"Loaded {len(role_examples)} role examples and {len(single_examples)} single examples from {jsonl_path}")
    return role_examples, single_examples

# Load examples from JSONL file
ROLE_EXAMPLES, SINGLE_EXAMPLES = load_examples_from_jsonl()

# Language configuration
LANGUAGES = {
    "English": {
        "title": "MOSS-TTSD🪐 Dialogue Generation",
        "script_input": "### Script Input",
        "text_to_synthesize": "Text to Synthesize",
        "text_placeholder": "Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
        "use_normalize": "Use text normalization",
        "normalize_info": "Recommended to enable, improves handling of numbers, punctuation, etc.",
        "audio_input_mode": "### Audio Input Mode",
        "select_input_mode": "Select input mode",
        "mode_info": "Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2",
        "drag_drop_audio": "Drag and drop audio here - or - click to upload",
        "prompt_text": "Prompt Text",
        "prompt_placeholder": "Format: [S1]Role1 text[S2]Role2 text",
        "role1_audio": "**Role1 Audio**",
        "role1_audio_file": "Role1 Audio File",
        "role1_text": "Role1 Text",
        "role1_placeholder": "Role1 text content",
        "role2_audio": "**Role2 Audio**",
        "role2_audio_file": "Role2 Audio File",
        "role2_text": "Role2 Text",
        "role2_placeholder": "Role2 text content",
        "generate_audio": "Generate Audio",
        "generated_audio": "Generated Audio",
        "status_info": "Status Information",
        "examples": "### Examples",
        "examples_desc": "Click on examples below to auto-fill the form",
        "role_examples": "Role Mode Examples",
        "single_examples": "Single Audio Mode Examples",
        "role_headers": ["Text to Synthesize", "Input Mode", "Role1 Audio File", "Role1 Text", "Role2 Audio File", "Role2 Text", "Use Normalize"],
        "single_headers": ["Text to Synthesize", "Input Mode", "Audio File", "Prompt Text", "Use Normalize"]
    },
    "中文": {
        "title": "MOSS-TTSD🪐 对话语音生成",
        "script_input": "### 文本输入",
        "text_to_synthesize": "要合成的文本",
        "text_placeholder": "要合成的文本,格式:[S1]角色1文本[S2]角色2文本",
        "use_normalize": "使用文本规范化",
        "normalize_info": "建议启用,改善数字、标点符号等的处理",
        "audio_input_mode": "### 音频输入模式",
        "select_input_mode": "选择输入模式",
        "mode_info": "单音频:上传一个包含[S1][S2]文本的音频;角色音频:分别为角色1和角色2上传音频",
        "drag_drop_audio": "拖拽音频文件到此处 - 或 - 点击上传",
        "prompt_text": "提示文本",
        "prompt_placeholder": "格式:[S1]角色1文本[S2]角色2文本",
        "role1_audio": "**角色1音频**",
        "role1_audio_file": "角色1音频文件",
        "role1_text": "角色1文本",
        "role1_placeholder": "角色1文本内容",
        "role2_audio": "**角色2音频**",
        "role2_audio_file": "角色2音频文件",
        "role2_text": "角色2文本",
        "role2_placeholder": "角色2文本内容",
        "generate_audio": "生成音频",
        "generated_audio": "生成的音频",
        "status_info": "状态信息",
        "examples": "### 示例",
        "examples_desc": "点击下方示例自动填充表单",
        "role_examples": "角色模式示例",
        "single_examples": "单音频模式示例",
        "role_headers": ["要合成的文本", "输入模式", "角色1音频文件", "角色1文本", "角色2音频文件", "角色2文本", "使用规范化"],
        "single_headers": ["要合成的文本", "输入模式", "音频文件", "提示文本", "使用规范化"]
    }
}

# Model configuration
SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
MODEL_PATH = os.environ["MODEL_REPO_ID"]
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"
# SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"
MAX_CHANNELS = 8

from huggingface_hub import hf_hub_download

SPT_CHECKPOINT_PATH = hf_hub_download(
    repo_id="fnlp/XY_Tokenizer_TTSD_V0",
    filename="xy_tokenizer.ckpt",
    cache_dir="XY_Tokenizer/weights"
)
print("Checkpoint downloaded to:", SPT_CHECKPOINT_PATH)

# Global variables for caching loaded models
tokenizer = None
model = None
spt = None
device = None


def initialize_model():
    """Initialize model (load only on first call)"""
    global tokenizer, model, spt, device
    if tokenizer is None:
        print("Initializing model...")
        device = "cuda"
        print(f"Using {device}")
        tokenizer, model, spt = load_model(MODEL_PATH, SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH)
        spt = spt.to(device)
        model = model.to(device)
        # Limit max new tokens to avoid timeouts
        model.generation_config.max_new_tokens = 4096
        print("Model initialization completed!")
    return tokenizer, model, spt, device


# Initialize model when starting the application
initialize_model()


def process_single_audio_generation(
    text_input: str,
    audio_mode: str,
    prompt_text_single: str,
    prompt_audio_single: Optional[str] = None,
    prompt_text_1: str = "",
    prompt_audio_1: Optional[str] = None,
    prompt_text_2: str = "",
    prompt_audio_2: Optional[str] = None,
    use_normalize: bool = True
) -> Tuple[Optional[str], str]:
    """
    Process a single audio generation request.

    Args:
        text_input: Text to synthesize
        audio_mode: Input mode, either "Single" or "Role"
        prompt_text_single: Prompt text for single audio mode
        prompt_audio_single: Single prompt audio file path
        prompt_text_1: Role1 prompt text
        prompt_audio_1: Role1 prompt audio file path
        prompt_text_2: Role2 prompt text
        prompt_audio_2: Role2 prompt audio file path
        use_normalize: Whether to use text normalization

    Returns:
        Generated audio file path and status information
    """
    try:
        # Initialize model
        tokenizer, model, spt, device = initialize_model()

        # Build input item
        item = {
            "text": text_input,
        }

        # Handle different audio input modes (mutually exclusive)
        if audio_mode == "Single":
            # Use single audio mode
            item["prompt_audio"] = prompt_audio_single
            item["prompt_text"] = prompt_text_single
        elif audio_mode == "Role" and prompt_audio_1 and prompt_audio_2:
            # Use role audio mode (requires both audio files)
            item["prompt_audio_speaker1"] = prompt_audio_1
            item["prompt_text_speaker1"] = prompt_text_1 if prompt_text_1 else ""
            item["prompt_audio_speaker2"] = prompt_audio_2
            item["prompt_text_speaker2"] = prompt_text_2 if prompt_text_2 else ""
        elif audio_mode == "Role" and prompt_audio_1:
            # Only Role1 audio provided, treat it as a single audio prompt
            print("Only Role 1 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_1
            item["prompt_text"] = prompt_text_1 if prompt_text_1 else ""
        elif audio_mode == "Role" and prompt_audio_2:
            # Only Role2 audio provided, treat it as a single audio prompt
            print("Only Role 2 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_2
            item["prompt_text"] = prompt_text_2 if prompt_text_2 else ""
        else:
            return None, (
                "Error: Please select a mode and provide the corresponding audio files\n"
                "- Single Audio Mode: Provide one audio file and its corresponding text\n"
                "- Role Mode: Provide audio files for Role1 and Role2"
            )

        # Set random seed to ensure reproducible results
        # import accelerate
        # accelerate.utils.set_seed(42)

        # Process batch (single item)
        actual_texts_data, audio_results = process_batch(
            batch_items=[item],
            tokenizer=tokenizer,
            model=model,
            spt=spt,
            device=device,
            system_prompt=SYSTEM_PROMPT,
            start_idx=0,
            use_normalize=use_normalize
        )

        # Check results
        if not audio_results or audio_results[0] is None:
            return None, "Error: Audio generation failed"

        audio_result = audio_results[0]

        # Create temporary output file
        output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name

        # Save audio
        torchaudio.save(output_path, audio_result["audio_data"], audio_result["sample_rate"])

        # Build status information (in English, since this is server-side output)
        status_info = f"""
✅ Generation successful!
📊 Audio Information:
- Sample Rate: {audio_result["sample_rate"]} Hz
- Audio Length: {audio_result["audio_data"].shape[-1] / audio_result["sample_rate"]:.2f} seconds
- Channels: {audio_result["audio_data"].shape[0]}
📝 Text Processing Information:
- Original Text: {actual_texts_data[0]['original_text'][:100]}...
- Final Text: {actual_texts_data[0]['final_text'][:100]}...
- Use Normalize: {actual_texts_data[0]['use_normalize']}
"""

        return output_path, status_info

    except Exception as e:
        import traceback
        error_msg = f"Error: Audio generation failed: {str(e)}\n\nDetails:\n{traceback.format_exc()}"
        return None, error_msg
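
# Illustrative direct call (outside the Gradio UI), e.g. for a quick local sanity
# check; it mirrors what generate_btn.click() wires up below. The prompt text and
# audio path here are hypothetical placeholders:
#
#   wav_path, status = process_single_audio_generation(
#       text_input="[S1]Hello there.[S2]Hi, nice to meet you.",
#       audio_mode="Single",
#       prompt_text_single="[S1]Reference line one.[S2]Reference line two.",
#       prompt_audio_single="examples/prompt.wav",
#       use_normalize=True,
#   )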


# Create Gradio interface
def create_gradio_interface() -> gr.Blocks:
    with gr.Blocks(title="MOSS-TTSD🪐 Dialogue Generation", theme=gr.themes.Soft()) as demo:
        # Language selection at the top
        with gr.Row():
            language_selector = gr.Radio(
                choices=["English", "中文"],
                value="English",
                label="Language / 语言",
                info="Select interface language / 选择界面语言"
            )

        # Title and header (updated based on the selected language)
        title_md = gr.Markdown("# MOSS-TTSD🪐 Dialogue Generation")
        github_md = gr.Markdown("### [Github](https://github.com/OpenMOSS/MOSS-TTSD)")

        with gr.Row():
            # Left input area
            with gr.Column(scale=1):
                script_input_md = gr.Markdown("### Script Input")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
                    lines=6,
                )
                use_normalize_single = gr.Checkbox(
                    label="Use text normalization",
                    value=True,
                    info="Recommended to enable, improves handling of numbers, punctuation, etc."
                )

            # Right audio input area
            with gr.Column(scale=1):
                audio_input_mode_md = gr.Markdown("### Audio Input Mode")

                # Audio input mode selection
                audio_mode = gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label="Select input mode",
                    info="Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2"
                )

                # Single audio mode
                with gr.Group(visible=True) as single_mode_group:
                    prompt_audio_single = gr.File(
                        label="Drag and drop audio here - or - click to upload",
                        file_types=["audio"],
                        type="filepath"
                    )
                    prompt_text_single = gr.Textbox(
                        label="Prompt Text",
                        placeholder="Format: [S1]Role1 text[S2]Role2 text",
                        lines=3,
                    )

                # Role audio mode
                with gr.Group(visible=False) as role_mode_group:
                    with gr.Row():
                        with gr.Column():
                            role1_audio_md = gr.Markdown("**Role1 Audio**")
                            prompt_audio_1 = gr.File(
                                label="Role1 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_1 = gr.Textbox(
                                label="Role1 Text",
                                placeholder="Role1 text content",
                                lines=2
                            )
                        with gr.Column():
                            role2_audio_md = gr.Markdown("**Role2 Audio**")
                            prompt_audio_2 = gr.File(
                                label="Role2 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_2 = gr.Textbox(
                                label="Role2 Text",
                                placeholder="Role2 text content",
                                lines=2
                            )

        # Generate button
        with gr.Row():
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg")

        # Output area
        with gr.Row():
            with gr.Column():
                output_audio = gr.Audio(label="Generated Audio", type="filepath")
                status_info = gr.Textbox(
                    label="Status Information",
                    lines=10,
                    interactive=False
                )

        # Examples area
        with gr.Row():
            with gr.Column():
                examples_md = gr.Markdown("### Examples")
                examples_desc_md = gr.Markdown("Click on examples below to auto-fill the form")

                # Role mode examples
                with gr.Group():
                    role_examples_md = gr.Markdown("**Role Mode Examples**")
                    role_examples = gr.Examples(
                        examples=ROLE_EXAMPLES,
                        inputs=[text_input, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize_single],
                    )

                # Single audio mode examples
                with gr.Group():
                    single_examples_md = gr.Markdown("**Single Audio Mode Examples**")
                    single_examples = gr.Examples(
                        examples=SINGLE_EXAMPLES,
                        inputs=[text_input, audio_mode, prompt_audio_single, prompt_text_single, use_normalize_single],
                    )

        # Event handlers
        # Language change event
        def update_language(lang):
            """Update interface language"""
            texts = LANGUAGES[lang]
            # Update demo title
            demo.title = texts["title"]
            return (
                gr.Markdown(f"# {texts['title']}"),  # title_md
                texts["script_input"],  # script_input_md
                gr.Textbox(
                    label=texts["text_to_synthesize"],
                    placeholder=texts["text_placeholder"],
                    lines=6,
                ),  # text_input
                gr.Checkbox(
                    label=texts["use_normalize"],
                    value=True,
                    info=texts["normalize_info"]
                ),  # use_normalize_single
                texts["audio_input_mode"],  # audio_input_mode_md
                gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label=texts["select_input_mode"],
                    info=texts["mode_info"]
                ),  # audio_mode
                gr.File(
                    label=texts["drag_drop_audio"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_single
                gr.Textbox(
                    label=texts["prompt_text"],
                    placeholder=texts["prompt_placeholder"],
                    lines=3,
                ),  # prompt_text_single
                texts["role1_audio"],  # role1_audio_md
                gr.File(
                    label=texts["role1_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_1
                gr.Textbox(
                    label=texts["role1_text"],
                    placeholder=texts["role1_placeholder"],
                    lines=2
                ),  # prompt_text_1
                texts["role2_audio"],  # role2_audio_md
                gr.File(
                    label=texts["role2_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_2
                gr.Textbox(
                    label=texts["role2_text"],
                    placeholder=texts["role2_placeholder"],
                    lines=2
                ),  # prompt_text_2
                gr.Button(texts["generate_audio"], variant="primary", size="lg"),  # generate_btn
                gr.Audio(label=texts["generated_audio"], type="filepath"),  # output_audio
                gr.Textbox(
                    label=texts["status_info"],
                    lines=10,
                    interactive=False
                ),  # status_info
                texts["examples"],  # examples_md
                texts["examples_desc"],  # examples_desc_md
                texts["role_examples"],  # role_examples_md
                texts["single_examples"],  # single_examples_md
                gr.Dataset(headers=texts["role_headers"]),  # role_examples.dataset
                gr.Dataset(headers=texts["single_headers"]),  # single_examples.dataset
            )

        language_selector.change(
            fn=update_language,
            inputs=[language_selector],
            outputs=[
                title_md, script_input_md, text_input, use_normalize_single,
                audio_input_mode_md, audio_mode, prompt_audio_single, prompt_text_single,
                role1_audio_md, prompt_audio_1, prompt_text_1,
                role2_audio_md, prompt_audio_2, prompt_text_2,
                generate_btn, output_audio, status_info,
                examples_md, examples_desc_md, role_examples_md, single_examples_md,
                role_examples.dataset, single_examples.dataset
            ]
        )

        # Audio mode toggle event
        def toggle_audio_mode(mode):
            if mode == "Single":
                return gr.Group(visible=True), gr.Group(visible=False)
            else:
                return gr.Group(visible=False), gr.Group(visible=True)

        audio_mode.change(
            fn=toggle_audio_mode,
            inputs=[audio_mode],
            outputs=[single_mode_group, role_mode_group]
        )

        # Audio generation event
        generate_btn.click(
            fn=process_single_audio_generation,
            inputs=[
                text_input,
                audio_mode,
                prompt_text_single,
                prompt_audio_single,
                prompt_text_1,
                prompt_audio_1,
                prompt_text_2,
                prompt_audio_2,
                use_normalize_single
            ],
            outputs=[output_audio, status_info],
            show_progress=True
        )

    return demo


# Main function
if __name__ == "__main__":
    demo = create_gradio_interface()
    # Launch interface
    demo.launch()