import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/fairseq.git@v0.12.2"])
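
# The pins above are load-bearing: fairseq v0.12.2 expects the old omegaconf
# 2.0.x API, so omegaconf is held at 2.0.6, and pip itself is pinned first so
# fairseq's legacy setup.py-based install still builds (an assumption based on
# common fairseq packaging failures; revisit if the environment changes).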

import gradio as gr
import os
import torch
import librosa
import soundfile as sf
import tempfile
import spaces  # ZeroGPU requirement

# Import the Echox module
import Echox_copy_stream as Echox

# Silence the Hugging Face tokenizers fork-safety warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Global state
_MODEL_ON_CUDA = False
inference_model = None


def init_model():
    """Initialize the model on the CPU."""
    global inference_model
    if inference_model is None:
        inference_model = Echox.EchoxAssistant()
    return inference_model
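
# Note: EchoxAssistant is assumed to expose `.model`, `.unit_translator`,
# `.device`, and a streaming `._inference(...)` generator, since those are the
# attributes used below; this file does not define them.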

def process_audio_input(audio):
    """Normalize a Gradio audio input to a WAV file path."""
    if audio is None:
        return None
    try:
        # Already a file path: return it unchanged
        if isinstance(audio, str):
            return audio

        # A (sample_rate, data) numpy tuple
        if isinstance(audio, tuple):
            sr, y = audio
            if y.ndim > 1:
                y = y[:, 0]  # keep only the first channel
        else:
            # A bare array; assume the default sample rate
            y = audio
            sr = 16000

        # Write to a temporary WAV file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, y, sr)
            return tmp_file.name
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None
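
# On ZeroGPU Spaces, CUDA is only usable inside functions wrapped with
# @spaces.GPU; the decorator also supports generator functions, so the
# streamed yields below keep working while a GPU is attached to the call.
@spaces.GPU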
def process_audio_text(text, audio):
    global _MODEL_ON_CUDA, inference_model

    if inference_model is None:
        init_model()

    # Move the model to the GPU once, on the first request
    if not _MODEL_ON_CUDA:
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")
            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            print(f"Error moving model to GPU: {e}")
    audio_path = process_audio_input(audio)

    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path
            }
        ]
    }]

    accumulated_text = ""
    try:
        # Stream (audio_chunk, text_so_far) pairs back to the UI
        for text_response, audio_data in inference_model._inference(tmp):
            if text_response:
                accumulated_text = text_response
            if audio_data is not None:
                sr, audio_array = audio_data
                yield (sr, audio_array), accumulated_text
            else:
                yield None, accumulated_text
    except Exception as e:
        yield None, f"Error: {str(e)}"
    finally:
        # Clean up only the temp file we created, never a user-supplied path
        if audio_path and audio_path != audio and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass
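
# Warm-load the model on the CPU at import time, so Space startup (which runs
# without a GPU on ZeroGPU) pays the weight-loading cost once, up front.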
init_model()


if __name__ == "__main__":
    examples = [
        ["Recognize what the voice said and respond to it.", "./show_case/1.wav"],
        ["", "./show_case/2.wav"],
    ]
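
    # The ./show_case WAV files are assumed to be bundled with the Space repo.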
    iface = gr.Interface(
        fn=process_audio_text,
        inputs=[
            gr.Textbox(label="Input Text", value=examples[0][0]),
            gr.Audio(type="filepath", label="Upload Audio", value=examples[0][1])
        ],
        outputs=[
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True),
            gr.Textbox(label="Model output")
        ],
        examples=examples,
        live=False,
        allow_flagging="never"
    )

    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)
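
# A minimal client-side sketch for exercising the endpoint (assumptions: the
# gradio_client package, the default "/predict" endpoint name, and a local
# "sample.wav"; streamed outputs are returned as their final values):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860/")
#   audio_out, text_out = client.predict(
#       "Recognize what the voice said and respond to it.",
#       handle_file("sample.wav"),
#       api_name="/predict",
#   )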