import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/fairseq.git@v0.12.2"])
import gradio as gr
import os
import torch
import librosa
import soundfile as sf
import tempfile
import spaces # ZeroGPU requirement
# Import the EchoX inference module
import Echox_copy_stream as Echox
os.environ["TOKENIZERS_PARALLELISM"] = "false"
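# Silence the Hugging Face tokenizers fork-parallelism warning in worker processes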
# Global state
_MODEL_ON_CUDA = False
inference_model = None
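# ZeroGPU only grants a CUDA device inside functions decorated with
# @spaces.GPU, so the model is built on CPU at import time and moved to
# the GPU lazily on the first request (see process_audio_text below).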
def init_model():
    """Initialize the model on CPU."""
    global inference_model
    if inference_model is None:
        inference_model = Echox.EchoxAssistant()
    return inference_model
def process_audio_input(audio):
    """Normalize the audio input to a WAV file path."""
    if audio is None:
        return None
    try:
        # If it is already a file path, return it directly
        if isinstance(audio, str):
            return audio
        # If it is a (sample_rate, data) tuple from gr.Audio
        if isinstance(audio, tuple):
            sr, y = audio
            if y.ndim > 1:
                y = y[:, 0]  # keep only the first channel
        else:
            # Bare array without a sample rate
            y = audio
            sr = 16000  # assume a default sample rate
        # Write the data to a temporary WAV file
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, y, sr)
            return tmp_file.name
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None
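# A minimal sanity check for process_audio_input (a sketch, assuming numpy
# is available; gr.Audio with type="filepath" already passes a path, so the
# tuple branch mainly matters for type="numpy"):
#
#   import numpy as np
#   tone = np.sin(np.linspace(0, 2 * np.pi * 440, 16000))  # ~1 s test tone
#   path = process_audio_input((16000, tone))
#   assert path is not None and path.endswith(".wav")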
@spaces.GPU(duration=180)  # ZeroGPU: request a GPU with a 3-minute timeout
def process_audio_text(audio):
    """Main handler: run inference and stream (text, audio) results."""
    global _MODEL_ON_CUDA, inference_model
    # Initialize the model if it has not been initialized yet
    if inference_model is None:
        init_model()
    # Move the model to the GPU on first use
    if not _MODEL_ON_CUDA:
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")
            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            print(f"Error moving model to GPU: {e}")
    # Normalize the audio input to a file path
    audio_path = process_audio_input(audio)
    text = ""
    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path
            }
        ]
    }]
    accumulated_text = ""
    try:
        for text_response, audio_data in inference_model._inference(tmp):
            if text_response:
                accumulated_text = text_response
            if audio_data is not None:
                sr, audio_array = audio_data
                yield accumulated_text, (sr, audio_array)
            else:
                yield accumulated_text, None
    except Exception as e:
        yield f"Error: {str(e)}", None
    finally:
        # Clean up the temporary file, if one was created
        if audio_path and audio_path != audio and os.path.exists(audio_path):
            try:
                os.unlink(audio_path)
            except OSError:
                pass
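# Note on the payload built in process_audio_text: it mirrors what
# Echox_copy_stream.EchoxAssistant._inference appears to expect, i.e. a list
# of samples, each holding a "conversations" list of {"from", "value",
# "audio"} turns; the empty "value" prompts the model with audio alone.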
# Initialize the model on CPU at import time
init_model()
if __name__ == "__main__":
    examples = [
        ["./show_case/1.wav"],
        ["./show_case/2.wav"],
    ]
    iface = gr.Interface(
        fn=process_audio_text,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio")
        ],
        outputs=[
            # Order must match the generator's yields: (text, audio)
            gr.Textbox(label="Model output"),
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True)
        ],
        examples=examples,
        live=False,
        allow_flagging="never"
    )
    iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
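# A hypothetical smoke test from another process (names are assumptions;
# gr.Interface exposes its fn as "/predict" by default, and newer
# gradio_client versions need handle_file() for audio inputs):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://localhost:7860")
#   text, audio = client.predict(handle_file("./show_case/1.wav"), api_name="/predict")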