File size: 4,127 Bytes
0e29aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5f1fa0
0e29aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8c9676
a7eeceb
f8c9676
0e29aed
a7eeceb
 
 
 
 
 
 
0e29aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5f1fa0
 
0e29aed
 
 
 
 
 
 
 
c5f1fa0
 
0e29aed
 
 
 
 
 
9eb296a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import subprocess
import sys

# HACK: install/pin dependencies at import time (Hugging Face Spaces pattern).
# fairseq v0.12.2 needs pip<=24.0 and omegaconf 2.0.x, hence the exact pins.
subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/fairseq.git@v0.12.2"])
import gradio as gr
import os
import torch
import librosa
import soundfile as sf
import tempfile
import spaces  # ZeroGPU requirement

# Import the project-local EchoX inference module
import Echox_copy_stream as Echox

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Global state: model handle plus a flag recording whether it was moved to CUDA.
_MODEL_ON_CUDA = False
inference_model = None

def init_model():
    """Lazily construct the global EchoxAssistant (on CPU) and return it."""
    global inference_model
    if inference_model is not None:
        return inference_model
    inference_model = Echox.EchoxAssistant()
    return inference_model

def process_audio_input(audio):
    """Normalize a Gradio audio value into a wav file path.

    Accepts a file path (returned unchanged), an ``(sr, data)`` tuple from
    Gradio's numpy mode, or a bare sample array (assumed 16 kHz). Array
    inputs are written to a temporary ``.wav`` file whose path is returned.
    Returns ``None`` when the input is ``None`` or conversion fails.
    """
    if audio is None:
        return None

    try:
        # Already a path on disk — nothing to convert.
        if isinstance(audio, str):
            return audio

        if isinstance(audio, tuple):
            # Gradio numpy format: (sample_rate, samples)
            rate, samples = audio
            if samples.ndim > 1:
                samples = samples[:, 0]  # keep only the first channel
        else:
            # Bare sample array; assume a 16 kHz sample rate.
            samples = audio
            rate = 16000

        # Persist to a temp wav that downstream code can read by path.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            sf.write(tmp_file.name, samples, rate)
            return tmp_file.name

    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

@spaces.GPU(duration=180)  # ZeroGPU: hold a GPU for up to 3 minutes per call
def process_audio_text(audio):
    """Stream model responses for one uploaded audio clip.

    Yields ``(accumulated_text, audio_chunk)`` tuples, where ``audio_chunk``
    is ``(sr, samples)`` or ``None``. Each yielded text is the full transcript
    so far (the model reports cumulative text, so it replaces earlier yields).
    """
    global _MODEL_ON_CUDA, inference_model

    # Lazy init keeps module import cheap; the model starts on CPU.
    if inference_model is None:
        init_model()

    # Move the model to the GPU once, on the first GPU-backed request.
    if not _MODEL_ON_CUDA:
        try:
            if hasattr(inference_model, 'model'):
                inference_model.model = inference_model.model.to("cuda")
            if hasattr(inference_model, 'unit_translator'):
                inference_model.unit_translator = inference_model.unit_translator.to("cuda")

            inference_model.device = "cuda"
            _MODEL_ON_CUDA = True
            print("Model moved to GPU")
        except Exception as e:
            # Best-effort: log and fall through; inference may still run on CPU.
            print(f"Error moving model to GPU: {e}")

    # Normalize the Gradio input to a wav path (may create a temp file).
    audio_path = process_audio_input(audio)

    text = ""

    tmp = [{
        "conversations": [
            {
                "from": "user",
                "value": text,
                "audio": audio_path
            }
        ]
    }]

    accumulated_text = ""

    try:
        for text_response, audio_data in inference_model._inference(tmp):
            if text_response:
                accumulated_text = text_response

            if audio_data is not None:
                sr, audio_array = audio_data
                yield accumulated_text, (sr, audio_array)
            else:
                yield accumulated_text, None
    except Exception as e:
        yield f"Error: {str(e)}", None
    finally:
        # Delete the temp wav, but never a caller-supplied file.
        # NOTE: the original compared `audio_path != audio` directly, which
        # raises for ndarray inputs (elementwise comparison in a bool
        # context); check the input type explicitly instead.
        if (audio_path
                and not (isinstance(audio, str) and audio_path == audio)
                and os.path.exists(audio_path)):
            try:
                os.unlink(audio_path)
            except OSError:
                # Was a bare `except:` — now only swallow filesystem errors,
                # not SystemExit/KeyboardInterrupt.
                pass

# Build the model on CPU at import time so the first request only pays for
# the CPU->GPU move, not for construction.
init_model()

if __name__ == "__main__":
    # Bundled demo clips shown as clickable examples in the UI.
    examples = [
        ["./show_case/1.wav"],
        ["./show_case/2.wav"],
    ]

    iface = gr.Interface(
        fn=process_audio_text,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio")
        ],
        # process_audio_text yields (text, audio) tuples, so the output
        # components must be listed in that order: Textbox first, then Audio.
        # (The original order routed the transcript into the Audio widget
        # and the waveform into the Textbox.)
        outputs=[
            gr.Textbox(label="Model output"),
            gr.Audio(label="Streamed Audio", streaming=True, autoplay=True)
        ],
        examples=examples,
        live=False,
        allow_flagging="never"
    )

    # Bind on all interfaces for container deployments; share=True also
    # requests a public gradio.live tunnel.
    iface.launch(server_name="0.0.0.0", server_port=7860, share=True)