# Speech-to-text Gradio app: live microphone and audio-file transcription via SpeechRecognition (Google backend).
import gradio as gr
import speech_recognition as sr
import numpy as np
from pydub import AudioSegment
import io
import tempfile
import os
import wave
import threading
import queue
import time

# Initial settings for the shared speech recognizer
recognizer = sr.Recognizer()
recognizer.energy_threshold = 400  # slightly higher, for real-time use
recognizer.dynamic_energy_threshold = True
recognizer.pause_threshold = 0.5  # more sensitive to pauses

# Queue used to accumulate real-time audio chunks
audio_queue = queue.Queue(maxsize=10)  # kept small for real-time use
PROCESS_INTERVAL = 3.0  # process every 3 seconds
MIN_DURATION = 1.5  # at least 1.5 s of audio is required before processing

# Global transcript shared between the worker thread and the UI callbacks;
# every read and write goes through transcript_lock.
current_transcript = ""
transcript_lock = threading.Lock()

# Background worker that drains the audio queue and transcribes it
def background_processor():
    """Accumulate queued microphone chunks and transcribe them periodically.

    Runs forever and is meant to be the target of a daemon thread. Chunks are
    pulled from ``audio_queue`` as ``(sample_rate, samples)`` tuples. Once at
    least ``PROCESS_INTERVAL`` seconds have passed since the last run and at
    least ``MIN_DURATION`` seconds of audio are buffered, the buffered chunks
    are concatenated, passed to ``process_audio_chunk`` and the recognized
    text is appended to the module-level ``current_transcript`` under
    ``transcript_lock``.

    All chunks of one batch are assumed to share the sample rate of the first
    chunk; no resampling is done.
    """
    # Without this declaration the assignments below make the name local and
    # the first read raises UnboundLocalError, so nothing is ever appended.
    global current_transcript

    # Sentinels that signal a connection failure; they must never end up in
    # the transcript. Both historical spellings are accepted.
    error_markers = ("[خطا اتصال]", "[خطا در اتصال]")

    accumulated = []
    last_process = time.time()

    while True:
        try:
            # The blocking get doubles as the poll interval.
            try:
                audio_tuple = audio_queue.get(timeout=0.2)
            except queue.Empty:
                audio_tuple = None

            if audio_tuple and audio_tuple[1] is not None:
                rate, data = audio_tuple
                data = np.asarray(data)
                if rate and len(data):
                    # Only float audio lives in [-1, 1]. Gradio's numpy audio
                    # is documented as 16-bit integer PCM; clipping that to
                    # [-1, 1] would destroy the signal.
                    if np.issubdtype(data.dtype, np.floating):
                        data = np.clip(data, -1.0, 1.0)
                    accumulated.append((rate, data))
                    print(f"Accumulated chunk: {len(data)/rate:.2f}s")

            # Process only if the interval has passed and enough audio exists.
            now = time.time()
            total_dur = sum(len(d) / r for r, d in accumulated)
            if now - last_process < PROCESS_INTERVAL or total_dur < MIN_DURATION:
                continue

            # Detach the batch before processing so that a failure below
            # cannot leave a poisoned buffer that is retried forever.
            batch, accumulated = accumulated, []
            last_process = now

            merged_rate = batch[0][0]
            merged_data = np.concatenate([d for _, d in batch])

            text = process_audio_chunk((merged_rate, merged_data))
            if text and text not in error_markers:
                with transcript_lock:
                    if current_transcript:
                        current_transcript += " " + text
                    else:
                        current_transcript = text
                print(f"✅ Processed: {text[:50]}...")  # short log

        except Exception as e:
            # Top-level boundary of the worker thread: log and keep running.
            print(f"Background error: {e}")
            time.sleep(1)

# Start the background worker as soon as this module is executed.
# daemon=True: the thread is a daemon, so it does not keep the process alive.
processor_thread = threading.Thread(target=background_processor, daemon=True)
processor_thread.start()

def numpy_to_audio_segment(audio_data, sample_rate):
    """Convert a numpy sample array into a pydub ``AudioSegment``.

    Args:
        audio_data: 1-D array of mono samples or 2-D array shaped
            ``(samples, channels)``. Float input is treated as being in
            ``[-1.0, 1.0]``; integer input is treated as PCM.
        sample_rate: Sampling rate in Hz written to the WAV header.

    Returns:
        The ``AudioSegment`` built from an in-memory 16-bit WAV, or ``None``
        when ``audio_data`` is ``None`` or empty.
    """
    if audio_data is None:
        return None
    samples = np.asarray(audio_data)
    if samples.size == 0:
        return None

    # Bring every supported dtype to 16-bit PCM, matching setsampwidth(2).
    if np.issubdtype(samples.dtype, np.floating):
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    elif samples.dtype == np.int32:
        # Keep the 16 most significant bits of 32-bit PCM.
        samples = (samples >> 16).astype(np.int16)
    elif samples.dtype != np.int16:
        samples = np.clip(samples, -32768, 32767).astype(np.int16)

    # A 2-D array carries one column per channel. Declaring it as mono would
    # interleave the channels into one garbled stream of twice the length.
    channels = samples.shape[1] if samples.ndim > 1 else 1

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        # Row-major bytes of (samples, channels) are already frame-interleaved.
        wav_file.writeframes(np.ascontiguousarray(samples).tobytes())
    buffer.seek(0)
    return AudioSegment.from_wav(buffer)

def process_audio_chunk(audio_tuple):
    """Transcribe one ``(sample_rate, samples)`` chunk with Google recognition.

    Persian (``fa-IR``) is tried first; if nothing intelligible is found the
    chunk is retried as English (``en-US``).

    Args:
        audio_tuple: ``(sample_rate, numpy_samples)`` or ``None``.

    Returns:
        The recognized text with surrounding whitespace removed, ``""`` when
        the chunk is missing, shorter than ``MIN_DURATION`` seconds,
        unintelligible or processing failed, or the sentinel ``"[خطا اتصال]"``
        when the recognition request itself failed.
    """
    tmp_path = None
    try:
        if audio_tuple is None:
            return ""
        sample_rate, audio_data = audio_tuple
        if not sample_rate or audio_data is None:
            return ""
        if len(audio_data) / sample_rate < MIN_DURATION:
            return ""

        audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
        # len() of an AudioSegment is its duration in milliseconds.
        if audio_segment is None or len(audio_segment) < MIN_DURATION * 1000:
            return ""

        # Reserve a path, close the handle, then export: writing to a file
        # that is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        audio_segment.export(tmp_path, format="wav")

        with sr.AudioFile(tmp_path) as source:
            # NOTE(review): this presumably consumes the first 0.3 s of the
            # source before record() starts, and it adjusts the shared
            # recognizer's energy threshold - confirm both are intended.
            recognizer.adjust_for_ambient_noise(source, duration=0.3)
            audio = recognizer.record(source)

        # Google recognition. With an API key, additionally pass
        # key="YOUR_GOOGLE_API_KEY" to recognize_google.
        text = ""
        try:
            text = recognizer.recognize_google(audio, language='fa-IR')  # Persian first
        except sr.UnknownValueError:
            try:
                text = recognizer.recognize_google(audio, language='en-US')
            except sr.UnknownValueError:
                print("No speech in chunk")
                text = ""
        except sr.RequestError as e:
            print(f"Google API error: {e}")
            text = "[خطا اتصال]"

        return text.strip()
    except Exception as e:
        # Best-effort boundary: a failed chunk must not kill the caller.
        print(f"Process error: {e}")
        return ""
    finally:
        # Always remove the temp WAV, including on the error paths above.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

def handle_realtime_audio(audio):
    """Enqueue an incoming microphone chunk without blocking.

    A ``None`` chunk is ignored. When the queue is already full the chunk is
    dropped and a message is printed. The bound output always receives a
    no-op ``gr.update()``.
    """
    if audio is not None:
        try:
            audio_queue.put_nowait(audio)
        except queue.Full:
            print("Queue full, skip")
    return gr.update()

def get_current_transcript():
    """Return the transcript accumulated so far (polled by the UI)."""
    with transcript_lock:
        snapshot = current_transcript
    return snapshot

def clear_transcript():
    """Reset the transcript, discard queued audio and return ``""``."""
    global current_transcript
    with transcript_lock:
        current_transcript = ""
    # Drain whatever chunks are still waiting to be processed.
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            break
    return ""

# File transcription (unchanged)
def _recognize_fa_then_en(audio_data):
    """Recognize as Persian, fall back to English; ``""`` if unintelligible."""
    try:
        return recognizer.recognize_google(audio_data, language='fa-IR')
    except sr.UnknownValueError:
        pass
    try:
        return recognizer.recognize_google(audio_data, language='en-US')
    except sr.UnknownValueError:
        return ""


def transcribe_file(audio_file, chunk_duration=30):
    """Transcribe an audio file chunk by chunk, yielding progress.

    Args:
        audio_file: Path of the audio file, or ``None``.
        chunk_duration: Length of each chunk in seconds (int or float).

    Yields:
        ``(text_so_far, status_message)`` tuples: one after every chunk and a
        final one when done. A missing file or a fatal error yields a single
        message tuple instead.
    """
    if audio_file is None:
        yield "لطفاً فایل آپلود کنید", ""
        return
    try:
        audio = AudioSegment.from_file(audio_file)
        duration_ms = len(audio)
        # range() needs integers; a slider may hand over a float.
        chunk_ms = int(chunk_duration * 1000)
        if chunk_ms <= 0:
            raise ValueError("chunk_duration must be positive")
        all_text = []

        num_chunks = (duration_ms + chunk_ms - 1) // chunk_ms
        for i in range(0, duration_ms, chunk_ms):
            chunk = audio[i:i + chunk_ms]
            tmp_path = None
            try:
                # Reserve a path, close the handle, then export to it.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_path = tmp.name
                chunk.export(tmp_path, format="wav")

                try:
                    with sr.AudioFile(tmp_path) as source:
                        audio_data = recognizer.record(source)
                    text = _recognize_fa_then_en(audio_data)
                    if text:
                        all_text.append(text)
                except Exception as e:
                    # A failing chunk (including request errors) is skipped
                    # so the rest of the file is still transcribed.
                    print(f"File chunk error: {e}")
            finally:
                # Remove the temp WAV even when export or recognition fails.
                if tmp_path is not None:
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass

            progress = min((i + chunk_ms) / duration_ms * 100, 100)
            yield " ".join(all_text), f"پیشرفت: {progress:.1f}% - بخش {(i // chunk_ms)+1} از {num_chunks}"
            time.sleep(0.5)

        yield " ".join(all_text), "کامل شد! ✅"
    except Exception as e:
        yield f"خطا: {e}", "خطا ❌"

def save_text(text):
    """Write ``text`` to a new UTF-8 ``.txt`` temp file and return its path.

    Returns ``None`` when ``text`` is ``None``, empty or only whitespace.
    The file is created with ``delete=False``, so it stays on disk for the
    caller to use.
    """
    if not text or not text.strip():
        return None
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.txt', encoding='utf-8'
    ) as temp_file:
        temp_file.write(text)
    return temp_file.name

# Gradio UI (Fixed: Separate clicks, Timer with load event)
# Builds the two-tab UI: live microphone transcription and file transcription.
with gr.Blocks(
    title="تبدیل گفتار به متن - Real-time Fixed",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
    .rtl { direction: rtl; text-align: right; }
    """
) as demo:
    gr.HTML("""
    <div style="text-align: center; max-width: 800px; margin: 0 auto;">
        <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🎤 تبدیل گفتار به متن</h1>
        <p style="font-size: 1.1em; color: #666; margin-bottom: 2em;">
            Real-time با Google API - پردازش background | قابل توزیع روی مرورگرها
        </p>
    </div>
    """)

    with gr.Tabs():
        with gr.TabItem("🎙️ ضبط مستقیم", id="realtime_tab") as realtime_tab:
            gr.Markdown("### فعال کنید و 5+ ثانیه واضح صحبت کنید (متن هر 3s آپدیت می‌شه)", elem_classes="rtl")

            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="میکروفون (ضبط رو شروع کن)",
                    elem_classes="rtl"
                )

            realtime_output = gr.Textbox(
                label="متن live",
                placeholder="پس از 3-5s صحبت، متن ظاهر می‌شه...",
                lines=10,
                elem_classes="rtl",
                rtl=True,
                show_copy_button=True,
                interactive=False
            )

            clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")

            # The handler only enqueues the audio and returns a no-op update;
            # the text itself is refreshed by the timer below.
            # NOTE(review): without streaming, `change` presumably fires only
            # when a recording is finished - confirm whether a streaming
            # microphone is wanted for true live input.
            audio_input.change(
                handle_realtime_audio,
                inputs=[audio_input],
                outputs=[realtime_output]
            )

            # Timer that polls the transcript every 2 seconds. It starts
            # inactive and is switched on by returning an updated Timer from
            # an event handler (a Timer only exposes the `tick` event).
            timer = gr.Timer(value=2.0, active=False)

            def start_timer():
                """Activate the polling timer and return the current text."""
                return gr.Timer(active=True), get_current_transcript()

            # `select` does not fire for the tab that is shown initially, so
            # the timer is also started when the page loads.
            demo.load(start_timer, outputs=[timer, realtime_output])
            realtime_tab.select(start_timer, outputs=[timer, realtime_output])

            timer.tick(get_current_transcript, outputs=[realtime_output])

            clear_btn.click(clear_transcript, outputs=[realtime_output])

        with gr.TabItem("📁 فایل صوتی"):
            gr.Markdown("### فایل آپلود کن و تبدیل کن", elem_classes="rtl")

            with gr.Row():
                file_input = gr.Audio(sources=["upload"], type="filepath", label="فایل صوتی", elem_classes="rtl")
                chunk_slider = gr.Slider(10, 60, 30, 5, label="بخش‌بندی (s)", elem_classes="rtl")

            process_btn = gr.Button("🚀 تبدیل", variant="primary")
            progress_label = gr.Textbox(label="وضعیت", interactive=False, elem_classes="rtl")
            file_output = gr.Textbox(label="متن", lines=10, elem_classes="rtl", rtl=True, show_copy_button=True)

            with gr.Row():
                save_btn = gr.Button("💾 ذخیره", variant="secondary")
                clear_file_btn = gr.Button("🗑️ پاک", variant="secondary")
                download_file = gr.File(label="دانلود TXT", visible=False, elem_classes="rtl")

            process_btn.click(transcribe_file, [file_input, chunk_slider], [file_output, progress_label])
            save_btn.click(save_text, file_output, download_file).then(lambda: gr.update(visible=True), outputs=[download_file])
            # The two textboxes are outputs here. Passing them positionally
            # made them inputs of a zero-argument lambda, so nothing cleared.
            clear_file_btn.click(lambda: ("", ""), outputs=[file_output, progress_label])

    with gr.Accordion("📖 راهنما", open=False, elem_classes="rtl"):
        gr.Markdown("""
        ### استفاده:
        - **Real-time**: تب رو باز کن، میکروفون فعال کن، 5s+ صحبت کن. هر 3s متن آپدیت می‌شه (background).
        - **فایل**: آپلود و دکمه بزن.
        ### نکات:
        - 🗣️ واضح صحبت کن، نویز کم.
        - 🌐 اینترنت پایدار (Google API).
        - 📱 روی موبایل/دسکتاپ کار می‌کنه (mic access بده).
        - ⚠️ محدودیت Google: ~60 req/min - برای حجم بالا، API key اضافه کن (در کد کامنت).
        - توزیع: share لینک رو share کن، همه browserها ساپورت.
        """, elem_classes="rtl")

    gr.HTML('<div style="text-align: center; margin-top: 2em; padding: 1em; background: #f8f9fa;"><p style="color: #666;">نسخه 2.3 - Fixed Timer & Clicks | Google Backend</p></div>')

# Script entry point: enable Gradio's queue and launch the app on port 7860.
if __name__ == "__main__":
    # NOTE(review): share=True together with server_name="0.0.0.0" presumably
    # makes the app reachable beyond localhost - confirm this is intended.
    demo.queue().launch(share=True, show_error=True, server_name="0.0.0.0", server_port=7860)