"""Persian/English speech-to-text Gradio app.

Two modes:
  * Real-time: microphone chunks are queued and a daemon thread merges and
    sends them to the Google Web Speech API every few seconds; a UI timer
    polls the shared transcript.
  * File: an uploaded audio file is split into chunks and transcribed with
    streamed progress.
"""

import io
import os
import queue
import tempfile
import threading
import time
import wave

import gradio as gr
import numpy as np
import speech_recognition as sr
from pydub import AudioSegment

# --- Recognizer configuration -------------------------------------------------
recognizer = sr.Recognizer()
recognizer.energy_threshold = 400           # slightly higher for real-time noise
recognizer.dynamic_energy_threshold = True
recognizer.pause_threshold = 0.5            # more sensitive to pauses

# Queue accumulating real-time audio chunks from the browser.
audio_queue = queue.Queue(maxsize=10)       # small: drop chunks rather than lag
PROCESS_INTERVAL = 3.0                      # process accumulated audio every 3 s
MIN_DURATION = 1.5                          # need at least 1.5 s of audio

# Marker stored when the Google API is unreachable.  One shared constant so
# producer and consumer compare the SAME string — the original code returned
# "[خطا اتصال]" but filtered "[خطا در اتصال]", so the filter never matched and
# the error marker leaked into the live transcript.
API_ERROR_TEXT = "[خطا اتصال]"

# Transcript shared between the background thread and the UI callbacks.
current_transcript = ""
transcript_lock = threading.Lock()


def background_processor():
    """Drain the audio queue, merge chunks, and append recognized text.

    Runs forever in a daemon thread.  Every PROCESS_INTERVAL seconds it
    concatenates all chunks collected so far and, if they amount to at least
    MIN_DURATION seconds, sends them through Google speech recognition and
    appends the result to ``current_transcript``.
    """
    # BUG FIX: the assignments below rebind the module-level transcript;
    # without this declaration Python treats it as a local and raises
    # UnboundLocalError on the first recognized phrase.
    global current_transcript

    accumulated = []                # list of (sample_rate, np.ndarray) chunks
    last_process = time.time()
    while True:
        try:
            # Pull one chunk if available (cheap non-blocking poll).
            if not audio_queue.empty():
                audio_tuple = audio_queue.get(timeout=0.2)
                if audio_tuple and audio_tuple[1] is not None:
                    rate, data = audio_tuple
                    data = np.clip(data, -1.0, 1.0)  # normalize into [-1, 1]
                    accumulated.append((rate, data))
                    print(f"Accumulated chunk: {len(data) / rate:.2f}s")

            # Process once the interval elapsed and enough audio piled up.
            now = time.time()
            total_dur = sum(len(d) / r for r, d in accumulated) if accumulated else 0
            if (now - last_process >= PROCESS_INTERVAL) and total_dur >= MIN_DURATION:
                if accumulated:
                    merged_rate = accumulated[0][0]
                    merged_data = np.concatenate([d for _, d in accumulated])
                    text = process_audio_chunk((merged_rate, merged_data))
                    # Skip empty results and the API-error marker.
                    if text and text != API_ERROR_TEXT:
                        with transcript_lock:
                            if current_transcript:
                                current_transcript += " " + text
                            else:
                                current_transcript = text
                        print(f"✅ Processed: {text[:50]}...")  # short log
                # Reset the window even when nothing was recognized.
                accumulated = []
                last_process = now

            time.sleep(0.2)  # poll fast enough to stay responsive
        except Exception as e:
            print(f"Background error: {e}")
            time.sleep(1)


# Start the background worker at import time (daemon: dies with the process).
processor_thread = threading.Thread(target=background_processor, daemon=True)
processor_thread.start()


def numpy_to_audio_segment(audio_data, sample_rate):
    """Convert a numpy waveform into a mono 16-bit pydub AudioSegment.

    Float input is assumed to be in [-1, 1] and is scaled to int16.
    Returns None for empty/None input.
    """
    if audio_data is None or len(audio_data) == 0:
        return None
    # BUG FIX: the WAV header below declares 1 channel, but multi-channel
    # input was previously written interleaved, producing garbled audio.
    # Downmix to mono first (assumes shape (samples, channels) — the layout
    # Gradio microphones deliver; confirm if other sources are added).
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    if audio_data.dtype in [np.float32, np.float64]:
        audio_data = np.clip(audio_data, -1.0, 1.0)
        audio_data = (audio_data * 32767).astype(np.int16)
    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)          # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_data.tobytes())
    buffer.seek(0)
    return AudioSegment.from_wav(buffer)


def process_audio_chunk(audio_tuple):
    """Recognize speech in one ``(sample_rate, samples)`` tuple.

    Returns the recognized text, ``""`` when nothing was recognized or the
    clip is shorter than MIN_DURATION, or API_ERROR_TEXT when the Google API
    could not be reached.
    """
    try:
        if audio_tuple is None:
            return ""
        sample_rate, audio_data = audio_tuple
        duration = len(audio_data) / sample_rate if sample_rate else 0
        if duration < MIN_DURATION:
            return ""
        audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
        if audio_segment is None or len(audio_segment) < MIN_DURATION * 1000:
            return ""
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio_segment.export(tmp.name, format="wav")
            tmp_path = tmp.name
        try:
            with sr.AudioFile(tmp_path) as source:
                # Quick calibration; consumes the first 0.3 s of the clip.
                recognizer.adjust_for_ambient_noise(source, duration=0.3)
                audio = recognizer.record(source)
            text = ""
            try:
                # Persian first, English as fallback.
                text = recognizer.recognize_google(audio, language='fa-IR')
                # With an API key:
                # text = recognizer.recognize_google(audio, language='fa-IR', key="YOUR_GOOGLE_API_KEY")
            except sr.UnknownValueError:
                try:
                    text = recognizer.recognize_google(audio, language='en-US')
                except sr.UnknownValueError:
                    print("No speech in chunk")
                    text = ""
            except sr.RequestError as e:
                print(f"Google API error: {e}")
                text = API_ERROR_TEXT
        finally:
            # BUG FIX: always delete the temp file, even when recognition
            # raises — previously an exception leaked the file.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
        return text.strip()
    except Exception as e:
        print(f"Process error: {e}")
        return ""


def handle_realtime_audio(audio):
    """Queue an incoming microphone chunk; drop it when the queue is full.

    The transcript textbox itself is refreshed by the polling timer, so this
    handler returns a no-op update.
    """
    if audio is None:
        return gr.update()
    try:
        audio_queue.put(audio, block=False)
    except queue.Full:
        print("Queue full, skip")
    return gr.update()


def get_current_transcript():
    """Poll the shared transcript for UI updates (called by the timer)."""
    with transcript_lock:
        return current_transcript


def clear_transcript():
    """Reset the transcript and discard any queued, unprocessed audio."""
    global current_transcript
    with transcript_lock:
        current_transcript = ""
    # Drain pending chunks so old audio doesn't resurface after clearing.
    while not audio_queue.empty():
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            break
    return ""


def transcribe_file(audio_file, chunk_duration=30):
    """Generator: transcribe an uploaded file in ``chunk_duration``-s pieces.

    Yields ``(text_so_far, progress_message)`` after each chunk so the UI can
    stream progress.
    """
    if audio_file is None:
        yield "لطفاً فایل آپلود کنید", ""
        return
    try:
        audio = AudioSegment.from_file(audio_file)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000
        all_text = []
        num_chunks = (duration_ms + chunk_ms - 1) // chunk_ms  # ceil division
        for i in range(0, duration_ms, chunk_ms):
            chunk = audio[i:i + chunk_ms]
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                chunk.export(tmp.name, format="wav")
                tmp_path = tmp.name
            try:
                with sr.AudioFile(tmp_path) as source:
                    audio_data = recognizer.record(source)
                try:
                    text = recognizer.recognize_google(audio_data, language='fa-IR')
                except sr.UnknownValueError:
                    try:
                        text = recognizer.recognize_google(audio_data, language='en-US')
                    except sr.UnknownValueError:
                        text = ""
                except sr.RequestError:
                    text = API_ERROR_TEXT
                if text and text != API_ERROR_TEXT:
                    all_text.append(text)
            except Exception as e:
                print(f"File chunk error: {e}")
            finally:
                # BUG FIX: delete the temp file even when a chunk fails.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            progress = min((i + chunk_ms) / duration_ms * 100, 100)
            yield " ".join(all_text), f"پیشرفت: {progress:.1f}% - بخش {(i // chunk_ms) + 1} از {num_chunks}"
            time.sleep(0.5)
        yield " ".join(all_text), "کامل شد! ✅"
    except Exception as e:
        yield f"خطا: {e}", "خطا ❌"


def save_text(text):
    """Write the transcript to a temp ``.txt`` file and return its path.

    Returns None for blank text (nothing to download).
    """
    if not text.strip():
        return None
    temp_file = tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.txt', encoding='utf-8'
    )
    temp_file.write(text)
    temp_file.close()
    return temp_file.name


# --- Gradio UI ----------------------------------------------------------------
with gr.Blocks(
    title="تبدیل گفتار به متن - Real-time Fixed",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
    .rtl { direction: rtl; text-align: right; }
    """
) as demo:
    gr.HTML("""
    <div style="text-align: center;">
        <h1>🎤 تبدیل گفتار به متن</h1>
        <p>Real-time با Google API - پردازش background | قابل توزیع روی مرورگرها</p>
    </div>
    """)

    with gr.Tabs():
        with gr.TabItem("🎙️ ضبط مستقیم", id="realtime_tab") as realtime_tab:
            gr.Markdown(
                "### فعال کنید و 5+ ثانیه واضح صحبت کنید (متن هر 3s آپدیت می‌شه)",
                elem_classes="rtl"
            )
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="میکروفون (ضبط رو شروع کن)",
                    elem_classes="rtl"
                )
                realtime_output = gr.Textbox(
                    label="متن live",
                    placeholder="پس از 3-5s صحبت، متن ظاهر می‌شه...",
                    lines=10,
                    elem_classes="rtl",
                    rtl=True,
                    show_copy_button=True,
                    interactive=False
                )
            clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")

            # New microphone audio is queued for the background worker.
            audio_input.change(
                handle_realtime_audio,
                inputs=[audio_input],
                outputs=[realtime_output]
            )

            # Polling timer, activated when the tab is first selected.
            timer = gr.Timer(value=2.0, active=False)

            def start_timer():
                """Activate polling and push the current transcript.

                BUG FIX: the original called ``timer.change(active=True)``
                inside the callback, which registers a new event listener
                instead of activating the timer; returning an updated
                ``gr.Timer`` to the component is the supported way to toggle
                it from an event handler.
                """
                return gr.Timer(active=True), get_current_transcript()

            realtime_tab.select(start_timer, outputs=[timer, realtime_output])
            timer.tick(get_current_transcript, outputs=[realtime_output])
            clear_btn.click(clear_transcript, outputs=[realtime_output])

        with gr.TabItem("📁 فایل صوتی"):
            gr.Markdown("### فایل آپلود کن و تبدیل کن", elem_classes="rtl")
            with gr.Row():
                file_input = gr.Audio(
                    sources=["upload"], type="filepath",
                    label="فایل صوتی", elem_classes="rtl"
                )
                chunk_slider = gr.Slider(10, 60, 30, 5, label="بخش‌بندی (s)", elem_classes="rtl")
            process_btn = gr.Button("🚀 تبدیل", variant="primary")
            progress_label = gr.Textbox(label="وضعیت", interactive=False, elem_classes="rtl")
            file_output = gr.Textbox(
                label="متن", lines=10, elem_classes="rtl", rtl=True,
                show_copy_button=True
            )
            with gr.Row():
                save_btn = gr.Button("💾 ذخیره", variant="secondary")
                clear_file_btn = gr.Button("🗑️ پاک", variant="secondary")
            download_file = gr.File(label="دانلود TXT", visible=False, elem_classes="rtl")

            process_btn.click(
                transcribe_file,
                [file_input, chunk_slider],
                [file_output, progress_label]
            )
            save_btn.click(save_text, file_output, download_file).then(
                lambda: gr.update(visible=True), outputs=[download_file]
            )
            # BUG FIX: the components were passed positionally as *inputs*,
            # so the zero-argument lambda crashed; they are the outputs.
            clear_file_btn.click(lambda: ("", ""), outputs=[file_output, progress_label])

    with gr.Accordion("📖 راهنما", open=False, elem_classes="rtl"):
        gr.Markdown("""
        ### استفاده:
        - **Real-time**: تب رو باز کن، میکروفون فعال کن، 5s+ صحبت کن. هر 3s متن آپدیت می‌شه (background).
        - **فایل**: آپلود و دکمه بزن.

        ### نکات:
        - 🗣️ واضح صحبت کن، نویز کم.
        - 🌐 اینترنت پایدار (Google API).
        - 📱 روی موبایل/دسکتاپ کار می‌کنه (mic access بده).
        - ⚠️ محدودیت Google: ~60 req/min - برای حجم بالا، API key اضافه کن (در کد کامنت).
        - توزیع: share لینک رو share کن، همه browserها ساپورت.
        """, elem_classes="rtl")

    gr.HTML('<div style="text-align: center;">نسخه 2.3 - Fixed Timer & Clicks | Google Backend</div>')


if __name__ == "__main__":
    demo.queue().launch(
        share=True, show_error=True,
        server_name="0.0.0.0", server_port=7860
    )