suprimedev committed on
Commit
7f63304
·
verified ·
1 Parent(s): d57ef9c

Update app.py

Files changed (1)
  app.py +170 -177
app.py CHANGED
@@ -1,199 +1,192 @@
 import gradio as gr
-import torch
-from transformers import pipeline, AutoProcessor, WhisperForConditionalGeneration
-from datasets import load_dataset
-import tempfile
 import threading
 import queue
 import os
-from typing import Optional, Tuple
-import warnings
-warnings.filterwarnings("ignore")  # Suppress minor warnings for cleaner output
-
-# Load Whisper model (small for speed on CPU; use "base" or "medium" for better accuracy)
-# For HF Spaces: This auto-downloads on first run; caches for reuse.
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-small",  # Multilingual, good for Farsi; swap to "openai/whisper-base" for lighter/faster
-    return_timestamps=False,
-    generate_kwargs={"language": None},  # Allow auto-detection or override
-    device=device if device == "cuda:0" else -1  # Use CPU if no GPU
-)
-
-# Supported languages (Whisper language codes; auto-detects if not specified)
-LANGUAGES = [
-    ("Auto-Detect", None),  # Let model guess language
-    ("English", "en"),
-    ("Spanish", "es"),
-    ("French", "fr"),
-    ("German", "de"),
-    ("Italian", "it"),
-    ("Portuguese", "pt"),
-    ("Dutch", "nl"),
-    ("Russian", "ru"),
-    ("Chinese", "zh"),
-    ("Japanese", "ja"),
-    ("Korean", "ko"),
-    ("Arabic", "ar"),
-    ("Hindi", "hi"),
-    ("Persian (Farsi)", "fa"),  # Excellent Farsi support
-    # Whisper supports 99+; add more or use custom
-]
-
-# Queue for background processing
-transcription_queue = queue.Queue()
-
-def process_audio_whisper(audio_bytes: bytes, language_code: Optional[str]) -> str:
-    """Process audio bytes to text using Whisper in background."""
-    try:
-        # Save bytes to temp WAV (Whisper expects file path or audio array; here we use file)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
-            temp_file.write(audio_bytes)
-            temp_file_path = temp_file.name
-
-        # Transcribe with Whisper
-        result = pipe(temp_file_path, generate_kwargs={"language": language_code, "task": "transcribe"})
-        text = result["text"].strip()
-
-        # Clean up
-        os.unlink(temp_file_path)
-
-        return text if text else "[No speech detected]"
     except Exception as e:
-        os.unlink(temp_file_path) if 'temp_file_path' in locals() and os.path.exists(temp_file_path) else None
-        return f"[Error: {str(e)}]"
-
-def transcribe_live(audio: Optional[bytes], language_code: str, use_custom: bool, custom_lang: str, transcription_history: str) -> Tuple[str, str]:
-    """
-    Main Gradio function: Processes audio in background thread for 'live' feel.
-    - Audio from mic as bytes.
-    - Appends to history on each clip (record/stop triggers).
-    - For streaming: Whisper isn't native-streaming; use chunks or integrate faster models like Faster-Whisper.
-    """
-    if audio is None:
-        return transcription_history, "[Please record audio]"
-
-    # Determine final language (None for auto-detect)
-    final_lang = None
-    if not use_custom:
-        final_lang = language_code
-    elif custom_lang.strip():
-        final_lang = custom_lang.strip()
-
-    # Background processing
-    result_queue = queue.Queue()
-    def background_worker():
-        result = process_audio_whisper(audio, final_lang)
-        result_queue.put(result)
-
-    thread = threading.Thread(target=background_worker)
-    thread.daemon = True
-    thread.start()
-
-    # Wait for result (timeout for responsiveness)
-    try:
-        new_text = result_queue.get(timeout=15)  # Whisper small: ~5-10s per clip on CPU
-        updated_history = f"{transcription_history}\n{new_text}" if transcription_history and new_text != "[No speech detected]" else new_text
-        status = f"Transcribed: {new_text}" if new_text else "[Processing complete]"
-        return updated_history, status
-    except queue.Empty:
-        return transcription_history, "[Timed out; try shorter audio]"
-    except Exception as e:
-        return transcription_history, f"[Unexpected error: {str(e)}]"
-
-# Gradio Interface
-with gr.Blocks(title="Live STT with Whisper (HF Transformers)") as demo:
     gr.Markdown("""
-    # Multilingual Live Speech-to-Text App with Whisper
-    Record or upload audio for transcription. Uses OpenAI Whisper (small model) for ~100 languages, including excellent Farsi support.
-    Processes in background. Auto-detects language or specify via dropdown/custom.
-    **Tip**: Speak clearly in short clips (5-15s) for best results on CPU.
     """)
-
-    with gr.Row():
-        audio_input = gr.Audio(
-            sources=["microphone", "upload"],  # Allow mic or file upload
-            type="bytes",
-            label="Record/Upload Audio",
-            info="Click mic to record (stop to transcribe). Upload WAV/MP3 for batch."
         )
-
-    with gr.Row():
-        lang_dropdown = gr.Dropdown(
-            choices=LANGUAGES,
-            value=LANGUAGES[0][1],  # Default: Auto-Detect
-            label="Language",
-            info="Select or auto-detect. Farsi: 'fa'."
         )
-        use_custom_checkbox = gr.Checkbox(
-            label="Use Custom Language Code",
-            value=False,
-            info="Enable for manual override (e.g., 'fa' for Farsi, 'en' for English)."
         )
-        custom_lang_input = gr.Textbox(
-            label="Custom Language (e.g., 'fa')",
-            placeholder="fa",
-            visible=False,
-            info="Whisper codes: See [docs](https://huggingface.co/openai/whisper-small)."
         )
-
-    # Toggle custom input visibility
-    use_custom_checkbox.change(
-        fn=lambda visible: gr.update(visible=visible),
-        inputs=[use_custom_checkbox],
-        outputs=[custom_lang_input]
-    )
-
-    # Outputs
-    history_output = gr.Textbox(
-        label="Transcription History",
-        lines=10,
-        interactive=False,
-        placeholder="Transcriptions append here (RTL support for Farsi/Arabic)..."
-    )
-    status_output = gr.Textbox(
-        label="Status",
-        interactive=False,
-        placeholder="Ready to transcribe..."
-    )
-
-    # Buttons
-    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
-    clear_btn = gr.Button("Clear History", variant="secondary")
-
-    # Event: Live on audio change (triggers on record stop or upload)
-    audio_input.change(
-        fn=transcribe_live,
-        inputs=[audio_input, lang_dropdown, use_custom_checkbox, custom_lang_input, history_output],
-        outputs=[history_output, status_output],
-        live=True
-    )
-
-    # Manual button (for re-processing or after UI changes)
-    transcribe_btn.click(
-        fn=transcribe_live,
-        inputs=[audio_input, lang_dropdown, use_custom_checkbox, custom_lang_input, history_output],
-        outputs=[history_output, status_output]
-    )
-
-    # Clear
-    clear_btn.click(
-        fn=lambda: ("", "History cleared"),
-        outputs=[history_output, status_output]
-    )
-
-    # Example/Info
-    gr.Markdown("""
-    ### Quick Test for Farsi
-    - Select "Persian (Farsi)" or type "fa".
-    - Record: Say "سلام، این یک تست است" (Hello, this is a test).
-    - Output should be in Persian script.
-
-    **Performance**: On HF Spaces (CPU), ~2-10s per 10s clip. For faster, use "openai/whisper-tiny" or GPU Spaces.
-    **Limitations**: Not real-time streaming (chunk-based). For live streaming, consider Faster-Whisper + WebSockets.
-    """)
-
-# Launch (for local/HF Spaces)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True)
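The removed version's Limitations note suggests Faster-Whisper for lower-latency transcription. A minimal sketch of that route (a sketch only, assuming the separate faster-whisper package and a local clip.wav; this commit uses neither):

    from faster_whisper import WhisperModel

    # int8 on CPU keeps memory low; "small" matches the removed Whisper checkpoint
    model = WhisperModel("small", device="cpu", compute_type="int8")
    segments, info = model.transcribe("clip.wav", language="fa", vad_filter=True)
    print(" ".join(segment.text for segment in segments))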
 import gradio as gr
+import speech_recognition as sr
+import numpy as np
+from pydub import AudioSegment
+import io
+import wave
+import tempfile  # needed by process_audio_chunk below
 import threading
 import queue
+import time
 import os

+# Initial recognizer settings
+recognizer = sr.Recognizer()
+recognizer.energy_threshold = 300
+recognizer.dynamic_energy_threshold = True
+recognizer.dynamic_energy_ratio = 1.5
+
+# Queues for asynchronous processing
+audio_queue = queue.Queue()
+transcript_queue = queue.Queue()
+
+# Shared state for the live transcript display
+current_transcript = ""
+current_transcript_lock = threading.Lock()
+
+def convert_numpy_to_wav(audio_data, sample_rate=16000):
+    """Convert a NumPy array to a WAV-backed AudioSegment, with normalization."""
+    buffer = io.BytesIO()
+    with wave.open(buffer, 'wb') as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
+        wav_file.setframerate(sample_rate)
+        # Scale to int16; assumes float samples in [-1, 1]
+        wav_file.writeframes(np.int16(audio_data * 32767))
+    buffer.seek(0)
+    return AudioSegment.from_wav(buffer)
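+
+# Usage sketch for convert_numpy_to_wav (hypothetical input): one second of
+# 16 kHz float mono audio in [-1, 1] yields a 1 s mono AudioSegment, e.g.
+#   convert_numpy_to_wav(np.zeros(16000, dtype=np.float32))
+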
 
37
+ def process_audio_chunk(audio_chunk):
38
+ """پردازش یک قطعه صوتی"""
39
+ try:
40
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
41
+ tmp_path = tmp_file.name
42
+ audio_segment = convert_numpy_to_wav(audio_chunk)
43
+ audio_segment.export(tmp_path, format="wav")
44
+
45
+ with sr.AudioFile(tmp_path) as source:
46
+ audio = recognizer.record(source)
47
+
48
+ # تلاش با فارسی اول
49
+ try:
50
+ text = recognizer.recognize_google(audio, language='fa-IR')
51
+ except sr.UnknownValueError:
52
+ # اگر فارسی معتبر نباشد، با انگلیسی تلاش کنیم
53
+ try:
54
+ text = recognizer.recognize_google(audio, language='en-US')
55
+ except:
56
+ text = ""
57
+ except sr.RequestError:
58
+ text = "[خطا در اتصال]"
59
+
60
+ os.unlink(tmp_path) # پاک کردن فایل موقت
61
+ return text.strip()
62
  except Exception as e:
63
+ print(f"خطا در پردازش: {e}")
64
+ return ""
65
 
+def monitor_audio(audio_input):
+    """Chunk the incoming microphone audio and queue it for transcription."""
+    # gr.Audio streaming with type="numpy" delivers (sample_rate, samples)
+    sample_rate, data = audio_input
+    if data.dtype == np.int16:
+        data = data.astype(np.float32) / 32768.0  # normalize for convert_numpy_to_wav
+    for i in range(0, len(data), sample_rate):
+        chunk = data[i:i + sample_rate]
+        if len(chunk) < sample_rate:
+            continue  # skip the incomplete trailing chunk
+        audio_queue.put((chunk, sample_rate))
+
+def update_transcript():
+    """Drain the transcript queue and return the accumulated text."""
+    global current_transcript
+    while not transcript_queue.empty():
+        new_text = transcript_queue.get()
+        with current_transcript_lock:
+            current_transcript = " ".join((current_transcript + " " + new_text).split())
+    with current_transcript_lock:
+        return current_transcript
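+
+# Queue consumer sketch (hypothetical helper `transcription_worker`): monitor_audio
+# above fills audio_queue, but nothing else in this commit drains it, so chunks
+# would never reach transcript_queue without a worker along these lines.
+def transcription_worker():
+    while True:
+        chunk, rate = audio_queue.get()  # blocks until a chunk is queued
+        text = process_audio_chunk(chunk, rate)
+        if text:
+            transcript_queue.put(text)
+
+threading.Thread(target=transcription_worker, daemon=True).start()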
 
+# Gradio UI
+with gr.Blocks(title="گستره گفتار به متن", theme=gr.themes.Soft(), css="""
+    .gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
+    .rtl { direction: rtl; text-align: right; }
+""") as demo:
+
+    # Main page
     gr.Markdown("""
+    # 🎤 تبدیل گفتار به متن
+    ابزار قدرتمند تبدیل صدا به متن با پشتیبانی از زبان فارسی و انگلیسی
     """)
+
+    with gr.Tabs():
+        # Live recording tab
+        with gr.TabItem("🎙️ ضبط مستقیم"):
+            gr.Markdown("### میکروفون خود را فعال کرده و شروع به صحبت کنید")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        streaming=True,
+                        label="میکروفون",
+                        show_label=True
+                    )
+                with gr.Column(scale=1):
+                    clear_btn = gr.Button("🗑️ پاک کردن متن", variant="secondary")
+                    realtime_output = gr.Textbox(
+                        label="متن تشخیص داده شده",
+                        placeholder="شروع به صحبت کنید و متن اینجا ظاهر می‌شود...",
+                        lines=12,
+                        elem_classes="rtl",
+                        rtl=True,
+                        show_copy_button=True
+                    )
+            clear_btn.click(lambda: "", outputs=[realtime_output])
+            # Push streamed chunks into the audio queue
+            audio_input.stream(
+                monitor_audio,
+                inputs=[audio_input],
+                outputs=[],
+                every=0.1
+            )
+            # Refresh the textbox from the transcript queue
+            audio_input.stream(
+                update_transcript,
+                inputs=[],
+                outputs=[realtime_output],
+                every=0.1
+            )
+
+        # Audio file tab
+        with gr.TabItem("📁 فایل صوتی"):
+            gr.Markdown("### فایل صوتی خود را انتخاب کنید")
+            with gr.Row():
+                with gr.Column(scale=3):
+                    file_input = gr.Audio(
+                        sources=["upload"],
+                        type="filepath",
+                        label="انتخاب فایل صوتی",
+                        elem_classes="rtl"
+                    )
+                with gr.Column(scale=1):
+                    chunk_duration = gr.Slider(
+                        minimum=10, maximum=60, value=30, step=5,
+                        label="مدت هر بخش (ثانیه)"
+                    )
+                    process_btn = gr.Button("🚀 شروع تبدیل", variant="primary")
+                    status_label = gr.Textbox(label="وضعیت پردازش", interactive=False)
+                with gr.Column(scale=1):
+                    save_btn = gr.Button("💾 ذخیره متن")
+                    clear_file_btn = gr.Button("🗑️ پاک کردن")
+                    download_file = gr.File(label="دانلود فایل متن", visible=False)
+
+            def process_file(audio_file, duration):
+                try:
+                    audio = AudioSegment.from_file(audio_file)
+                    results = []
+                    for i in range(0, len(audio), duration * 1000):
+                        # Mix down to mono and normalize int16 samples to floats
+                        chunk = audio[i:i + duration * 1000].set_channels(1)
+                        samples = np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0
+                        results.append(process_audio_chunk(samples, chunk.frame_rate))
+                    return " ".join(results), "تکمیل پردازش ✅"
+                except Exception as e:
+                    return f"خطا: {str(e)}", "خطای پردازش ❌"
+
+            def save_transcript(text):
+                # Write the transcript to a temp file so gr.File can serve it
+                path = os.path.join(tempfile.gettempdir(), "transcript.txt")
+                with open(path, "w", encoding="utf-8") as f:
+                    f.write(text or "")
+                return path
+
+            process_btn.click(
+                process_file,
+                inputs=[file_input, chunk_duration],
+                outputs=[realtime_output, status_label]
+            )
+            save_btn.click(
+                save_transcript,
+                inputs=[realtime_output],
+                outputs=[download_file]
+            ).then(
+                lambda: gr.update(visible=True),
+                outputs=[download_file]
+            )
+            clear_file_btn.click(
+                lambda: ("", ""),
+                outputs=[realtime_output, status_label]
+            )
+
+# Run the app
 if __name__ == "__main__":
+    demo.queue().launch(
+        share=True,
+        show_error=True
+    )
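+
+# Dependency note (assumed, not pinned by this commit): SpeechRecognition's
+# recognize_google calls Google's free Web Speech API, so the Space needs
+# outbound internet access; pydub requires ffmpeg on the host.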