Spaces:

suprimedev
/

speh32

Sleeping

File size: 12,777 Bytes

7d519a8
 
 
 
 
 
 
e92b5fd
 
7d519a8
 
 
 
 
145d58c
e92b5fd
145d58c
7d519a8
145d58c
 
 
 
b2d26db
145d58c
b2d26db
 
 
145d58c
b2d26db
145d58c
 
b2d26db
 
 
145d58c
b2d26db
145d58c
 
 
 
 
 
b2d26db
145d58c
 
 
 
 
 
 
b2d26db
 
145d58c
b2d26db
 
 
 
 
145d58c
b2d26db
145d58c
 
 
b2d26db
145d58c
b2d26db
 
145d58c
 
b2d26db
145d58c
b2d26db
 
7d519a8
e92b5fd
b2d26db
e92b5fd
145d58c
 
e92b5fd
 
 
 
 
 
 
 
 
 
 
 
7d519a8
e92b5fd
 
 
145d58c
 
 
e92b5fd
 
145d58c
e92b5fd
 
145d58c
 
 
7d519a8
e92b5fd
145d58c
e92b5fd
145d58c
 
e92b5fd
 
145d58c
4742671
e92b5fd
 
 
 
145d58c
 
b2d26db
 
145d58c
e92b5fd
 
 
7d519a8
145d58c
7d519a8
145d58c
7d519a8
 
145d58c
 
7d519a8
145d58c
b2d26db
 
 
145d58c
 
 
 
 
b2d26db
 
e92b5fd
145d58c
e92b5fd
 
 
145d58c
b2d26db
4742671
 
 
 
e92b5fd
7d519a8
4742671
7d519a8
 
145d58c
e92b5fd
7d519a8
 
 
145d58c
7d519a8
 
145d58c
 
 
 
 
 
7d519a8
e92b5fd
 
 
 
 
 
 
 
 
 
 
145d58c
 
e92b5fd
 
145d58c
7d519a8
e92b5fd
 
7d519a8
145d58c
 
e92b5fd
7d519a8
145d58c
7d519a8
145d58c
7d519a8
 
 
 
 
 
 
 
 
4742671
e92b5fd
145d58c
e92b5fd
 
145d58c
 
e92b5fd
 
 
 
145d58c
e92b5fd
145d58c
e92b5fd
 
7d519a8
 
 
4742671
145d58c
7d519a8
 
145d58c
 
 
 
 
 
e92b5fd
7d519a8
145d58c
 
 
e92b5fd
 
145d58c
 
7d519a8
 
145d58c
e92b5fd
4742671
 
 
 
 
 
 
 
 
 
 
 
 
7d519a8
145d58c
7d519a8
145d58c
 
 
 
7d519a8
 
145d58c
 
7d519a8
145d58c
 
 
7d519a8
145d58c
 
 
 
e92b5fd
145d58c
4742671
145d58c
7d519a8
145d58c
7d519a8
145d58c
4742671
145d58c
 
 
 
 
 
 
e92b5fd
7d519a8
4742671
7d519a8
 
145d58c

import gradio as gr
import speech_recognition as sr
import numpy as np
from pydub import AudioSegment
import io
import tempfile
import os
import wave
import threading
import queue
import time

# Initial settings: one shared recognizer used by every transcription path.
recognizer = sr.Recognizer()
recognizer.energy_threshold = 400  # slightly higher for real-time use
recognizer.dynamic_energy_threshold = True
recognizer.pause_threshold = 0.5  # more sensitive to pauses

# Queue that hands real-time audio chunks to the background worker.
audio_queue = queue.Queue(maxsize=10)  # kept small for real-time use
PROCESS_INTERVAL = 3.0  # seconds between processing passes
MIN_DURATION = 1.5  # minimum seconds of audio required before processing

# Live transcript text shared between threads; access it under transcript_lock.
current_transcript = ""
transcript_lock = threading.Lock()

# Background thread برای پردازش queue
def background_processor():
    """Consume queued audio chunks forever and append recognized text.

    Runs as a daemon thread. Chunks pulled from ``audio_queue`` are
    accumulated until at least ``PROCESS_INTERVAL`` seconds have passed since
    the last pass *and* at least ``MIN_DURATION`` seconds of audio are
    buffered. The buffered chunks are then merged, sent to
    ``process_audio_chunk`` and the resulting text is appended to the
    module-level ``current_transcript`` under ``transcript_lock``.

    The function never returns; any exception is logged and the loop resumes
    after a one second pause.
    """
    # Without this declaration the assignments below make the name local and
    # the first successful transcription raises UnboundLocalError.
    global current_transcript

    # Values that must never be appended to the transcript. Both spellings of
    # the connection-error marker are listed because the recognizer helper
    # emits the first one while older code compared against the second.
    ignored_results = ("", "[خطا اتصال]", "[خطا در اتصال]")

    accumulated = []
    last_process = time.time()

    while True:
        try:
            # Blocking get with a short timeout doubles as the poll delay.
            try:
                audio_tuple = audio_queue.get(timeout=0.2)
            except queue.Empty:
                audio_tuple = None

            if audio_tuple and audio_tuple[1] is not None:
                rate, data = audio_tuple
                data = np.asarray(data)
                # Only float audio lives in [-1, 1]; clipping integer PCM to
                # that range would wipe out the signal.
                if np.issubdtype(data.dtype, np.floating):
                    data = np.clip(data, -1.0, 1.0)
                if rate:
                    accumulated.append((rate, data))
                    print(f"Accumulated chunk: {len(data)/rate:.2f}s")

            # Process if interval passed and enough audio
            now = time.time()
            total_dur = sum(len(d) / r for r, d in accumulated)
            if (now - last_process >= PROCESS_INTERVAL) and total_dur >= MIN_DURATION:
                # Detach the batch before processing so that a failure drops
                # it instead of retrying the same broken data forever.
                batch, accumulated = accumulated, []
                last_process = now

                merged_rate = batch[0][0]
                merged_data = np.concatenate([d for _, d in batch])

                text = process_audio_chunk((merged_rate, merged_data))
                if text and text not in ignored_results:
                    with transcript_lock:
                        if current_transcript:
                            current_transcript += " " + text
                        else:
                            current_transcript = text
                    print(f"✅ Processed: {text[:50]}...")  # short log line

        except Exception as e:
            print(f"Background error: {e}")
            time.sleep(1)

# Start background thread
# Daemon thread: it runs the queue consumer and dies with the main process.
processor_thread = threading.Thread(
    target=background_processor,
    daemon=True,
)
processor_thread.start()

def numpy_to_audio_segment(audio_data, sample_rate):
    """Convert a NumPy sample array into a mono 16-bit ``AudioSegment``.

    Args:
        audio_data: Sample array, either ``(samples,)`` or
            ``(samples, channels)``. Float input is treated as normalized to
            [-1, 1]; integer input is assumed to already be 16-bit PCM.
        sample_rate: Sampling rate in Hz written to the WAV header.

    Returns:
        An ``AudioSegment`` built from an in-memory WAV file, or ``None`` when
        ``audio_data`` is ``None`` or empty.
    """
    if audio_data is None or len(audio_data) == 0:
        return None

    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # The WAV header below declares one channel, so multi-channel input must
    # be mixed down first; otherwise the interleaved frames are read as mono
    # audio of twice the length. Average in int32 to avoid int16 overflow.
    if samples.ndim > 1:
        samples = samples.astype(np.int32).mean(axis=1).astype(np.int16)

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(samples.tobytes())
    buffer.seek(0)
    return AudioSegment.from_wav(buffer)

def process_audio_chunk(audio_tuple):
    """Transcribe one ``(sample_rate, samples)`` chunk with Google recognition.

    Persian (``fa-IR``) is tried first; if nothing is recognized, English
    (``en-US``) is tried as a fallback.

    Args:
        audio_tuple: ``(sample_rate, numpy_array)`` or ``None``.

    Returns:
        The stripped transcript; ``""`` when the input is missing, shorter
        than ``MIN_DURATION`` seconds, contains no recognizable speech, or any
        unexpected error occurs; ``"[خطا اتصال]"`` when the Persian request to
        the recognition service fails.
    """
    if audio_tuple is None:
        return ""

    tmp_path = None
    try:
        sample_rate, audio_data = audio_tuple
        duration = len(audio_data) / sample_rate if sample_rate else 0
        if duration < MIN_DURATION:
            return ""

        audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
        if audio_segment is None or len(audio_segment) < MIN_DURATION * 1000:
            return ""

        # Reserve a path, close the handle, then export: writing to a file
        # that is still held open fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        audio_segment.export(tmp_path, format="wav")

        with sr.AudioFile(tmp_path) as source:
            # No ambient-noise calibration here: on a file source it consumes
            # the opening audio from the stream (dropping the first words) and
            # only tunes a threshold that `record` does not use.
            audio = recognizer.record(source)

        # Google recognition
        text = ""
        try:
            text = recognizer.recognize_google(audio, language='fa-IR')  # Persian first
            # With an API key: recognizer.recognize_google(audio, language='fa-IR', key="YOUR_GOOGLE_API_KEY")
        except sr.UnknownValueError:
            try:
                text = recognizer.recognize_google(audio, language='en-US')
            except sr.UnknownValueError:
                print("No speech in chunk")
                text = ""
        except sr.RequestError as e:
            print(f"Google API error: {e}")
            text = "[خطا اتصال]"

        return text.strip()
    except Exception as e:
        print(f"Process error: {e}")
        return ""
    finally:
        # Always remove the temp file, including on the error paths above.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

def handle_realtime_audio(audio):
    """Enqueue microphone audio for the background worker without blocking.

    A ``None`` payload is ignored, and audio is dropped (with a log line) when
    the queue is already full. The UI output is left unchanged in all cases.
    """
    if audio is not None:
        try:
            audio_queue.put_nowait(audio)
        except queue.Full:
            print("Queue full, skip")
    return gr.update()

def get_current_transcript():
    """Return a snapshot of the live transcript for the UI to display."""
    with transcript_lock:
        snapshot = current_transcript
    return snapshot

def clear_transcript():
    """Reset the live transcript, drop all queued audio and return ``""``."""
    global current_transcript
    with transcript_lock:
        current_transcript = ""

    # Drain whatever is still waiting in the queue.
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return ""

# File transcription (unchanged)
def transcribe_file(audio_file, chunk_duration=30):
    """Transcribe an audio file chunk by chunk, yielding progress as it goes.

    Each chunk is recognized as Persian (``fa-IR``) first, then English
    (``en-US``) if nothing was understood. Chunks that fail are logged and
    skipped.

    Args:
        audio_file: Path of the audio file, or ``None``.
        chunk_duration: Chunk length in seconds (int or float).

    Yields:
        ``(text_so_far, status_message)`` tuples: one per processed chunk and
        a final one on completion, or a single error tuple on failure.
    """
    if audio_file is None:
        yield "لطفاً فایل آپلود کنید", ""
        return
    try:
        audio = AudioSegment.from_file(audio_file)
        duration_ms = len(audio)
        # UI sliders may deliver a float; range() and slicing need an int.
        chunk_ms = int(chunk_duration * 1000)
        all_text = []

        num_chunks = (duration_ms + chunk_ms - 1) // chunk_ms
        for start in range(0, duration_ms, chunk_ms):
            chunk = audio[start:start + chunk_ms]
            tmp_path = None

            try:
                # Reserve a path, close the handle, then export to it.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_path = tmp.name
                chunk.export(tmp_path, format="wav")

                with sr.AudioFile(tmp_path) as source:
                    audio_data = recognizer.record(source)
                try:
                    text = recognizer.recognize_google(audio_data, language='fa-IR')
                except sr.UnknownValueError:
                    try:
                        text = recognizer.recognize_google(audio_data, language='en-US')
                    except sr.UnknownValueError:
                        text = ""
                except sr.RequestError:
                    text = "[خطا اتصال]"
                if text and text != "[خطا اتصال]":
                    all_text.append(text)
            except Exception as e:
                print(f"File chunk error: {e}")
            finally:
                # Remove the temp file even when export or recognition failed.
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.unlink(tmp_path)

            progress = min((start + chunk_ms) / duration_ms * 100, 100)
            yield " ".join(all_text), f"پیشرفت: {progress:.1f}% - بخش {(start // chunk_ms)+1} از {num_chunks}"
            time.sleep(0.5)  # brief pause between requests to the service

        yield " ".join(all_text), "کامل شد! ✅"
    except Exception as e:
        yield f"خطا: {e}", "خطا ❌"

def save_text(text):
    """Write ``text`` to a new UTF-8 ``.txt`` temp file and return its path.

    Args:
        text: The text to save; may be ``None``.

    Returns:
        The path of the created file, or ``None`` when ``text`` is ``None``,
        empty or whitespace-only. The file is not deleted automatically.
    """
    # `not text` also covers None, which has no .strip().
    if not text or not text.strip():
        return None
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.txt', encoding='utf-8'
    ) as temp_file:
        temp_file.write(text)
    return temp_file.name

# Gradio UI (Fixed: Separate clicks, Timer with load event)
# Two tabs: live microphone transcription (filled in by the background
# worker and polled by a timer) and chunked transcription of an uploaded file.
with gr.Blocks(
    title="تبدیل گفتار به متن - Real-time Fixed",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
    .rtl { direction: rtl; text-align: right; }
    """
) as demo:
    gr.HTML("""
    <div style="text-align: center; max-width: 800px; margin: 0 auto;">
        <h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🎤 تبدیل گفتار به متن</h1>
        <p style="font-size: 1.1em; color: #666; margin-bottom: 2em;">
            Real-time با Google API - پردازش background | قابل توزیع روی مرورگرها
        </p>
    </div>
    """)
    
    with gr.Tabs():
        with gr.TabItem("🎙️ ضبط مستقیم", id="realtime_tab") as realtime_tab:
            gr.Markdown("### فعال کنید و 5+ ثانیه واضح صحبت کنید (متن هر 3s آپدیت می‌شه)", elem_classes="rtl")
            
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="میکروفون (ضبط رو شروع کن)",
                    elem_classes="rtl"
                )
            
            realtime_output = gr.Textbox(
                label="متن live",
                placeholder="پس از 3-5s صحبت، متن ظاهر می‌شه...",
                lines=10,
                elem_classes="rtl",
                rtl=True,
                show_copy_button=True,
                interactive=False
            )
            
            clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")
            
            # Hand recorded audio to the background worker; the handler leaves
            # the textbox untouched.
            # NOTE(review): `change` appears to fire per finished recording,
            # not per chunk; chunk-level streaming would need `streaming=True`
            # and the `stream` event — confirm the intended behaviour.
            audio_input.change(
                handle_realtime_audio, 
                inputs=[audio_input], 
                outputs=[realtime_output]
            )
            
            # Poll the shared transcript every 2 seconds. The timer is active
            # from the start: the first tab never emits `select` on page load,
            # and a timer cannot be activated by calling methods on it from
            # inside an event handler.
            timer = gr.Timer(value=2.0)
            timer.tick(get_current_transcript, outputs=[realtime_output])
            
            # Refresh immediately when the user switches back to this tab.
            realtime_tab.select(get_current_transcript, outputs=[realtime_output])
            
            clear_btn.click(clear_transcript, outputs=[realtime_output])
        
        with gr.TabItem("📁 فایل صوتی"):
            gr.Markdown("### فایل آپلود کن و تبدیل کن", elem_classes="rtl")
            
            with gr.Row():
                file_input = gr.Audio(sources=["upload"], type="filepath", label="فایل صوتی", elem_classes="rtl")
                # `step` is passed by keyword so the call does not depend on
                # its positional slot in the Slider signature.
                chunk_slider = gr.Slider(10, 60, 30, step=5, label="بخش‌بندی (s)", elem_classes="rtl")
            
            process_btn = gr.Button("🚀 تبدیل", variant="primary")
            progress_label = gr.Textbox(label="وضعیت", interactive=False, elem_classes="rtl")
            file_output = gr.Textbox(label="متن", lines=10, elem_classes="rtl", rtl=True, show_copy_button=True)
            
            with gr.Row():
                save_btn = gr.Button("💾 ذخیره", variant="secondary")
                clear_file_btn = gr.Button("🗑️ پاک", variant="secondary")
                download_file = gr.File(label="دانلود TXT", visible=False, elem_classes="rtl")
            
            process_btn.click(transcribe_file, [file_input, chunk_slider], [file_output, progress_label])
            save_btn.click(save_text, file_output, download_file).then(lambda: gr.update(visible=True), outputs=[download_file])
            # The components are outputs here: passing them positionally made
            # them inputs to a zero-argument lambda and cleared nothing.
            clear_file_btn.click(lambda: ("", ""), outputs=[file_output, progress_label])
    
    with gr.Accordion("📖 راهنما", open=False, elem_classes="rtl"):
        gr.Markdown("""
        ### استفاده:
        - **Real-time**: تب رو باز کن، میکروفون فعال کن، 5s+ صحبت کن. هر 3s متن آپدیت می‌شه (background).
        - **فایل**: آپلود و دکمه بزن.
        ### نکات:
        - 🗣️ واضح صحبت کن، نویز کم.
        - 🌐 اینترنت پایدار (Google API).
        - 📱 روی موبایل/دسکتاپ کار می‌کنه (mic access بده).
        - ⚠️ محدودیت Google: ~60 req/min - برای حجم بالا، API key اضافه کن (در کد کامنت).
        - توزیع: share لینک رو share کن، همه browserها ساپورت.
        """, elem_classes="rtl")
    
    gr.HTML('<div style="text-align: center; margin-top: 2em; padding: 1em; background: #f8f9fa;"><p style="color: #666;">نسخه 2.3 - Fixed Timer & Clicks | Google Backend</p></div>')

if __name__ == "__main__":
    # Enable request queuing, then serve on all interfaces at port 7860.
    queued_app = demo.queue()
    queued_app.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )