# Source: Hugging Face Space (status at capture time: Sleeping).
# File size: 12,777 bytes.
# Commits touching this file: 7d519a8, e92b5fd, 145d58c, b2d26db, 4742671.
import gradio as gr
import speech_recognition as sr
import numpy as np
from pydub import AudioSegment
import io
import tempfile
import os
import wave
import threading
import queue
import time
# Initial setup: one shared recognizer used by both the live and file paths.
recognizer = sr.Recognizer()
recognizer.energy_threshold = 400 # slightly higher for real-time
recognizer.dynamic_energy_threshold = True
recognizer.pause_threshold = 0.5 # more sensitive to pauses
# Queue that accumulates the real-time microphone chunks
audio_queue = queue.Queue(maxsize=10) # kept small for real-time
PROCESS_INTERVAL = 3.0 # process every 3 seconds
MIN_DURATION = 1.5 # at least 1.5 s of audio is required before processing
# Global transcript, shared between the background thread and the UI callbacks;
# every read and write goes through transcript_lock.
current_transcript = ""
transcript_lock = threading.Lock()
# Background thread that processes the audio queue
def background_processor():
    """Run forever, batching queued microphone chunks and transcribing them.

    Each loop iteration pulls at most one ``(sample_rate, samples)`` tuple
    from ``audio_queue`` into a local buffer. Once ``PROCESS_INTERVAL``
    seconds have passed since the previous batch and the buffer holds at
    least ``MIN_DURATION`` seconds of audio, the buffered chunks are
    concatenated and handed to ``process_audio_chunk``. Recognised text is
    appended to the module-level ``current_transcript`` while holding
    ``transcript_lock``.

    The function never returns. Any exception inside an iteration is
    printed and the loop resumes after a one-second pause.
    """
    # The transcript is rebound below; without this declaration the
    # augmented assignment raises UnboundLocalError and nothing is stored.
    global current_transcript

    # Results that must never be appended to the transcript. The second
    # entry is the marker process_audio_chunk returns on a request error;
    # the third is the spelling this function checked for previously.
    skipped_results = ("", "[خطا اتصال]", "[خطا در اتصال]")

    accumulated = []
    last_process = time.time()
    while True:
        try:
            # Fetch one chunk if available; never block the polling loop.
            try:
                audio_tuple = audio_queue.get_nowait()
            except queue.Empty:
                audio_tuple = None
            if audio_tuple and audio_tuple[1] is not None:
                rate, data = audio_tuple
                data = np.asarray(data)
                # Only floating-point samples live in [-1, 1]. Clipping
                # integer PCM to that range would wipe out the signal.
                if np.issubdtype(data.dtype, np.floating):
                    data = np.clip(data, -1.0, 1.0)
                accumulated.append((rate, data))
                print(f"Accumulated chunk: {len(data)/rate:.2f}s")

            # Process once the interval has elapsed and enough audio exists.
            now = time.time()
            total_dur = sum(len(d) / r for r, d in accumulated)
            if now - last_process >= PROCESS_INTERVAL and total_dur >= MIN_DURATION:
                # Detach the batch first so a failure while merging or
                # transcribing cannot leave it queued for endless retries.
                batch, accumulated = accumulated, []
                last_process = now
                merged_rate = batch[0][0]
                merged_data = np.concatenate([d for _, d in batch])
                text = process_audio_chunk((merged_rate, merged_data))
                if text and text not in skipped_results:
                    with transcript_lock:
                        if current_transcript:
                            current_transcript += " " + text
                        else:
                            current_transcript = text
                    print(f"✅ Processed: {text[:50]}...")  # short log
            time.sleep(0.2)  # poll quickly for responsiveness
        except Exception as e:
            print(f"Background error: {e}")
            time.sleep(1)
# Start the background thread when this module is loaded. It is created with
# daemon=True and targets background_processor.
processor_thread = threading.Thread(target=background_processor, daemon=True)
processor_thread.start()
def numpy_to_audio_segment(audio_data, sample_rate):
    """Convert a NumPy sample array into a mono 16-bit ``AudioSegment``.

    Args:
        audio_data: Sample array, or ``None``. Floating-point input is
            treated as being in ``[-1.0, 1.0]`` and scaled to int16;
            integer input is clamped to the int16 range. A 2-D array is
            assumed to be laid out ``(samples, channels)`` and is averaged
            down to one channel (assumption to confirm against the caller).
        sample_rate: Frame rate written into the WAV header.

    Returns:
        An ``AudioSegment`` built from an in-memory WAV, or ``None`` when
        ``audio_data`` is ``None`` or empty.
    """
    if audio_data is None or len(audio_data) == 0:
        return None

    samples = np.asarray(audio_data)
    is_float = np.issubdtype(samples.dtype, np.floating)

    # The WAV header below declares one channel, so multi-channel input has
    # to be downmixed; writing it unchanged would interleave the channels
    # into a single stream and distort the audio.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if is_float:
        samples = np.clip(samples, -1.0, 1.0) * 32767
    else:
        # Keep wider integer types inside the 2-byte sample width.
        samples = np.clip(samples, -32768, 32767)
    samples = samples.astype(np.int16)

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(samples.tobytes())
    buffer.seek(0)
    return AudioSegment.from_wav(buffer)
def process_audio_chunk(audio_tuple):
    """Transcribe one ``(sample_rate, samples)`` chunk with Google recognition.

    Persian (``fa-IR``) is tried first, with English (``en-US``) as the
    fallback when no speech is understood.

    Args:
        audio_tuple: ``(sample_rate, samples)`` or ``None``.

    Returns:
        The stripped recognised text; ``""`` when the input is ``None``,
        shorter than ``MIN_DURATION`` seconds, contains no recognisable
        speech, or an unexpected error occurs; ``"[خطا اتصال]"`` when the
        recognition request itself fails.
    """
    if audio_tuple is None:
        return ""

    tmp_path = None
    try:
        sample_rate, audio_data = audio_tuple
        duration = len(audio_data) / sample_rate if sample_rate else 0
        if duration < MIN_DURATION:
            return ""
        audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
        if audio_segment is None or len(audio_segment) < MIN_DURATION * 1000:
            return ""

        # Reserve a path, close the handle, then export to that path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        audio_segment.export(tmp_path, format="wav")

        with sr.AudioFile(tmp_path) as source:
            # NOTE(review): this call reads from the same source before
            # record(), so the first ~0.3 s is presumably not transcribed;
            # confirm whether that is intended.
            recognizer.adjust_for_ambient_noise(source, duration=0.3)  # quick adjust
            audio = recognizer.record(source)

        # Google recognition. The outer try catches a RequestError from
        # either attempt; an except clause next to the fa-IR handler would
        # not see errors raised by the en-US fallback.
        try:
            try:
                text = recognizer.recognize_google(audio, language='fa-IR')  # Persian first
                # If you have a key: text = recognizer.recognize_google(audio, language='fa-IR', key="YOUR_GOOGLE_API_KEY")
            except sr.UnknownValueError:
                try:
                    text = recognizer.recognize_google(audio, language='en-US')
                except sr.UnknownValueError:
                    print("No speech in chunk")
                    text = ""
        except sr.RequestError as e:
            print(f"Google API error: {e}")
            text = "[خطا اتصال]"
        return text.strip()
    except Exception as e:
        print(f"Process error: {e}")
        return ""
    finally:
        # Remove the temp file on every exit path, including failures.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
def handle_realtime_audio(audio):
    """Enqueue a microphone chunk for the background worker.

    ``None`` input is ignored. When ``audio_queue`` is full the chunk is
    dropped and a message is printed. The return value is always a no-op
    ``gr.update()``.
    """
    if audio is not None:
        try:
            audio_queue.put_nowait(audio)
        except queue.Full:
            print("Queue full, skip")
    return gr.update()
def get_current_transcript():
    """Return the current transcript, read while holding ``transcript_lock``."""
    with transcript_lock:
        snapshot = current_transcript
    return snapshot
def clear_transcript():
    """Reset the transcript, discard queued audio, and return ``""``."""
    global current_transcript
    with transcript_lock:
        current_transcript = ""
    # Drain whatever is still waiting in the queue.
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            break
    return ""
# File transcription (unchanged)
def transcribe_file(audio_file, chunk_duration=30):
    """Transcribe an audio file in fixed-length chunks, yielding progress.

    Args:
        audio_file: Path to the audio file, or ``None``.
        chunk_duration: Chunk length in seconds. Float values are accepted
            and truncated to whole milliseconds.

    Yields:
        ``(text_so_far, status_message)`` tuples: one per processed chunk,
        then a final completion tuple. When ``audio_file`` is ``None`` a
        single prompt tuple is yielded. When loading or splitting fails, a
        single error tuple is yielded.
    """
    if audio_file is None:
        yield "لطفاً فایل آپلود کنید", ""
        return
    try:
        audio = AudioSegment.from_file(audio_file)
        duration_ms = len(audio)
        # range() below needs an integer step, and chunk_duration may
        # arrive as a float.
        chunk_ms = int(chunk_duration * 1000)
        all_text = []
        num_chunks = (duration_ms + chunk_ms - 1) // chunk_ms
        for i in range(0, duration_ms, chunk_ms):
            chunk = audio[i:i + chunk_ms]
            tmp_path = None
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                    tmp_path = tmp.name
                chunk.export(tmp_path, format="wav")
                try:
                    with sr.AudioFile(tmp_path) as source:
                        audio_data = recognizer.record(source)
                    # The outer try catches a RequestError from either the
                    # fa-IR attempt or the en-US fallback.
                    try:
                        try:
                            text = recognizer.recognize_google(audio_data, language='fa-IR')
                        except sr.UnknownValueError:
                            try:
                                text = recognizer.recognize_google(audio_data, language='en-US')
                            except sr.UnknownValueError:
                                text = ""
                    except sr.RequestError:
                        text = "[خطا اتصال]"
                    if text and text != "[خطا اتصال]":
                        all_text.append(text)
                except Exception as e:
                    # A failed chunk is logged and skipped, not fatal.
                    print(f"File chunk error: {e}")
            finally:
                # Remove the temp file even when export or recognition raised.
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            progress = min((i + chunk_ms) / duration_ms * 100, 100)
            yield " ".join(all_text), f"پیشرفت: {progress:.1f}% - بخش {(i // chunk_ms)+1} از {num_chunks}"
            time.sleep(0.5)
        yield " ".join(all_text), "کامل شد! ✅"
    except Exception as e:
        yield f"خطا: {e}", "خطا ❌"
def save_text(text):
    """Write ``text`` to a new UTF-8 ``.txt`` temp file and return its path.

    Args:
        text: Text to save; may be ``None``.

    Returns:
        The path of the created file, or ``None`` when ``text`` is ``None``,
        empty, or whitespace only. The file is created with ``delete=False``,
        so it is not removed automatically.
    """
    # Guard None explicitly: calling .strip() on it would raise.
    if not text or not text.strip():
        return None
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.txt', encoding='utf-8'
    ) as temp_file:
        temp_file.write(text)
    return temp_file.name
# Gradio UI (Fixed: Separate clicks, Timer with load event)
# Two-tab UI: a live microphone tab backed by the background worker, and a
# file-upload tab that streams chunk-by-chunk progress.
with gr.Blocks(
    title="تبدیل گفتار به متن - Real-time Fixed",
    theme=gr.themes.Soft(),
    css="""
.gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
.rtl { direction: rtl; text-align: right; }
"""
) as demo:
    gr.HTML("""
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
<h1 style="font-size: 2.5em; margin-bottom: 0.5em;">🎤 تبدیل گفتار به متن</h1>
<p style="font-size: 1.1em; color: #666; margin-bottom: 2em;">
Real-time با Google API - پردازش background | قابل توزیع روی مرورگرها
</p>
</div>
""")
    with gr.Tabs():
        with gr.TabItem("🎙️ ضبط مستقیم", id="realtime_tab") as realtime_tab:
            gr.Markdown("### فعال کنید و 5+ ثانیه واضح صحبت کنید (متن هر 3s آپدیت میشه)", elem_classes="rtl")
            with gr.Row():
                # NOTE(review): without streaming, the change event likely
                # fires only when a recording is finished; confirm whether
                # chunked streaming input is wanted here.
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="میکروفون (ضبط رو شروع کن)",
                    elem_classes="rtl"
                )
                realtime_output = gr.Textbox(
                    label="متن live",
                    placeholder="پس از 3-5s صحبت، متن ظاهر میشه...",
                    lines=10,
                    elem_classes="rtl",
                    rtl=True,
                    show_copy_button=True,
                    interactive=False
                )
            clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")

            # New audio is queued for the worker; the handler returns a
            # no-op update, so the textbox keeps its current content.
            audio_input.change(
                handle_realtime_audio,
                inputs=[audio_input],
                outputs=[realtime_output]
            )

            # Timer that polls the transcript for the live view.
            timer = gr.Timer(value=2.0, active=False)

            def start_timer():
                """Activate the polling timer and return the current transcript."""
                # A Timer is switched on by returning an updated Timer to
                # the component as an event output.
                return gr.Timer(active=True), get_current_transcript()

            realtime_tab.select(start_timer, outputs=[timer, realtime_output])
            # Also start on page load: the initially shown tab does not
            # emit a select event.
            demo.load(start_timer, outputs=[timer, realtime_output])
            timer.tick(get_current_transcript, outputs=[realtime_output])
            clear_btn.click(clear_transcript, outputs=[realtime_output])

        with gr.TabItem("📁 فایل صوتی"):
            gr.Markdown("### فایل آپلود کن و تبدیل کن", elem_classes="rtl")
            with gr.Row():
                file_input = gr.Audio(sources=["upload"], type="filepath", label="فایل صوتی", elem_classes="rtl")
                chunk_slider = gr.Slider(10, 60, 30, 5, label="بخشبندی (s)", elem_classes="rtl")
            process_btn = gr.Button("🚀 تبدیل", variant="primary")
            progress_label = gr.Textbox(label="وضعیت", interactive=False, elem_classes="rtl")
            file_output = gr.Textbox(label="متن", lines=10, elem_classes="rtl", rtl=True, show_copy_button=True)
            with gr.Row():
                save_btn = gr.Button("💾 ذخیره", variant="secondary")
                clear_file_btn = gr.Button("🗑️ پاک", variant="secondary")
            download_file = gr.File(label="دانلود TXT", visible=False, elem_classes="rtl")

            process_btn.click(transcribe_file, [file_input, chunk_slider], [file_output, progress_label])
            save_btn.click(save_text, file_output, download_file).then(lambda: gr.update(visible=True), outputs=[download_file])
            # The components must be passed as outputs; positionally they
            # would be treated as inputs to a zero-argument lambda.
            clear_file_btn.click(lambda: ("", ""), outputs=[file_output, progress_label])

    with gr.Accordion("📖 راهنما", open=False, elem_classes="rtl"):
        gr.Markdown("""
### استفاده:
- **Real-time**: تب رو باز کن، میکروفون فعال کن، 5s+ صحبت کن. هر 3s متن آپدیت میشه (background).
- **فایل**: آپلود و دکمه بزن.
### نکات:
- 🗣️ واضح صحبت کن، نویز کم.
- 🌐 اینترنت پایدار (Google API).
- 📱 روی موبایل/دسکتاپ کار میکنه (mic access بده).
- ⚠️ محدودیت Google: ~60 req/min - برای حجم بالا، API key اضافه کن (در کد کامنت).
- توزیع: share لینک رو share کن، همه browserها ساپورت.
""", elem_classes="rtl")
    gr.HTML('<div style="text-align: center; margin-top: 2em; padding: 1em; background: #f8f9fa;"><p style="color: #666;">نسخه 2.3 - Fixed Timer & Clicks | Google Backend</p></div>')
if __name__ == "__main__":
    # Enable the request queue, then serve on all interfaces at port 7860
    # with a public share link and error display turned on.
    demo.queue().launch(share=True, show_error=True, server_name="0.0.0.0", server_port=7860)