Spaces:

suprimedev
/

speh32

Sleeping

App Files Files Community

suprimedev committed on 29 days ago

Commit

b2d26db

verified ·

1 Parent(s): e92b5fd

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -43

app.py CHANGED Viewed

@@ -14,17 +14,79 @@ import time
 recognizer = sr.Recognizer()
 recognizer.energy_threshold = 300
 recognizer.dynamic_energy_threshold = True
-# صف برای پردازش real-time
-audio_queue = queue.Queue()
 def numpy_to_audio_segment(audio_data, sample_rate):
-    """تبدیل numpy array به AudioSegment"""
-    if audio_data is None:
         return None
-    # نرمال‌سازی و تبدیل به int16
     if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
         audio_data = (audio_data * 32767).astype(np.int16)
     # ذخیره در buffer موقت
@@ -39,16 +101,20 @@ def numpy_to_audio_segment(audio_data, sample_rate):
     return AudioSegment.from_wav(buffer)
 def process_audio_chunk(audio_tuple):
-    """پردازش یک قطعه صوتی با Google Speech Recognition"""
     try:
         if audio_tuple is None:
             return ""
         sample_rate, audio_data = audio_tuple
         # تبدیل به AudioSegment
         audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
-        if audio_segment is None:
             return ""
         # ذخیره موقت برای speech recognition
@@ -56,8 +122,9 @@ def process_audio_chunk(audio_tuple):
             audio_segment.export(tmp_file.name, format="wav")
             tmp_path = tmp_file.name
-        # استفاده از speech recognition
         with sr.AudioFile(tmp_path) as source:
             audio = recognizer.record(source)
         # تشخیص با اولویت فارسی
@@ -69,55 +136,53 @@ def process_audio_chunk(audio_tuple):
             try:
                 text = recognizer.recognize_google(audio, language='en-US')
             except sr.UnknownValueError:
-                text = ""
-        except sr.RequestError:
             text = "[خطا در اتصال]"
         # پاک کردن فایل موقت
         if os.path.exists(tmp_path):
             os.unlink(tmp_path)
-        return text
     except Exception as e:
         print(f"خطا در پردازش: {e}")
         return ""
-# متغیر global برای ذخیره متن real-time
-current_transcript = ""
-transcript_lock = threading.Lock()
 def transcribe_realtime(audio):
-    """تبدیل real-time صوت به متن"""
     global current_transcript
     if audio is None:
         return current_transcript
-    # پردازش در thread جداگانه برای جلوگیری از blocking
-    def process_async():
-        global current_transcript
-        text = process_audio_chunk(audio)
-        if text and text != "[خطا در اتصال]":
-            with transcript_lock:
-                if current_transcript:
-                    current_transcript += " " + text
-                else:
-                    current_transcript = text
-    thread = threading.Thread(target=process_async)
-    thread.daemon = True
-    thread.start()
-    return current_transcript
 def clear_realtime():
     """پاک کردن متن real-time"""
     global current_transcript
     with transcript_lock:
         current_transcript = ""
     return ""
 def transcribe_file(audio_file, chunk_duration=30):
     """تبدیل فایل صوتی به متن با تقسیم به بخش‌ها"""
     if audio_file is None:
@@ -198,7 +263,7 @@ def save_text(text):
     return temp_file.name
-# رابط کاربری Gradio
 with gr.Blocks(
     title="تبدیل گفتار به متن",
     theme=gr.themes.Soft(),
@@ -248,14 +313,14 @@ with gr.Blocks(
             realtime_output = gr.Textbox(
                 label="متن تشخیص داده شده",
-                placeholder="شروع به صحبت کنید و متن اینجا ظاهر می‌شود...",
                 lines=12,
                 elem_classes="rtl",
                 rtl=True,
                 show_copy_button=True
             )
-            # اتصال برای real-time transcription
             audio_input.stream(
                 transcribe_realtime,
                 inputs=[audio_input],
@@ -268,7 +333,7 @@ with gr.Blocks(
                 outputs=[realtime_output]
             )
-        # تب فایل صوتی
         with gr.TabItem("📁 فایل صوتی"):
             gr.Markdown("### فایل صوتی خود را انتخاب کنید", elem_classes="rtl")
@@ -349,16 +414,16 @@ with gr.Blocks(
                 outputs=[file_output, progress_label]
             )
-    # بخش راهنما
     with gr.Accordion("📖 راهنمای استفاده", open=False, elem_classes="rtl"):
         gr.Markdown("""
         ### نحوه استفاده:
-        **برای ضبط مستقیم:**
         1. به تب "ضبط مستقیم" بروید
         2. اجازه دسترسی به میکروفون را بدهید
-        3. شروع به صحبت کنید
-        4. متن به صورت خودکار نمایش داده می‌شود
         **برای فایل صوتی:**
         1. به تب "فایل صوتی" بروید
@@ -373,23 +438,23 @@ with gr.Blocks(
         ### نکات مهم:
         - 🎯 برای دقت بیشتر، از فایل‌های با کیفیت بالا استفاده کنید
         - 🔇 نویز پس‌زمینه را به حداقل برسانید
-        - 🗣️ واضح و شمرده صحبت کنید
         - 🌐 اتصال اینترنت پایدار داشته باشید
         """, elem_classes="rtl")
     # فوتر
     gr.HTML("""
     <div style="text-align: center; margin-top: 2em; padding: 1em; background-color: #f8f9fa;">
         <p style="color: #666;">
-            ساخته شده با ❤️ | نسخه 2.0
         </p>
     </div>
     """)
 # اجرای برنامه
 if __name__ == "__main__":
     demo.queue().launch(
         share=True,
         show_error=True
-    )

 recognizer = sr.Recognizer()
 recognizer.energy_threshold = 300
 recognizer.dynamic_energy_threshold = True
+# Added for real-time use, intended to improve pause detection.
+# NOTE(review): the visible code only calls recognizer.record(); confirm that
+# pause_threshold has any effect outside Recognizer.listen().
+recognizer.pause_threshold = 0.8
+# Queue that collects streamed real-time chunks until the background worker
+# merges them (the author expects each chunk to be roughly 0.5-1 s -- unverified).
+audio_queue = queue.Queue(maxsize=20)  # bounded so a stalled consumer cannot grow it without limit
+CHUNK_PROCESS_INTERVAL = 3.0  # seconds between transcription passes over the accumulated chunks
+MIN_AUDIO_DURATION = 2.0  # seconds of audio required before a transcription is attempted
+# Module-level transcript shown in the real-time tab; guarded by transcript_lock.
+current_transcript = ""
+transcript_lock = threading.Lock()
+# Background thread that continuously processes the queue.
+def background_processor():
+    """Poll ``audio_queue`` forever, merging streamed chunks and transcribing them.
+
+    Intended to run as a daemon thread. Each cycle drains every chunk currently
+    in the queue into a local buffer. Once at least ``CHUNK_PROCESS_INTERVAL``
+    seconds have passed since the previous pass and the buffer holds at least
+    ``MIN_AUDIO_DURATION`` seconds of audio, the chunks are concatenated, sent
+    to ``process_audio_chunk`` and any recognised text is appended to the
+    module-level ``current_transcript`` under ``transcript_lock``.
+
+    The function never returns; unexpected errors are printed and the loop
+    continues after a short back-off.
+    """
+    # Required: the transcript is rebound below. Without this declaration the
+    # name is local to the function and the first read raises UnboundLocalError,
+    # which the broad handler at the bottom would silently swallow.
+    global current_transcript
+    accumulated_audio = []  # (sample_rate, samples) chunks awaiting transcription
+    last_process_time = time.time()
+    while True:
+        try:
+            # Drain everything that is queued so a slow recognition call does
+            # not leave a backlog that overflows the bounded queue.
+            while True:
+                try:
+                    audio_tuple = audio_queue.get_nowait()
+                except queue.Empty:
+                    break
+                if not audio_tuple:
+                    continue
+                sample_rate, audio_data = audio_tuple
+                if audio_data is None or len(audio_data) == 0:
+                    continue
+                audio_data = np.asarray(audio_data)
+                # Only float audio is normalised to [-1, 1]. Clipping integer
+                # PCM samples to that range would wipe out the signal.
+                if np.issubdtype(audio_data.dtype, np.floating):
+                    audio_data = np.clip(audio_data, -1.0, 1.0)
+                accumulated_audio.append((sample_rate, audio_data))
+                print(f"Chunk accumulated: {len(audio_data)/sample_rate:.2f}s")  # log
+            # Transcribe once per interval, and only when enough audio is buffered.
+            current_time = time.time()
+            total_duration = sum(len(data) / rate for rate, data in accumulated_audio)
+            if (accumulated_audio
+                    and current_time - last_process_time >= CHUNK_PROCESS_INTERVAL
+                    and total_duration >= MIN_AUDIO_DURATION):
+                # Detach the buffer before the slow call so a failure cannot
+                # cause the same audio to be re-submitted on every poll.
+                pending, accumulated_audio = accumulated_audio, []
+                last_process_time = current_time
+                merged_rate = pending[0][0]  # assumes a constant sample rate across chunks
+                merged_data = np.concatenate([data for _, data in pending])
+                text = process_audio_chunk((merged_rate, merged_data))
+                if text and text != "[خطا در اتصال]":
+                    with transcript_lock:
+                        if current_transcript:
+                            current_transcript += " " + text
+                        else:
+                            current_transcript = text
+                    print(f"Processed: {text}")  # log
+            time.sleep(0.1)  # poll every 100 ms
+        except Exception as e:
+            # Top-level boundary of the worker thread: report and keep running.
+            print(f"Error in background processor: {e}")
+            time.sleep(0.5)
+# Start the background worker once, when this module is executed. It is a
+# daemon thread, so it does not keep the interpreter alive on shutdown.
+processor_thread = threading.Thread(target=background_processor, daemon=True)
+processor_thread.start()
 def numpy_to_audio_segment(audio_data, sample_rate):
+    """تبدیل numpy array به AudioSegment (بهبود: clip و handling بهتر)"""
+    if audio_data is None or len(audio_data) == 0:
         return None
+    # نرمال‌سازی و clip به int16
     if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
+        audio_data = np.clip(audio_data, -1.0, 1.0)  # clip برای جلوگیری از distortion
         audio_data = (audio_data * 32767).astype(np.int16)
     # ذخیره در buffer موقت
     return AudioSegment.from_wav(buffer)
 def process_audio_chunk(audio_tuple):
+    """پردازش یک قطعه صوتی با Google Speech Recognition (بهبود: handling chunk کوتاه)"""
     try:
         if audio_tuple is None:
             return ""
         sample_rate, audio_data = audio_tuple
+        duration = len(audio_data) / sample_rate if sample_rate > 0 else 0
+        if duration < MIN_AUDIO_DURATION:
+            return ""  # skip chunk‌های خیلی کوتاه
         # تبدیل به AudioSegment
         audio_segment = numpy_to_audio_segment(audio_data, sample_rate)
+        if audio_segment is None or len(audio_segment) < MIN_AUDIO_DURATION * 1000:
             return ""
         # ذخیره موقت برای speech recognition
             audio_segment.export(tmp_file.name, format="wav")
             tmp_path = tmp_file.name
+        # استفاده از speech recognition با adjust برای chunk
         with sr.AudioFile(tmp_path) as source:
+            recognizer.adjust_for_ambient_noise(source, duration=0.5)  # adjust noise برای real-time
             audio = recognizer.record(source)
         # تشخیص با اولویت فارسی
             try:
                 text = recognizer.recognize_google(audio, language='en-US')
             except sr.UnknownValueError:
+                text = ""  # بهبود: لاگ
+                print("No speech detected in chunk")
+        except sr.RequestError as e:
             text = "[خطا در اتصال]"
+            print(f"Google API error: {e}")
         # پاک کردن فایل موقت
         if os.path.exists(tmp_path):
             os.unlink(tmp_path)
+        return text.strip()
     except Exception as e:
         print(f"خطا در پردازش: {e}")
         return ""
 def transcribe_realtime(audio):
+    """Queue one streamed audio chunk and return the transcript so far.
+
+    ``audio`` is put on ``audio_queue`` without blocking; if the queue is full
+    the chunk is dropped and a message is printed. The chunk is not transcribed
+    here -- this function only returns the current value of
+    ``current_transcript``. When ``audio`` is None the transcript is returned
+    unchanged and nothing is queued.
+    """
     global current_transcript
     if audio is None:
         return current_transcript
+    # Add to queue (non-blocking)
+    try:
+        audio_queue.put(audio, block=False)
+    except queue.Full:
+        print("Queue full, skipping chunk")  # log
+    # Return the current transcript so the UI can refresh.
+    with transcript_lock:
+        return current_transcript
 def clear_realtime():
     """Reset the real-time transcript and return an empty string."""
     global current_transcript
     with transcript_lock:
         current_transcript = ""
+    # Also discard any chunks still waiting in the queue.
+    # NOTE(review): chunks that background_processor has already moved into its
+    # local accumulator are not affected by this drain -- confirm whether stale
+    # audio appearing after a clear is acceptable.
+    while not audio_queue.empty():
+        try:
+            audio_queue.get_nowait()
+        except queue.Empty:
+            break
     return ""
+# توابع file و UI بدون تغییر (برای اختصار، اما کامل هستند)
 def transcribe_file(audio_file, chunk_duration=30):
     """تبدیل فایل صوتی به متن با تقسیم به بخش‌ها"""
     if audio_file is None:
     return temp_file.name
+# رابط کاربری Gradio (بدون تغییر، اما اتصال real-time بهبود یافته)
 with gr.Blocks(
     title="تبدیل گفتار به متن",
     theme=gr.themes.Soft(),
             realtime_output = gr.Textbox(
                 label="متن تشخیص داده شده",
+                placeholder="شروع به صحبت کنید و متن اینجا ظاهر می‌شود... (پس از 3-5 ثانیه)",
                 lines=12,
                 elem_classes="rtl",
                 rtl=True,
                 show_copy_button=True
             )
+            # اتصال برای real-time transcription (بهبود: stream هر chunk به queue)
             audio_input.stream(
                 transcribe_realtime,
                 inputs=[audio_input],
                 outputs=[realtime_output]
             )
+        # تب فایل صوتی (بدون تغییر)
         with gr.TabItem("📁 فایل صوتی"):
             gr.Markdown("### فایل صوتی خود را انتخاب کنید", elem_classes="rtl")
                 outputs=[file_output, progress_label]
             )
+    # بخش راهنما (بهبود: نکته real-time)
     with gr.Accordion("📖 راهنمای استفاده", open=False, elem_classes="rtl"):
         gr.Markdown("""
         ### نحوه استفاده:
+        **برای ضبط مستقیم (بهبود یافته):**
         1. به تب "ضبط مستقیم" بروید
         2. اجازه دسترسی به میکروفون را بدهید
+        3. حداقل 3-5 ثانیه واضح صحبت کنید (chunk‌ها جمع می‌شوند)
+        4. متن هر 3 ثانیه آپدیت می‌شود
         **برای فایل صوتی:**
         1. به تب "فایل صوتی" بروید
         ### نکات مهم:
         - 🎯 برای دقت بیشتر، از فایل‌های با کیفیت بالا استفاده کنید
         - 🔇 نویز پس‌زمینه را به حداقل برسانید
+        - 🗣️ واضح و شمرده صحبت کنید (حداقل 3 ثانیه)
         - 🌐 اتصال اینترنت پایدار داشته باشید
+        - ⚠️ Real-time ممکن است 1-2 ثانیه تاخیر داشته باشد
         """, elem_classes="rtl")
     # فوتر
     gr.HTML("""
     <div style="text-align: center; margin-top: 2em; padding: 1em; background-color: #f8f9fa;">
         <p style="color: #666;">
+            ساخته شده با ❤️ | نسخه 2.1 (بهبود real-time)
         </p>
     </div>
     """)
 # اجرای برنامه
 if __name__ == "__main__":
     demo.queue().launch(
         share=True,
         show_error=True
+    )