suprimedev committed on
Commit
7f63304
·
verified ·
1 Parent(s): d57ef9c

Update app.py

Files changed (1)
  app.py +170 -177
app.py CHANGED
@@ -1,199 +1,192 @@
 import gradio as gr
-import torch
-from transformers import pipeline, AutoProcessor, WhisperForConditionalGeneration
-from datasets import load_dataset
-import tempfile
 import threading
 import queue
 import os
-from typing import Optional, Tuple
-import warnings
-warnings.filterwarnings("ignore")  # Suppress minor warnings for cleaner output
-
-# Load Whisper model (small for speed on CPU; use "base" or "medium" for better accuracy)
-# For HF Spaces: This auto-downloads on first run; caches for reuse.
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-small",  # Multilingual, good for Farsi; swap to "openai/whisper-base" for lighter/faster
-    return_timestamps=False,
-    generate_kwargs={"language": None},  # Allow auto-detection or override
-    device=device if device == "cuda:0" else -1  # Use CPU if no GPU
-)
-
-# Supported languages (Whisper language codes; auto-detects if not specified)
-LANGUAGES = [
-    ("Auto-Detect", None),  # Let model guess language
-    ("English", "en"),
-    ("Spanish", "es"),
-    ("French", "fr"),
-    ("German", "de"),
-    ("Italian", "it"),
-    ("Portuguese", "pt"),
-    ("Dutch", "nl"),
-    ("Russian", "ru"),
-    ("Chinese", "zh"),
-    ("Japanese", "ja"),
-    ("Korean", "ko"),
-    ("Arabic", "ar"),
-    ("Hindi", "hi"),
-    ("Persian (Farsi)", "fa"),  # Excellent Farsi support
-    # Whisper supports 99+; add more or use custom
-]
-
-# Queue for background processing
-transcription_queue = queue.Queue()
-
-def process_audio_whisper(audio_bytes: bytes, language_code: Optional[str]) -> str:
-    """Process audio bytes to text using Whisper in background."""
-    try:
-        # Save bytes to temp WAV (Whisper expects file path or audio array; here we use file)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
-            temp_file.write(audio_bytes)
-            temp_file_path = temp_file.name
-
-        # Transcribe with Whisper
-        result = pipe(temp_file_path, generate_kwargs={"language": language_code, "task": "transcribe"})
-        text = result["text"].strip()
-
-        # Clean up
-        os.unlink(temp_file_path)
-
-        return text if text else "[No speech detected]"
     except Exception as e:
-        os.unlink(temp_file_path) if 'temp_file_path' in locals() and os.path.exists(temp_file_path) else None
-        return f"[Error: {str(e)}]"
-
-def transcribe_live(audio: Optional[bytes], language_code: str, use_custom: bool, custom_lang: str, transcription_history: str) -> Tuple[str, str]:
-    """
-    Main Gradio function: Processes audio in background thread for 'live' feel.
-    - Audio from mic as bytes.
-    - Appends to history on each clip (record/stop triggers).
-    - For streaming: Whisper isn't native-streaming; use chunks or integrate faster models like Faster-Whisper.
-    """
-    if audio is None:
-        return transcription_history, "[Please record audio]"
-
-    # Determine final language (None for auto-detect)
-    final_lang = None
-    if not use_custom:
-        final_lang = language_code
-    elif custom_lang.strip():
-        final_lang = custom_lang.strip()
-
-    # Background processing
-    result_queue = queue.Queue()
-    def background_worker():
-        result = process_audio_whisper(audio, final_lang)
-        result_queue.put(result)
-
-    thread = threading.Thread(target=background_worker)
-    thread.daemon = True
-    thread.start()
-
-    # Wait for result (timeout for responsiveness)
-    try:
-        new_text = result_queue.get(timeout=15)  # Whisper small: ~5-10s per clip on CPU
-        updated_history = f"{transcription_history}\n{new_text}" if transcription_history and new_text != "[No speech detected]" else new_text
-        status = f"Transcribed: {new_text}" if new_text else "[Processing complete]"
-        return updated_history, status
-    except queue.Empty:
-        return transcription_history, "[Timed out; try shorter audio]"
-    except Exception as e:
-        return transcription_history, f"[Unexpected error: {str(e)}]"
-
-# Gradio Interface
-with gr.Blocks(title="Live STT with Whisper (HF Transformers)") as demo:
     gr.Markdown("""
-    # Multilingual Live Speech-to-Text App with Whisper
-    Record or upload audio for transcription. Uses OpenAI Whisper (small model) for ~100 languages, including excellent Farsi support.
-    Processes in background. Auto-detects language or specify via dropdown/custom.
-    **Tip**: Speak clearly in short clips (5-15s) for best results on CPU.
     """)
-
-    with gr.Row():
-        audio_input = gr.Audio(
-            sources=["microphone", "upload"],  # Allow mic or file upload
-            type="bytes",
-            label="Record/Upload Audio",
-            info="Click mic to record (stop to transcribe). Upload WAV/MP3 for batch."
         )
-
-    with gr.Row():
-        lang_dropdown = gr.Dropdown(
-            choices=LANGUAGES,
-            value=LANGUAGES[0][1],  # Default: Auto-Detect
-            label="Language",
-            info="Select or auto-detect. Farsi: 'fa'."
         )
-        use_custom_checkbox = gr.Checkbox(
-            label="Use Custom Language Code",
-            value=False,
-            info="Enable for manual override (e.g., 'fa' for Farsi, 'en' for English)."
         )
-        custom_lang_input = gr.Textbox(
-            label="Custom Language (e.g., 'fa')",
-            placeholder="fa",
-            visible=False,
-            info="Whisper codes: See [docs](https://huggingface.co/openai/whisper-small)."
         )
-
-    # Toggle custom input visibility
-    use_custom_checkbox.change(
-        fn=lambda visible: gr.update(visible=visible),
-        inputs=[use_custom_checkbox],
-        outputs=[custom_lang_input]
-    )
-
-    # Outputs
-    history_output = gr.Textbox(
-        label="Transcription History",
-        lines=10,
-        interactive=False,
-        placeholder="Transcriptions append here (RTL support for Farsi/Arabic)..."
-    )
-    status_output = gr.Textbox(
-        label="Status",
-        interactive=False,
-        placeholder="Ready to transcribe..."
-    )
-
-    # Buttons
-    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
-    clear_btn = gr.Button("Clear History", variant="secondary")
-
-    # Event: Live on audio change (triggers on record stop or upload)
-    audio_input.change(
-        fn=transcribe_live,
-        inputs=[audio_input, lang_dropdown, use_custom_checkbox, custom_lang_input, history_output],
-        outputs=[history_output, status_output],
-        live=True
-    )
-
-    # Manual button (for re-processing or after UI changes)
-    transcribe_btn.click(
-        fn=transcribe_live,
-        inputs=[audio_input, lang_dropdown, use_custom_checkbox, custom_lang_input, history_output],
-        outputs=[history_output, status_output]
-    )
-
-    # Clear
-    clear_btn.click(
-        fn=lambda: ("", "History cleared"),
-        outputs=[history_output, status_output]
-    )
-
-    # Example/Info
-    gr.Markdown("""
-    ### Quick Test for Farsi
-    - Select "Persian (Farsi)" or type "fa".
-    - Record: Say "سلام، این یک تست است" (Hello, this is a test).
-    - Output should be in Persian script.
-
-    **Performance**: On HF Spaces (CPU), ~2-10s per 10s clip. For faster, use "openai/whisper-tiny" or GPU Spaces.
-    **Limitations**: Not real-time streaming (chunk-based). For live streaming, consider Faster-Whisper + WebSockets.
-    """)
-
-# Launch (for local/HF Spaces)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True)
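The removed version's Limitations note suggests Faster-Whisper for lower-latency transcription. A minimal sketch of that route (a sketch only, assuming the separate faster-whisper package and a local clip.wav; this commit uses neither):

    from faster_whisper import WhisperModel

    # int8 on CPU keeps memory low; "small" matches the removed Whisper checkpoint
    model = WhisperModel("small", device="cpu", compute_type="int8")
    segments, info = model.transcribe("clip.wav", language="fa", vad_filter=True)
    print(" ".join(segment.text for segment in segments))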
 import gradio as gr
+import speech_recognition as sr
+import numpy as np
+from pydub import AudioSegment
+import io
+import wave
+import tempfile  # needed by process_audio_chunk below
 import threading
 import queue
+import time
 import os

+# Initial recognizer settings
+recognizer = sr.Recognizer()
+recognizer.energy_threshold = 300
+recognizer.dynamic_energy_threshold = True
+recognizer.dynamic_energy_ratio = 1.5
+
+# Queues for asynchronous processing
+audio_queue = queue.Queue()
+transcript_queue = queue.Queue()
+
+# Shared state for the live transcript display
+current_transcript = ""
+current_transcript_lock = threading.Lock()
+
+def convert_numpy_to_wav(audio_data, sample_rate=16000):
+    """Convert a NumPy array to a WAV-backed AudioSegment, with normalization."""
+    buffer = io.BytesIO()
+    with wave.open(buffer, 'wb') as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
+        wav_file.setframerate(sample_rate)
+        # Scale to int16; assumes float samples in [-1, 1]
+        wav_file.writeframes(np.int16(audio_data * 32767))
+    buffer.seek(0)
+    return AudioSegment.from_wav(buffer)
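+
+# Usage sketch for convert_numpy_to_wav (hypothetical input): one second of
+# 16 kHz float mono audio in [-1, 1] yields a 1 s mono AudioSegment, e.g.
+#   convert_numpy_to_wav(np.zeros(16000, dtype=np.float32))
+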
 
37
+ def process_audio_chunk(audio_chunk):
38
+ """پردازش یک قطعه صوتی"""
39
+ try:
40
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
41
+ tmp_path = tmp_file.name
42
+ audio_segment = convert_numpy_to_wav(audio_chunk)
43
+ audio_segment.export(tmp_path, format="wav")
44
+
45
+ with sr.AudioFile(tmp_path) as source:
46
+ audio = recognizer.record(source)
47
+
48
+ # تلاش با فارسی اول
49
+ try:
50
+ text = recognizer.recognize_google(audio, language='fa-IR')
51
+ except sr.UnknownValueError:
52
+ # اگر فارسی معتبر نباشد، با انگلیسی تلاش کنیم
53
+ try:
54
+ text = recognizer.recognize_google(audio, language='en-US')
55
+ except:
56
+ text = ""
57
+ except sr.RequestError:
58
+ text = "[خطا در اتصال]"
59
+
60
+ os.unlink(tmp_path) # پاک کردن فایل موقت
61
+ return text.strip()
62
  except Exception as e:
63
+ print(f"خطا در پردازش: {e}")
64
+ return ""
65
 
+def monitor_audio(audio_input):
+    """Chunk the incoming microphone audio and queue it for transcription."""
+    # gr.Audio streaming with type="numpy" delivers (sample_rate, samples)
+    sample_rate, data = audio_input
+    if data.dtype == np.int16:
+        data = data.astype(np.float32) / 32768.0  # normalize for convert_numpy_to_wav
+    for i in range(0, len(data), sample_rate):
+        chunk = data[i:i + sample_rate]
+        if len(chunk) < sample_rate:
+            continue  # skip the incomplete trailing chunk
+        audio_queue.put((chunk, sample_rate))
+
+def update_transcript():
+    """Drain the transcript queue and return the accumulated text."""
+    global current_transcript
+    while not transcript_queue.empty():
+        new_text = transcript_queue.get()
+        with current_transcript_lock:
+            current_transcript = " ".join((current_transcript + " " + new_text).split())
+    with current_transcript_lock:
+        return current_transcript
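+
+# Queue consumer sketch (hypothetical helper `transcription_worker`): monitor_audio
+# above fills audio_queue, but nothing else in this commit drains it, so chunks
+# would never reach transcript_queue without a worker along these lines.
+def transcription_worker():
+    while True:
+        chunk, rate = audio_queue.get()  # blocks until a chunk is queued
+        text = process_audio_chunk(chunk, rate)
+        if text:
+            transcript_queue.put(text)
+
+threading.Thread(target=transcription_worker, daemon=True).start()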
 
+# Gradio UI
+with gr.Blocks(title="گستره گفتار به متن", theme=gr.themes.Soft(), css="""
+    .gradio-container { font-family: 'Vazir', 'Tahoma', sans-serif !important; }
+    .rtl { direction: rtl; text-align: right; }
+""") as demo:
+
+    # Main page
     gr.Markdown("""
+    # 🎤 تبدیل گفتار به متن
+    ابزار قدرتمند تبدیل صدا به متن با پشتیبانی از زبان فارسی و انگلیسی
     """)
+
+    with gr.Tabs():
+        # Live recording tab
+        with gr.TabItem("🎙️ ضبط مستقیم"):
+            gr.Markdown("### میکروفون خود را فعال کرده و شروع به صحبت کنید")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    audio_input = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        streaming=True,
+                        label="میکروفون",
+                        show_label=True
+                    )
+                with gr.Column(scale=1):
+                    clear_btn = gr.Button("🗑️ پاک کردن متن", variant="secondary")
+                    realtime_output = gr.Textbox(
+                        label="متن تشخیص داده شده",
+                        placeholder="شروع به صحبت کنید و متن اینجا ظاهر می‌شود...",
+                        lines=12,
+                        elem_classes="rtl",
+                        rtl=True,
+                        show_copy_button=True
+                    )
+            clear_btn.click(lambda: "", outputs=[realtime_output])
+            # Push streamed chunks into the audio queue
+            audio_input.stream(
+                monitor_audio,
+                inputs=[audio_input],
+                outputs=[],
+                every=0.1
+            )
+            # Refresh the textbox from the transcript queue
+            audio_input.stream(
+                update_transcript,
+                inputs=[],
+                outputs=[realtime_output],
+                every=0.1
+            )
+
+        # Audio file tab
+        with gr.TabItem("📁 فایل صوتی"):
+            gr.Markdown("### فایل صوتی خود را انتخاب کنید")
+            with gr.Row():
+                with gr.Column(scale=3):
+                    file_input = gr.Audio(
+                        sources=["upload"],
+                        type="filepath",
+                        label="انتخاب فایل صوتی",
+                        elem_classes="rtl"
+                    )
+                with gr.Column(scale=1):
+                    chunk_duration = gr.Slider(
+                        minimum=10, maximum=60, value=30, step=5,
+                        label="مدت هر بخش (ثانیه)"
+                    )
+                    process_btn = gr.Button("🚀 شروع تبدیل", variant="primary")
+                    status_label = gr.Textbox(label="وضعیت پردازش", interactive=False)
+                with gr.Column(scale=1):
+                    save_btn = gr.Button("💾 ذخیره متن")
+                    clear_file_btn = gr.Button("🗑️ پاک کردن")
+                    download_file = gr.File(label="دانلود فایل متن", visible=False)
+
+            def process_file(audio_file, duration):
+                try:
+                    audio = AudioSegment.from_file(audio_file)
+                    results = []
+                    for i in range(0, len(audio), duration * 1000):
+                        # Mix down to mono and normalize int16 samples to floats
+                        chunk = audio[i:i + duration * 1000].set_channels(1)
+                        samples = np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0
+                        results.append(process_audio_chunk(samples, chunk.frame_rate))
+                    return " ".join(results), "تکمیل پردازش ✅"
+                except Exception as e:
+                    return f"خطا: {str(e)}", "خطای پردازش ❌"
+
+            def save_transcript(text):
+                # Write the transcript to a temp file so gr.File can serve it
+                path = os.path.join(tempfile.gettempdir(), "transcript.txt")
+                with open(path, "w", encoding="utf-8") as f:
+                    f.write(text or "")
+                return path
+
+            process_btn.click(
+                process_file,
+                inputs=[file_input, chunk_duration],
+                outputs=[realtime_output, status_label]
+            )
+            save_btn.click(
+                save_transcript,
+                inputs=[realtime_output],
+                outputs=[download_file]
+            ).then(
+                lambda: gr.update(visible=True),
+                outputs=[download_file]
+            )
+            clear_file_btn.click(
+                lambda: ("", ""),
+                outputs=[realtime_output, status_label]
+            )
+
+# Run the app
 if __name__ == "__main__":
+    demo.queue().launch(
+        share=True,
+        show_error=True
+    )
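+
+# Dependency note (assumed, not pinned by this commit): SpeechRecognition's
+# recognize_google calls Google's free Web Speech API, so the Space needs
+# outbound internet access; pydub requires ffmpeg on the host.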