transcribe_audio

Paused

App Files Files Community

cstr committed on Oct 2, 2024

Commit

8cc0029

verified ·

1 Parent(s): 00124b5

fix

Browse files

Files changed (1) hide show

app.py +9 -12

app.py CHANGED Viewed

@@ -528,12 +528,12 @@ def get_model_options(pipeline_type):
 # Dictionary to store loaded models
 loaded_models = {}
-def transcribe_audio(audio_upload, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False, include_timecodes=False):
     """
     Transcribes audio from a given source using the specified pipeline and model.
     Args:
-        audio_upload (file): Uploaded audio file.
         audio_url (str): URL of audio.
         proxy_url (str): Proxy URL if needed.
         proxy_username (str): Proxy username.
@@ -567,14 +567,10 @@ def transcribe_audio(audio_upload, audio_url, proxy_url, proxy_username, proxy_p
         audio_path = None
         is_temp_file = False
-        if audio_upload is not None:
-            if isinstance(audio_upload, dict) and 'name' in audio_upload:
-                # audio_upload is a dict with file info
-                audio_path = audio_upload['name']
-                is_temp_file = False
-            elif isinstance(audio_upload, str) and os.path.exists(audio_upload):
-                audio_path = audio_upload
-                is_temp_file = False
         elif audio_url is not None and len(audio_url.strip()) > 0:
             # audio_url is provided
             audio_path, is_temp_file = download_audio(audio_url, download_method, proxy_url, proxy_username, proxy_password)
@@ -584,7 +580,7 @@ def transcribe_audio(audio_upload, audio_url, proxy_url, proxy_username, proxy_p
                 yield verbose_messages + error_msg, "", None
                 return
         else:
-            error_msg = "No audio source provided. Please upload an audio file or enter a URL."
             logging.error(error_msg)
             yield verbose_messages + error_msg, "", None
             return
@@ -696,6 +692,7 @@ def transcribe_audio(audio_upload, audio_url, proxy_url, proxy_username, proxy_p
         if audio_path and is_temp_file and os.path.exists(audio_path):
             os.remove(audio_path)
 with gr.Blocks() as iface:
     gr.Markdown("# Audio Transcription")
     gr.Markdown("Transcribe audio using multiple pipelines and (Faster) Whisper models.")
@@ -785,7 +782,7 @@ with gr.Blocks() as iface:
             [None, "https://www.youtube.com/watch?v=daQ_hqA6HDo", "", "", "", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", None, None, False, False],
             [None, "https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453.mp3", "", "", "", "faster-sequenced", "SYSTRAN/faster-whisper-large-v1", "float16", 1, "ffmpeg", 0, 300, False, False],
         ],
-        inputs=[audio_upload, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, dtype, batch_size, download_method, start_time, end_time, verbose, include_timecodes],
     )
 iface.launch(share=False, debug=True)

 # Dictionary to store loaded models
 loaded_models = {}
+def transcribe_audio(audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False, include_timecodes=False):
     """
     Transcribes audio from a given source using the specified pipeline and model.
     Args:
+        audio_input (str): Path to uploaded audio file or recorded audio.
         audio_url (str): URL of audio.
         proxy_url (str): Proxy URL if needed.
         proxy_username (str): Proxy username.
         audio_path = None
         is_temp_file = False
+        if audio_input is not None and len(audio_input) > 0:
+            # audio_input is a filepath to uploaded or recorded audio
+            audio_path = audio_input
+            is_temp_file = False
         elif audio_url is not None and len(audio_url.strip()) > 0:
             # audio_url is provided
             audio_path, is_temp_file = download_audio(audio_url, download_method, proxy_url, proxy_username, proxy_password)
                 yield verbose_messages + error_msg, "", None
                 return
         else:
+            error_msg = "No audio source provided. Please upload an audio file, record audio, or enter a URL."
             logging.error(error_msg)
             yield verbose_messages + error_msg, "", None
             return
         if audio_path and is_temp_file and os.path.exists(audio_path):
             os.remove(audio_path)
 with gr.Blocks() as iface:
     gr.Markdown("# Audio Transcription")
     gr.Markdown("Transcribe audio using multiple pipelines and (Faster) Whisper models.")
             [None, "https://www.youtube.com/watch?v=daQ_hqA6HDo", "", "", "", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", None, None, False, False],
             [None, "https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453.mp3", "", "", "", "faster-sequenced", "SYSTRAN/faster-whisper-large-v1", "float16", 1, "ffmpeg", 0, 300, False, False],
         ],
+        inputs=[audio_input, audio_url, proxy_url, proxy_username, proxy_password, pipeline_type, model_id, dtype, batch_size, download_method, start_time, end_time, verbose, include_timecodes],
     )
 iface.launch(share=False, debug=True)