Update app.py
app.py CHANGED
@@ -42,11 +42,18 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def download_audio(url, method_choice):
     parsed_url = urlparse(url)
     logging.info(f"Downloading audio from URL: {url} using method: {method_choice}")
-
-
-
-
-
+    try:
+        if parsed_url.netloc in ['www.youtube.com', 'youtu.be', 'youtube.com']:
+            audio_file = download_youtube_audio(url, method_choice)
+        else:
+            audio_file = download_direct_audio(url, method_choice)
+        if not audio_file or not os.path.exists(audio_file):
+            raise Exception(f"Failed to download audio from {url}")
+        return audio_file
+    except Exception as e:
+        logging.error(f"Error downloading audio: {str(e)}")
+        return f"Error: {str(e)}"
+
 def download_youtube_audio(url, method_choice):
     methods = {
         'yt-dlp': youtube_dl_method,
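For context on the hunk above: the `netloc` check matches only the three listed hosts, so anything else (including a mobile link) falls through to `download_direct_audio`. A minimal sketch of the same dispatch idea; `m.youtube.com` here is an assumed extra host, not part of the commit:

```python
from urllib.parse import urlparse

# m.youtube.com is an assumed addition, not in the commit's list
YOUTUBE_HOSTS = {'www.youtube.com', 'youtube.com', 'youtu.be', 'm.youtube.com'}

def is_youtube_url(url):
    # netloc is the host portion of the parsed URL, e.g. 'youtu.be'
    return urlparse(url).netloc in YOUTUBE_HOSTS

print(is_youtube_url("https://youtu.be/abc123"))        # True
print(is_youtube_url("https://example.com/audio.mp3"))  # False
```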
@@ -66,19 +73,24 @@ def download_youtube_audio(url, method_choice):
 
 def youtube_dl_method(url):
     logging.info("Using yt-dlp method")
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        ydl_opts = {
+            'format': 'bestaudio/best',
+            'postprocessors': [{
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'mp3',
+                'preferredquality': '192',
+            }],
+            'outtmpl': '%(id)s.%(ext)s',
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=True)
+            output_file = f"{info['id']}.mp3"
+            logging.info(f"Downloaded YouTube audio: {output_file}")
+            return output_file
+    except Exception as e:
+        logging.error(f"Error in youtube_dl_method: {str(e)}")
+        return None
 
 def pytube_method(url):
     logging.info("Using pytube method")
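One caveat with the hunk above: it assumes the postprocessed file is named `<id>.mp3`, which holds for this `outtmpl` but silently breaks if the template ever changes. A hedged alternative sketch that derives the name from yt-dlp's own `prepare_filename` (a real yt-dlp method, though note it reflects the pre-postprocessing extension, hence the `splitext` swap):

```python
import os
import yt_dlp

def youtube_dl_method_alt(url):
    # Same options as the commit; only the output-name handling differs.
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': '%(id)s.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # prepare_filename returns the pre-postprocessing name (e.g. <id>.webm),
        # so replace its extension with the postprocessor's target codec.
        base, _ = os.path.splitext(ydl.prepare_filename(info))
    return base + '.mp3'
```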
@@ -183,11 +195,11 @@ def trim_audio(audio_path, start_time, end_time):
 
     # Validate times
     if start_time < 0 or end_time < 0:
-        raise
+        raise gr.Error("Start time and end time must be non-negative.")
     if start_time >= end_time:
-        raise gr.Error("End time must be greater than start time.")
+        raise gr.Error("End time must be greater than start time.")
     if start_time > audio_duration:
-        raise
+        raise gr.Error("Start time exceeds audio duration.")
 
     trimmed_audio = audio[start_time * 1000:end_time * 1000]
     trimmed_audio_path = tempfile.mktemp(suffix='.wav')
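The `* 1000` in the slice exists because pydub's `AudioSegment` is indexed in milliseconds while the UI times are in seconds. A minimal worked example (assumes pydub plus ffmpeg are installed; `input.wav` is a placeholder path):

```python
from pydub import AudioSegment

audio = AudioSegment.from_file("input.wav")       # placeholder path
start_time, end_time = 5, 10                      # seconds, as entered in the UI
clip = audio[start_time * 1000:end_time * 1000]   # pydub slices in milliseconds
clip.export("clip.wav", format="wav")             # writes the 5 s excerpt
```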
@@ -212,67 +224,40 @@ def get_model_options(pipeline_type):
     else:
         return []
 
+loaded_models = {}
+
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
+        if verbose:
+            logging.getLogger().setLevel(logging.INFO)
+        else:
+            logging.getLogger().setLevel(logging.WARNING)
+
+        logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
+        verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
+
+        if verbose:
+            yield verbose_messages, "", None
+
         # Determine if input_source is a URL or file
         if isinstance(input_source, str):
             if input_source.startswith('http://') or input_source.startswith('https://'):
                 audio_path = download_audio(input_source, download_method)
-                # Handle potential errors during download
                 if not audio_path or audio_path.startswith("Error"):
                     yield f"Error: {audio_path}", "", None
                     return
-
-
+            else:
+                # Assume it's a local file path
+                audio_path = input_source
+        elif input_source is not None:
+            # Uploaded file object
             audio_path = input_source.name
             logging.info(f"Using uploaded audio file: {audio_path}")
-
-        try:
-            logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
-            verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
-
-            if verbose:
-                yield verbose_messages, "", None
-
-        if pipeline_type == "faster-batched":
-            model = WhisperModel(model_id, device="auto", compute_type=dtype)
-            pipeline = BatchedInferencePipeline(model=model)
-        elif pipeline_type == "faster-sequenced":
-            model = WhisperModel(model_id)
-            pipeline = model.transcribe
-        elif pipeline_type == "transformers":
-            torch_dtype = torch.float16 if dtype == "float16" else torch.float32
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            model.to(device)
-            processor = AutoProcessor.from_pretrained(model_id)
-            pipeline = pipeline(
-                "automatic-speech-recognition",
-                model=model,
-                tokenizer=processor.tokenizer,
-                feature_extractor=processor.feature_extractor,
-                chunk_length_s=30,
-                batch_size=batch_size,
-                return_timestamps=True,
-                torch_dtype=torch_dtype,
-                device=device,
-            )
         else:
-
-
-        if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
-            audio_path = download_audio(input_source, download_method)
-            verbose_messages += f"Audio file downloaded: {audio_path}\n"
-            if verbose:
-                yield verbose_messages, "", None
-
-            if not audio_path or audio_path.startswith("Error"):
-                yield f"Error: {audio_path}", "", None
-                return
-        else:
-            audio_path = input_source
+            yield "No audio source provided.", "", None
+            return
 
+        # Convert start_time and end_time to float or None
         start_time = float(start_time) if start_time else None
         end_time = float(end_time) if end_time else None
 
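The `verbose` flag in the hunk above works by raising or lowering the root logger's level, so every `logging.info` call in the module is shown or muted globally. A small sketch of the effect:

```python
import logging

logging.basicConfig(format="%(levelname)s: %(message)s")

logging.getLogger().setLevel(logging.WARNING)
logging.info("suppressed")   # INFO < WARNING, so nothing is printed

logging.getLogger().setLevel(logging.INFO)
logging.info("visible")      # prints "INFO: visible"
```

Since this mutates process-wide state, concurrent requests with different `verbose` settings would fight over the level; it is a pragmatic choice for a single-user Space rather than a general pattern.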
@@ -283,11 +268,48 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
         if verbose:
             yield verbose_messages, "", None
 
+        # Model caching
+        model_key = (pipeline_type, model_id, dtype)
+        if model_key in loaded_models:
+            model_or_pipeline = loaded_models[model_key]
+            logging.info("Loaded model from cache")
+        else:
+            # Build once and bind to model_or_pipeline so the cached object is
+            # defined on every branch (and the imported `pipeline` helper is not shadowed).
+            if pipeline_type == "faster-batched":
+                model = WhisperModel(model_id, device=device, compute_type=dtype)
+                model_or_pipeline = BatchedInferencePipeline(model=model)
+            elif pipeline_type == "faster-sequenced":
+                model_or_pipeline = WhisperModel(model_id, device=device, compute_type=dtype)
+            elif pipeline_type == "transformers":
+                torch_dtype = torch.float16 if dtype == "float16" else torch.float32
+                model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+                )
+                model.to(device)
+                processor = AutoProcessor.from_pretrained(model_id)
+                model_or_pipeline = pipeline(
+                    "automatic-speech-recognition",
+                    model=model,
+                    tokenizer=processor.tokenizer,
+                    feature_extractor=processor.feature_extractor,
+                    chunk_length_s=30,
+                    batch_size=batch_size,
+                    return_timestamps=True,
+                    torch_dtype=torch_dtype,
+                    device=device,
+                )
+            else:
+                raise ValueError("Invalid pipeline type")
+            loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
+
         start_time_perf = time.time()
-        if pipeline_type
-        segments, info =
+        if pipeline_type == "faster-batched":
+            segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
+        elif pipeline_type == "faster-sequenced":
+            segments, info = model_or_pipeline.transcribe(audio_path)
         else:
-            result =
+            result = model_or_pipeline(audio_path)
             segments = result["chunks"]
         end_time_perf = time.time()
 
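The cache introduced above is a plain dict keyed by the `(pipeline_type, model_id, dtype)` tuple, so a repeat request with identical settings skips model construction entirely. A self-contained sketch of the same memoization pattern with a generic loader (names here are illustrative, not from the commit):

```python
loaded_models = {}  # module-level cache, lives for the process lifetime

def get_or_load(pipeline_type, model_id, dtype, loader):
    key = (pipeline_type, model_id, dtype)   # every part must be hashable
    if key not in loaded_models:
        loaded_models[key] = loader()        # only the first call pays the load cost
    return loaded_models[key]

# Usage: the lambda is evaluated only on a cache miss.
model = get_or_load("faster-batched", "large-v3", "int8",
                    loader=lambda: object())  # stand-in for WhisperModel(...)
```

`functools.lru_cache` could express the same idea, but an explicit dict keeps inspection and manual eviction trivial.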
@@ -305,11 +326,10 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
         transcription = ""
 
         for segment in segments:
-
-            f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
-
-            f"[{segment['timestamp'][0]:.2f}s -> {segment['timestamp'][1]:.2f}s] {segment['text']}\n"
-        )
+            if pipeline_type in ["faster-batched", "faster-sequenced"]:
+                transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
+            else:
+                transcription_segment = f"[{segment['timestamp'][0]:.2f}s -> {segment['timestamp'][1]:.2f}s] {segment['text']}\n"
             transcription += transcription_segment
             if verbose:
                 yield verbose_messages + metrics_output, transcription, None
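The branch above exists because the two backends yield different segment shapes: faster-whisper returns objects with `.start`/`.end`/`.text` attributes, while the transformers pipeline returns dicts with a `(start, end)` tuple under `'timestamp'`. A tiny sketch formatting both shapes (sample data is made up):

```python
from types import SimpleNamespace

def format_segment(segment, from_faster_whisper):
    if from_faster_whisper:
        return f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}"
    return f"[{segment['timestamp'][0]:.2f}s -> {segment['timestamp'][1]:.2f}s] {segment['text']}"

fw = SimpleNamespace(start=0.0, end=2.5, text="hello")  # faster-whisper style
tf = {"timestamp": (0.0, 2.5), "text": "hello"}         # transformers chunk style
print(format_segment(fw, True))    # [0.00s -> 2.50s] hello
print(format_segment(tf, False))   # [0.00s -> 2.50s] hello
```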
@@ -322,23 +342,21 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
             yield f"An error occurred: {str(e)}", "", None
 
     finally:
-        #
+        # Clean up temporary files
         if audio_path and os.path.exists(audio_path):
             os.remove(audio_path)
-        # Remove trimmed audio file
         if 'trimmed_audio_path' in locals() and os.path.exists(trimmed_audio_path):
             os.remove(trimmed_audio_path)
-
-        if transcription_file and os.path.exists(transcription_file):
+        if 'transcription_file' in locals() and os.path.exists(transcription_file):
             os.remove(transcription_file)
-
 
 with gr.Blocks() as iface:
     gr.Markdown("# Multi-Pipeline Transcription")
     gr.Markdown("Transcribe audio using multiple pipelines and models.")
 
     with gr.Row():
-        input_source = gr.File(label="Audio Source (Upload a file or enter a URL/YouTube URL)")
+        #input_source = gr.File(label="Audio Source (Upload a file or enter a URL/YouTube URL)")
+        input_source = gr.Textbox(label="Audio Source (Upload a file or enter a URL/YouTube URL)")
         pipeline_type = gr.Dropdown(
             choices=["faster-batched", "faster-sequenced", "transformers"],
             label="Pipeline Type",
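The `'trimmed_audio_path' in locals()` guard in the `finally` block matters because that name is only bound if trimming ran before the exception; an unguarded reference would raise `NameError`. A minimal sketch of the pattern (file names are placeholders):

```python
import os
import tempfile

def process(trim=False):
    try:
        if trim:
            trimmed_path = tempfile.mktemp(suffix=".wav")  # only bound on this branch
            open(trimmed_path, "w").close()
        raise RuntimeError("simulated failure")
    finally:
        # 'trimmed_path' may never have been assigned on this code path,
        # so check locals() before touching it.
        if 'trimmed_path' in locals() and os.path.exists(trimmed_path):
            os.remove(trimmed_path)

# process(trim=True) still raises, but its temp file is cleaned up first.
```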
@@ -375,7 +393,6 @@ with gr.Blocks() as iface:
         try:
             model_choices = get_model_options(pipeline_type)
             logging.info(f"Model choices for {pipeline_type}: {model_choices}")
-
             if model_choices:
                 return gr.update(choices=model_choices, value=model_choices[0], visible=True)
             else:
@@ -383,9 +400,9 @@ with gr.Blocks() as iface:
         except Exception as e:
             logging.error(f"Error in update_model_dropdown: {str(e)}")
             return gr.update(choices=["Error"], value="Error", visible=True)
-
-    #
-    pipeline_type.change(update_model_dropdown, inputs=[pipeline_type], outputs=model_id)
+
+    # event handler for pipeline_type change
+    pipeline_type.change(update_model_dropdown, inputs=[pipeline_type], outputs=[model_id])
 
     def transcribe_with_progress(*args):
         for result in transcribe_audio(*args):
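The `.change` wiring above is standard Gradio Blocks: when the dropdown value changes, the callback runs and its `gr.update(...)` return value is applied to the output component. A self-contained sketch of the same pattern (the demo choices are made up):

```python
import gradio as gr

MODELS = {"a": ["model-a1", "model-a2"], "b": ["model-b1"]}

with gr.Blocks() as demo:
    family = gr.Dropdown(choices=list(MODELS), label="Family", value="a")
    model = gr.Dropdown(choices=MODELS["a"], label="Model")

    def on_family_change(family_choice):
        choices = MODELS[family_choice]
        # gr.update patches the target component in place
        return gr.update(choices=choices, value=choices[0])

    family.change(on_family_change, inputs=[family], outputs=[model])

# demo.launch()  # uncomment to serve
```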
@@ -399,9 +416,9 @@ with gr.Blocks() as iface:
 
     gr.Examples(
         examples=[
-
-
-
+            ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", None, None, True],
+            ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
+            ["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, True]
         ],
         inputs=[input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time, end_time, verbose],
     )