transcribe_audio

Paused

App Files Files Community

cstr committed on Oct 2, 2024

Commit

d680d0f

verified ·

1 Parent(s): 87f3409

different models

Browse files

Files changed (1) hide show

app.py +39 -33

app.py CHANGED Viewed

@@ -157,49 +157,48 @@ def save_transcription(transcription):
         f.write(transcription)
     return file_path
-def transcribe_audio(input_source, model_choice, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
-        if model_choice == "faster-whisper":
-            model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
-            batched_model = BatchedInferencePipeline(model=model)
-        elif model_choice == "primeline/whisper-large-v3-german":
-            model_id = "primeline/whisper-large-v3-german"
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
             )
             model.to(device)
             processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
                 "automatic-speech-recognition",
                 model=model,
                 tokenizer=processor.tokenizer,
                 feature_extractor=processor.feature_extractor,
-                max_new_tokens=128,
                 chunk_length_s=30,
                 batch_size=batch_size,
                 return_timestamps=True,
                 torch_dtype=torch_dtype,
                 device=device,
             )
-        elif model_choice == "openai/whisper-large-v3":
-            model_id = "openai/whisper-large-v3"
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            model.to(device)
-            processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
-                "automatic-speech-recognition",
-                model=model,
-                tokenizer=processor.tokenizer,
-                feature_extractor=processor.feature_extractor,
-                torch_dtype=torch_dtype,
-                device=device,
-            )
         else:
-            raise ValueError("Invalid model choice")
-        # Rest of the code remains the same
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
@@ -268,7 +267,9 @@ iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
-        gr.Dropdown(choices=["faster-whisper", "primeline/whisper-large-v3-german", "openai/whisper-large-v3"], label="Model Choice", value="faster-whisper"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
         gr.Number(label="Start Time (seconds)", value=0),
@@ -280,15 +281,20 @@ iface = gr.Interface(
         gr.Textbox(label="Transcription", lines=10),
         gr.File(label="Download Transcription")
     ],
-    title="Multi-Model Transcription",
-    description="Transcribe audio using multiple models.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "primeline/whisper-large-v3-german", 16, "ffmpeg", 0, 300, True],
-        ["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True
 )
-iface.launch()

         f.write(transcription)
     return file_path
+def get_model_options(pipeline_type):
+    """Return the model IDs selectable for the given pipeline type.
+
+    Both faster-whisper pipeline types ("faster-batched" and
+    "faster-sequenced") offer the same four models, "transformers" offers
+    three, and any other value yields an empty list. A new list is built
+    on every call.
+    """
+    # Membership in a tuple compares with ==, same as the if/elif chain it replaces.
+    if pipeline_type in ("faster-batched", "faster-sequenced"):
+        return [
+            "cstr/whisper-large-v3-turbo-int8_float32",
+            "deepdml/faster-whisper-large-v3-turbo-ct2",
+            "Systran/faster-whisper-large-v3",
+            "GalaktischeGurke/primeline-whisper-large-v3-german-ct2",
+        ]
+    if pipeline_type == "transformers":
+        return [
+            "openai/whisper-large-v3",
+            "openai/whisper-large-v3-turbo",
+            "primeline/whisper-large-v3-german",
+        ]
+    return []
+def update_model_dropdown(pipeline_type):
+    """Build the Gradio update that refreshes the model dropdown.
+
+    Args:
+        pipeline_type: Value selected in the pipeline-type dropdown.
+
+    Returns:
+        A ``gr.update`` payload whose choices are the models returned by
+        ``get_model_options(pipeline_type)`` and whose value is the first
+        of them, or ``None`` when that list is empty.
+    """
+    # Look the options up once instead of once per keyword argument.
+    options = get_model_options(pipeline_type)
+    # An unrecognised pipeline type yields an empty list; indexing [0] would raise IndexError.
+    selected = options[0] if options else None
+    # gr.update is the component-agnostic form; gr.Dropdown.update is gone in Gradio 4.
+    return gr.update(choices=options, value=selected)
+def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
+        if pipeline_type == "faster-batched":
+            model = WhisperModel(model_id, device="auto", compute_type=dtype)
+            pipeline = BatchedInferencePipeline(model=model)
+        elif pipeline_type == "faster-sequenced":
+            model = WhisperModel(model_id)
+            pipeline = model.transcribe
+        elif pipeline_type == "transformers":
+            torch_dtype = torch.float16 if dtype == "float16" else torch.float32
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
             )
             model.to(device)
             processor = AutoProcessor.from_pretrained(model_id)
+            pipeline = pipeline(
                 "automatic-speech-recognition",
                 model=model,
                 tokenizer=processor.tokenizer,
                 feature_extractor=processor.feature_extractor,
                 chunk_length_s=30,
                 batch_size=batch_size,
                 return_timestamps=True,
                 torch_dtype=torch_dtype,
                 device=device,
             )
         else:
+            raise ValueError("Invalid pipeline type")
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
+        gr.Dropdown(choices=["faster-batched", "faster-sequenced", "transformers"], label="Pipeline Type", value="faster-batched"),
+        gr.Dropdown(label="Model", choices=get_model_options("faster-batched"), value=get_model_options("faster-batched")[0]),
+        gr.Dropdown(choices=["int8", "float16", "float32"], label="Data Type", value="int8"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
         gr.Number(label="Start Time (seconds)", value=0),
         gr.Textbox(label="Transcription", lines=10),
         gr.File(label="Download Transcription")
     ],
+    title="Multi-Pipeline Transcription",
+    description="Transcribe audio using multiple pipelines and models.",
     examples=[
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True
 )
+# Wire the pipeline-type dropdown to the model dropdown BEFORE launching:
+# a listener registered after launch() is not part of the served app.
+# NOTE(review): gr.Interface exposes its instantiated inputs as
+# `input_components`; confirm against the Gradio version pinned for this Space.
+pipeline_type_dropdown = iface.input_components[1]
+model_dropdown = iface.input_components[2]
+# Event listeners have to be attached inside the Blocks context that owns the components.
+with iface:
+    pipeline_type_dropdown.change(update_model_dropdown, inputs=[pipeline_type_dropdown], outputs=[model_dropdown])
+iface.launch()