transcribe_audio

Paused

App Files Files Community

cstr committed on Oct 2, 2024

Commit

a8b126b

verified ·

1 Parent(s): acd8816

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -18

app.py CHANGED Viewed

@@ -12,8 +12,20 @@ import yt_dlp
 logging.basicConfig(level=logging.INFO)
 sys.path.append("./faster-whisper")
-from faster_whisper import WhisperModel, BatchedInferencePipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -143,10 +155,49 @@ def save_transcription(transcription):
         f.write(transcription)
     return file_path
-def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
-        model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
-        batched_model = BatchedInferencePipeline(model=model)
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
@@ -160,19 +211,21 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
             trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
             audio_path = trimmed_audio_path
-        start_time_perf = time.time()
-        segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
-        end_time_perf = time.time()
         transcription_time = end_time_perf - start_time_perf
-        real_time_factor = info.duration / transcription_time
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
         metrics_output = (
-            f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
-            f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
             f"Transcription time: {transcription_time:.2f} seconds\n"
-            f"Real-time factor: {real_time_factor:.2f}x\n"
             f"Audio file size: {audio_file_size:.2f} MB\n"
         )
@@ -182,7 +235,10 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
         transcription = ""
         for segment in segments:
-            transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
             transcription += transcription_segment
             if verbose:
@@ -205,14 +261,15 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
                 os.remove(trimmed_audio_path)
             except:
                 pass
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
-        gr.Number(label="Start Time (seconds)", value=0),
         gr.Number(label="End Time (seconds)", value=0),
         gr.Checkbox(label="Verbose Output", value=False)
     ],
@@ -222,11 +279,11 @@ iface = gr.Interface(
         gr.File(label="Download Transcription")
     ],
     title="Multi-Model Transcription",
-    description="Transcribe audio using with Whisper.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
-        ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True

 logging.basicConfig(level=logging.INFO)
+# Clone and install faster-whisper from GitHub
+# (In a Hugging Face Space, this should be doable in build.sh instead.)
+try:
+    subprocess.run(["git", "clone", "https://github.com/SYSTRAN/faster-whisper.git"], check=True)
+    subprocess.run(["pip", "install", "-e", "./faster-whisper"], check=True)
+except subprocess.CalledProcessError as e:
+    print(f"Error during faster-whisper installation: {e}")
+    sys.exit(1)
+# Add the faster-whisper directory to the Python path
 sys.path.append("./faster-whisper")
+from faster_whisper import WhisperModel
+from faster_whisper.transcribe import BatchedInferencePipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
         f.write(transcription)
     return file_path
+def transcribe_audio(input_source, model_choice, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
+        if model_choice == "faster-whisper":
+            model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
+            batched_model = BatchedInferencePipeline(model=model)
+        elif model_choice == "primeline/whisper-large-v3-german":
+            model_id = "primeline/whisper-large-v3-german"
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            model.to(device)
+            processor = AutoProcessor.from_pretrained(model_id)
+            pipe = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor,
+                max_new_tokens=128,
+                chunk_length_s=30,
+                batch_size=batch_size,
+                return_timestamps=True,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+        elif model_choice == "openai/whisper-large-v3":
+            model_id = "openai/whisper-large-v3"
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            model.to(device)
+            processor = AutoProcessor.from_pretrained(model_id)
+            pipe = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+        else:
+            raise ValueError("Invalid model choice")
+        # Rest of the code remains the same
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
             trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
             audio_path = trimmed_audio_path
+        if model_choice == "faster-whisper":
+            start_time_perf = time.time()
+            segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
+            end_time_perf = time.time()
+        else:
+            start_time_perf = time.time()
+            result = pipe(audio_path)
+            segments = result["chunks"]
+            end_time_perf = time.time()
         transcription_time = end_time_perf - start_time_perf
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
         metrics_output = (
             f"Transcription time: {transcription_time:.2f} seconds\n"
             f"Audio file size: {audio_file_size:.2f} MB\n"
         )
         transcription = ""
         for segment in segments:
+            if model_choice == "faster-whisper":
+                transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
+            else:
+                transcription_segment = f"[{segment['timestamp'][0]:.2f}s -> {segment['timestamp'][1]:.2f}s] {segment['text']}\n"
             transcription += transcription_segment
             if verbose:
                 os.remove(trimmed_audio_path)
             except:
                 pass
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
+        gr.Dropdown(choices=["faster-whisper", "primeline/whisper-large-v3-german", "openai/whisper-large-v3"], label="Model Choice", value="faster-whisper"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
+        gr.Number(label="Start Time (seconds)", value=0),
         gr.Number(label="End Time (seconds)", value=0),
         gr.Checkbox(label="Verbose Output", value=False)
     ],
         gr.File(label="Download Transcription")
     ],
     title="Multi-Model Transcription",
+    description="Transcribe audio using multiple models.",
     examples=[
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "primeline/whisper-large-v3-german", 16, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True