whisper-vs-distil-whisper-zero

Running on Zero

App Files Community

sanchit-gandhi committed on Nov 23, 2023

Commit

bf2a8ff

1 Parent(s): f8dd558

revert short-form changes

Browse files

Files changed (4) hide show

app.py +21 -55
assets/example_1.wav +2 -2
assets/example_2.wav +2 -2
assets/example_3.wav +0 -3

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, TextIteratorStreamer
 from transformers.utils import is_flash_attn_2_available
 from transformers.pipelines.audio_utils import ffmpeg_read
-from threading import Thread
 import torch
 import gradio as gr
 import time
@@ -26,7 +25,6 @@ if not use_flash_attention_2:
     distilled_model = distilled_model.to_bettertransformer()
 processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
-streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
 model.to(device)
 distilled_model.to(device)
@@ -58,6 +56,7 @@ distil_pipe = pipeline(
 )
 distil_pipe_forward = distil_pipe._forward
 def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
@@ -74,65 +73,32 @@ def transcribe(inputs):
             f"Got an audio of length {round(audio_length_mins, 3)} minutes."
         )
-    if audio_length_mins >= 0.5:
-        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-        def _forward_distil_time(*args, **kwargs):
-            global distil_runtime_pipeline
-            start_time = time.time()
-            result = distil_pipe_forward(*args, **kwargs)
-            distil_runtime_pipeline = time.time() - start_time
-            distil_runtime_pipeline = round(distil_runtime_pipeline, 2)
-            return result
-        distil_pipe._forward = _forward_distil_time
-        distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
-        yield distil_text, distil_runtime_pipeline, None, None
-        def _forward_time(*args, **kwargs):
-            global runtime_pipeline
-            start_time = time.time()
-            result = pipe_forward(*args, **kwargs)
-            runtime_pipeline = time.time() - start_time
-            runtime_pipeline = round(runtime_pipeline, 2)
-            return result
-        pipe._forward = _forward_time
-        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
-        yield distil_text, distil_runtime_pipeline, text, runtime_pipeline
-    else:
-        input_features = processor(inputs, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt").input_features
-        input_features = input_features.to(device, dtype=torch_dtype)
-        # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
-        generation_kwargs = dict(input_features=input_features, streamer=streamer, max_new_tokens=128, language="en", task="transcribe")
-        thread = Thread(target=distilled_model.generate, kwargs=generation_kwargs)
-        thread.start()
         start_time = time.time()
-        distil_text = ""
-        for generated_text in streamer:
-            distil_text += generated_text
-            yield distil_text, None, None, None
         distil_runtime = time.time() - start_time
         distil_runtime = round(distil_runtime, 2)
-        yield distil_text, distil_runtime, None, None
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
         start_time = time.time()
-        text = ""
-        for generated_text in streamer:
-            text += generated_text
-            yield distil_text, distil_runtime, text, None
         runtime = time.time() - start_time
         runtime = round(runtime, 2)
-        yield distil_text, distil_runtime, text, runtime
 if __name__ == "__main__":
@@ -158,7 +124,7 @@ if __name__ == "__main__":
             of the <a href="https://huggingface.co/openai/whisper-large-v2"> Whisper</a> model by OpenAI. Compared to Whisper,
             Distil-Whisper runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on
             out-of-distribution evaluation data.</p>
             <p>In this demo, we perform a speed comparison between Whisper and Distil-Whisper in order to test this claim.
             Both models use the <a href="https://huggingface.co/distil-whisper/distil-large-v2#long-form-transcription"> chunked long-form transcription algorithm</a>
             in 🤗 Transformers, as well as Flash Attention. To use Distil-Whisper yourself, check the code examples on the
@@ -181,7 +147,7 @@ if __name__ == "__main__":
         )
         gr.Markdown("## Examples")
         gr.Examples(
-            [["./assets/example_1.wav"], ["./assets/example_2.wav"], ["./assets/example_3.wav"]],
             audio,
             outputs=[distil_transcription, distil_runtime, transcription, runtime],
             fn=transcribe,

+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from transformers.utils import is_flash_attn_2_available
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 import gradio as gr
 import time
     distilled_model = distilled_model.to_bettertransformer()
 processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
 model.to(device)
 distilled_model.to(device)
 )
 distil_pipe_forward = distil_pipe._forward
 def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")
             f"Got an audio of length {round(audio_length_mins, 3)} minutes."
         )
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    def _forward_distil_time(*args, **kwargs):
+        global distil_runtime
         start_time = time.time()
+        result = distil_pipe_forward(*args, **kwargs)
         distil_runtime = time.time() - start_time
         distil_runtime = round(distil_runtime, 2)
+        return result
+    distil_pipe._forward = _forward_distil_time
+    distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, None, None, None
+    def _forward_time(*args, **kwargs):
+        global runtime
         start_time = time.time()
+        result = pipe_forward(*args, **kwargs)
         runtime = time.time() - start_time
         runtime = round(runtime, 2)
+        return result
+    pipe._forward = _forward_time
+    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+    yield distil_text, distil_runtime, text, runtime
 if __name__ == "__main__":
             of the <a href="https://huggingface.co/openai/whisper-large-v2"> Whisper</a> model by OpenAI. Compared to Whisper,
             Distil-Whisper runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on
             out-of-distribution evaluation data.</p>
             <p>In this demo, we perform a speed comparison between Whisper and Distil-Whisper in order to test this claim.
             Both models use the <a href="https://huggingface.co/distil-whisper/distil-large-v2#long-form-transcription"> chunked long-form transcription algorithm</a>
             in 🤗 Transformers, as well as Flash Attention. To use Distil-Whisper yourself, check the code examples on the
         )
         gr.Markdown("## Examples")
         gr.Examples(
+            [["./assets/example_1.wav"], ["./assets/example_2.wav"]],
             audio,
             outputs=[distil_transcription, distil_runtime, transcription, runtime],
             fn=transcribe,

assets/example_1.wav CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d96fece5c0c24d039801e9e39e9985982ad63becdab6c1a141992aa6dd37a615
-size 802110

 version https://git-lfs.github.com/spec/v1
+oid sha256:1e938b9f81dea096ec7d3752e90afca8d370f7a461d3a08e1a559f4440ed055d
+size 1963810

assets/example_2.wav CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1e938b9f81dea096ec7d3752e90afca8d370f7a461d3a08e1a559f4440ed055d
-size 1963810

 version https://git-lfs.github.com/spec/v1
+oid sha256:81fc0857f7fe11416ede431db713a02fdb787bbc049802fe74c791f3b44e5bf4
+size 1920044

assets/example_3.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:81fc0857f7fe11416ede431db713a02fdb787bbc049802fe74c791f3b44e5bf4
-size 1920044