whisper-vs-distil-whisper-zero

Running on Zero

App Files Community

sanchit-gandhi committed on Nov 23, 2023

Commit

7bd1e74

1 Parent(s): 4487a27

short-form

Browse files

Files changed (1) hide show

app.py +52 -17

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from transformers.utils import is_flash_attn_2_available
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 import gradio as gr
 import time
@@ -25,6 +26,7 @@ if not use_flash_attention_2:
     distilled_model = distilled_model.to_bettertransformer()
 processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
 model.to(device)
 distilled_model.to(device)
@@ -72,32 +74,65 @@ def transcribe(inputs):
             f"Got an audio of length {round(audio_length_mins, 3)} minutes."
         )
-    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    def _forward_distil_time(*args, **kwargs):
-        global distil_runtime
         start_time = time.time()
-        result = distil_pipe_forward(*args, **kwargs)
         distil_runtime = time.time() - start_time
         distil_runtime = round(distil_runtime, 2)
-        return result
-    distil_pipe._forward = _forward_distil_time
-    distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
-    yield distil_text, distil_runtime, None, None, None
-    def _forward_time(*args, **kwargs):
-        global runtime
         start_time = time.time()
-        result = pipe_forward(*args, **kwargs)
         runtime = time.time() - start_time
         runtime = round(runtime, 2)
-        return result
-    pipe._forward = _forward_time
-    text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
-    yield distil_text, distil_runtime, text, runtime
 if __name__ == "__main__":
     with gr.Blocks() as demo:

+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, TextIteratorStreamer
 from transformers.utils import is_flash_attn_2_available
 from transformers.pipelines.audio_utils import ffmpeg_read
+from threading import Thread
 import torch
 import gradio as gr
 import time
     distilled_model = distilled_model.to_bettertransformer()
 processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
+streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True)
 model.to(device)
 distilled_model.to(device)
             f"Got an audio of length {round(audio_length_mins, 3)} minutes."
         )
+    if audio_length_mins >= 0.5:
+        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+        def _forward_distil_time(*args, **kwargs):
+            global distil_runtime
+            start_time = time.time()
+            result = distil_pipe_forward(*args, **kwargs)
+            distil_runtime = time.time() - start_time
+            distil_runtime = round(distil_runtime, 2)
+            return result
+        distil_pipe._forward = _forward_distil_time
+        distil_text = distil_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
+        yield distil_text, distil_runtime, None, None
+        def _forward_time(*args, **kwargs):
+            global runtime
+            start_time = time.time()
+            result = pipe_forward(*args, **kwargs)
+            runtime = time.time() - start_time
+            runtime = round(runtime, 2)
+            return result
+        pipe._forward = _forward_time
+        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+        yield distil_text, distil_runtime, text, runtime
+    else:
+        input_features = processor(inputs, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt").input_features
+        # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+        generation_kwargs = dict(input_features, streamer=streamer, max_new_tokens=128, language="en", task="transcribe")
+        thread = Thread(target=distilled_model.generate, kwargs=generation_kwargs)
+        thread.start()
         start_time = time.time()
+        distil_text = ""
+        for generated_text in streamer:
+            distil_text += generated_text
+            yield distil_text, None, None, None
         distil_runtime = time.time() - start_time
         distil_runtime = round(distil_runtime, 2)
+        yield distil_text, distil_runtime, None, None
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
         start_time = time.time()
+        text = ""
+        for generated_text in streamer:
+            text += generated_text
+            yield distil_text, distil_runtime, text, None
         runtime = time.time() - start_time
         runtime = round(runtime, 2)
+        yield distil_text, distil_runtime, text, runtime
 if __name__ == "__main__":
     with gr.Blocks() as demo: