nb-whisper-demo

Sleeping

App Files Files Community

pere committed on Oct 8, 2024

Commit

0f57ece

1 Parent(s): ecc7149

test

Browse files

Files changed (1) hide show

app.py +47 -43

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import time
 import os
 import torch
@@ -21,14 +22,14 @@ lang = "no"
 share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
 auth_token = os.environ.get("AUTH_TOKEN") or True
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
-        chunk_length_s=26,
         device=device,
         token=auth_token,
         torch_dtype=torch.float16,
@@ -41,9 +42,17 @@ def pipe(file, return_timestamps=False):
     )
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
 def transcribe(file, return_timestamps=False):
     if not return_timestamps:
         text = pipe(file)["text"]
     else:
         chunks = pipe(file, return_timestamps=True)["chunks"]
         text = []
@@ -52,8 +61,8 @@ def transcribe(file, return_timestamps=False):
             end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
             line = f"[{start_time} -> {end_time}] {chunk['text']}"
             text.append(line)
-        text = "\n".join(text)
-    return text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
@@ -83,49 +92,44 @@ def yt_transcribe(yt_url, return_timestamps=False):
     return html_embed_str, text
-demo = gr.Blocks()
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
-        gr.components.Checkbox(label="Return timestamps"),
-    ],
-    outputs="text",
-    title="NB-Whisper",
-    description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
-    ),
-    allow_flagging="never",
-)
-yt_transcribe_interface = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[
-        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.components.Checkbox(label="Return timestamps"),
-    ],
-    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
-    outputs=["html", "text"],
-    title="Whisper Demo: Transcribe YouTube",
-    description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-        " arbitrary length."
-    ),
-    allow_flagging="never",
-)
 with demo:
-    gr.TabbedInterface(
-        [mf_transcribe,
-         # yt_transcribe_interface
         ],
-        ["Transcribe Audio",
-         # "Transcribe YouTube"
-        ]
     )
 demo.launch(share=share).queue()

 import time
 import os
+import re
 import torch
 share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
 auth_token = os.environ.get("AUTH_TOKEN") or True
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print(f"Bruker enhet: {device}")
 @spaces.GPU(duration=60 * 2)
 def pipe(file, return_timestamps=False):
     asr = pipeline(
         task="automatic-speech-recognition",
         model=MODEL_NAME,
+        chunk_length_s=30,
         device=device,
         token=auth_token,
         torch_dtype=torch.float16,
     )
     return asr(file, return_timestamps=return_timestamps, batch_size=24)
def format_output(text):
    """Break transcribed text into lines at sentence-ending punctuation.

    A single newline is inserted after every run of ``.``, ``!``, ``:`` or
    ``?`` characters. A run such as ``...`` or ``?!`` is treated as one unit,
    so it is followed by exactly one newline rather than being split apart.

    Args:
        text: The raw transcription string.

    Returns:
        The text with a newline appended after each punctuation run. Text
        without any of these characters is returned unchanged.
    """
    # One pass only: a second substitution over the same characters would
    # match the punctuation again and emit a duplicate newline after it.
    return re.sub(r'[.!:?]+', r'\g<0>\n', text)
 def transcribe(file, return_timestamps=False):
     if not return_timestamps:
         text = pipe(file)["text"]
+        formatted_text = format_output(text)
     else:
         chunks = pipe(file, return_timestamps=True)["chunks"]
         text = []
             end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
             line = f"[{start_time} -> {end_time}] {chunk['text']}"
             text.append(line)
+        formatted_text = "\n".join(text)
+    return formatted_text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     return html_embed_str, text
+# Lag Gradio-appen uten faner
+demo = gr.Blocks()
 with demo:
+    mf_transcribe = gr.Interface(
+        fn=transcribe,
+        inputs=[
+            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
+            gr.components.Checkbox(label="Inkluder tidsstempler"),
         ],
+        outputs="text",
+        title="NB-Whisper",
+        description=(
+            "Transkriber lange lydopptak fra mikrofon eller lydfiler med et enkelt klikk! Demoen bruker den fintunede"
+            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler"
+            " av vilkårlig lengde."
+        ),
+        allow_flagging="never",
     )
+    # Uncomment to add the YouTube transcription interface if needed
+    # yt_transcribe_interface = gr.Interface(
+    #     fn=yt_transcribe,
+    #     inputs=[
+    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
+    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
+    #     ],
+    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
+    #     outputs=["html", "text"],
+    #     title="Whisper Demo: Transkriber YouTube",
+    #     description=(
+    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
+    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
+    #         " vilkårlig lengde."
+    #     ),
+    #     allow_flagging="never",
+    # )
# Start the demo without tabs.
# Queueing has to be configured on the Blocks app before it is launched;
# launch() does not return the app, so chaining .queue() after it is invalid.
demo.queue().launch(share=share)