Abid Ali Awan committed
Commit e4e6a48 · Parent: f4b6d22

Update README.md: change the emoji, color scheme, and short description to better reflect the project's focus on Urdu speech-to-text using faster-whisper.

Files changed (3)
  1. README.md +4 -4
  2. app.py +125 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Faster Urdu ASR
- emoji: 😻
- colorFrom: green
- colorTo: blue
+ emoji: 🏎️
+ colorFrom: red
+ colorTo: yellow
  sdk: gradio
  sdk_version: 5.35.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: Faster Whisper with CT2 on CPU.
+ short_description: Best Urdu speech to text using faster-whisper.
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,125 @@
+ # app.py – Urdu Whisper (CT2) transcription demo with upload + record
+
+ import gradio as gr
+ import faster_whisper
+ import torch
+ from datetime import timedelta
+ import json
+ import os
+
+ # (Optional) cache Hugging Face files in a persistent dir when running in Spaces
+ os.environ["HF_HOME"] = "/home/user/app/.cache"
+
+ # Show GPU availability
+ print(f"CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+     print(f"GPU: {torch.cuda.get_device_name(0)}")
+
+ # Load the Urdu CT2 Whisper model
+ print("Loading model... this may take a minute the first time.")
+ model = faster_whisper.WhisperModel(
+     "kingabzpro/whisper-large-v3-urdu-ct2",
+     device="cuda" if torch.cuda.is_available() else "cpu",
+     compute_type="float16" if torch.cuda.is_available() else "float32",
+ )
+ print("✅ Model loaded successfully!")
+
+
+ def format_timestamp(seconds, format_type="srt"):
+     delta = timedelta(seconds=seconds)
+     hours = int(delta.total_seconds()) // 3600
+     minutes = (int(delta.total_seconds()) % 3600) // 60
+     sec = int(delta.total_seconds()) % 60
+     ms = int(delta.microseconds / 1000)
+     sep = "," if format_type == "srt" else "."  # SRT uses a comma before ms, VTT a period
+     return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"
+
+
+ def transcribe_audio(uploaded_path, recorded_path, output_format, beam_size):
+     # choose recorded over uploaded if present
+     audio_path = recorded_path or uploaded_path
+     if not audio_path:
+         raise gr.Error("Please upload or record an audio clip.")
+     segments_gen, info = model.transcribe(
+         audio_path,
+         language="ur",
+         beam_size=int(beam_size),  # slider values may arrive as floats
+         word_timestamps=True,
+         condition_on_previous_text=False,
+         vad_filter=True,
+         vad_parameters=dict(min_silence_duration_ms=500),
+     )
+
+     segments, full = [], []
+     for seg in segments_gen:  # consuming the generator runs the actual decoding
+         segments.append({"start": seg.start, "end": seg.end, "text": seg.text.strip()})
+         full.append(seg.text.strip())
+
+     if output_format == "text":
+         return " ".join(full)
+     if output_format == "srt":
+         lines = []
+         for i, s in enumerate(segments, 1):
+             lines += [
+                 str(i),
+                 f"{format_timestamp(s['start'])} --> {format_timestamp(s['end'])}",
+                 s["text"],
+                 "",
+             ]
+         return "\n".join(lines)
+     if output_format == "vtt":
+         lines = ["WEBVTT", ""]
+         for s in segments:
+             lines += [
+                 f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
+                 s["text"],
+                 "",
+             ]
+         return "\n".join(lines)
+     if output_format == "json":
+         return json.dumps(
+             {
+                 "text": " ".join(full),
+                 "segments": segments,
+                 "language": info.language,
+                 "language_probability": info.language_probability,
+                 "duration": info.duration,
+                 "duration_after_vad": info.duration_after_vad,
+             },
+             ensure_ascii=False,
+             indent=2,
+         )
+     raise gr.Error(f"Unsupported format: {output_format}")
+
+
+ with gr.Blocks(title="Urdu Whisper Transcription") as iface:
+     gr.Markdown("## Urdu Whisper Transcription")
+     with gr.Row():
+         with gr.Column():
+             upload = gr.Audio(  # Gradio 4+ takes sources=[...], not source=
+                 sources=["upload"], type="filepath", label="Upload Audio File"
+             )
+             record = gr.Audio(
+                 sources=["microphone"], type="filepath", label="Record Audio"
+             )
+             fmt = gr.Radio(
+                 choices=["text", "srt", "vtt", "json"],
+                 value="text",
+                 label="Output Format",
+             )
+             beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
+             btn = gr.Button("Transcribe", variant="primary")
+         with gr.Column():
+             out = gr.Textbox(
+                 label="Result", lines=20, max_lines=30, show_copy_button=True
+             )
+
+     btn.click(
+         fn=transcribe_audio,
+         inputs=[upload, record, fmt, beam],
+         outputs=out,
+         api_name="predict",
+     )
+
+ if __name__ == "__main__":
+     iface.launch()
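
Because the Transcribe button is wired up with api_name="predict", the Space can also be called programmatically. A minimal sketch using gradio_client follows; the Space id and audio filename are placeholders, not taken from this commit:

    # call_space.py – query the /predict endpoint of the deployed Space
    from gradio_client import Client, handle_file

    client = Client("kingabzpro/Faster-Urdu-ASR")  # hypothetical Space id; replace with the real one
    result = client.predict(
        handle_file("sample_urdu.wav"),  # uploaded audio (first gr.Audio input)
        None,                            # recorded audio (microphone input, unused here)
        "srt",                           # output format: text | srt | vtt | json
        5,                               # beam size
        api_name="/predict",
    )
    print(result)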
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ faster-whisper==1.1.1
+ torch  # app.py imports torch for the CUDA check; not preinstalled in Gradio Spaces
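
For local testing outside Spaces, where gradio is not provided by the SDK runtime, something along these lines should work (the gradio pin mirrors sdk_version in the README front matter; the exact torch build depends on your platform):

    pip install "faster-whisper==1.1.1" "gradio==5.35.0" torch
    python app.py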