Abid Ali Awan committed
Commit e4e6a48 · Parent: f4b6d22

Update README.md: change the emoji, color scheme, and short description to better reflect the project's focus on Urdu speech-to-text using faster-whisper.

Files changed (3)
  1. README.md +4 -4
  2. app.py +125 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  title: Faster Urdu ASR
- emoji: 😻
- colorFrom: green
- colorTo: blue
+ emoji: 🏎️
+ colorFrom: red
+ colorTo: yellow
  sdk: gradio
  sdk_version: 5.35.0
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: Faster Whisper with CT2 on CPU.
+ short_description: Best Urdu speech to text using faster-whisper.
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,125 @@
+ # app.py – Urdu Whisper (CT2) transcription demo with upload + record
+
+ import gradio as gr
+ import faster_whisper
+ import torch
+ from datetime import timedelta
+ import json
+ import os
+
+ # (Optional) cache Hugging Face files in a persistent dir when running in Spaces
+ os.environ["HF_HOME"] = "/home/user/app/.cache"
+
+ # Show GPU availability
+ print(f"CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+     print(f"GPU: {torch.cuda.get_device_name(0)}")
+
+ # Load the Urdu CT2 Whisper model
+ print("Loading model... this may take a minute the first time.")
+ model = faster_whisper.WhisperModel(
+     "kingabzpro/whisper-large-v3-urdu-ct2",
+     device="cuda" if torch.cuda.is_available() else "cpu",
+     compute_type="float16" if torch.cuda.is_available() else "float32",
+ )
+ print("✅ Model loaded successfully!")
+
+
+ def format_timestamp(seconds, format_type="srt"):
+     delta = timedelta(seconds=seconds)
+     hours = int(delta.total_seconds()) // 3600
+     minutes = (int(delta.total_seconds()) % 3600) // 60
+     sec = int(delta.total_seconds()) % 60
+     ms = int(delta.microseconds / 1000)
+     sep = "," if format_type == "srt" else "."  # SRT uses a comma before ms, VTT a period
+     return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"
+
+
+ def transcribe_audio(uploaded_path, recorded_path, output_format, beam_size):
+     # choose recorded over uploaded if present
+     audio_path = recorded_path or uploaded_path
+     if not audio_path:
+         raise gr.Error("Please upload or record an audio clip.")
+     segments_gen, info = model.transcribe(
+         audio_path,
+         language="ur",
+         beam_size=int(beam_size),  # slider values may arrive as floats
+         word_timestamps=True,
+         condition_on_previous_text=False,
+         vad_filter=True,
+         vad_parameters=dict(min_silence_duration_ms=500),
+     )
+
+     segments, full = [], []
+     for seg in segments_gen:  # consuming the generator runs the actual decoding
+         segments.append({"start": seg.start, "end": seg.end, "text": seg.text.strip()})
+         full.append(seg.text.strip())
+
+     if output_format == "text":
+         return " ".join(full)
+     if output_format == "srt":
+         lines = []
+         for i, s in enumerate(segments, 1):
+             lines += [
+                 str(i),
+                 f"{format_timestamp(s['start'])} --> {format_timestamp(s['end'])}",
+                 s["text"],
+                 "",
+             ]
+         return "\n".join(lines)
+     if output_format == "vtt":
+         lines = ["WEBVTT", ""]
+         for s in segments:
+             lines += [
+                 f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
+                 s["text"],
+                 "",
+             ]
+         return "\n".join(lines)
+     if output_format == "json":
+         return json.dumps(
+             {
+                 "text": " ".join(full),
+                 "segments": segments,
+                 "language": info.language,
+                 "language_probability": info.language_probability,
+                 "duration": info.duration,
+                 "duration_after_vad": info.duration_after_vad,
+             },
+             ensure_ascii=False,
+             indent=2,
+         )
+     raise gr.Error(f"Unsupported format: {output_format}")
+
+
+ with gr.Blocks(title="Urdu Whisper Transcription") as iface:
+     gr.Markdown("## Urdu Whisper Transcription")
+     with gr.Row():
+         with gr.Column():
+             upload = gr.Audio(  # Gradio 4+ takes sources=[...], not source=
+                 sources=["upload"], type="filepath", label="Upload Audio File"
+             )
+             record = gr.Audio(
+                 sources=["microphone"], type="filepath", label="Record Audio"
+             )
+             fmt = gr.Radio(
+                 choices=["text", "srt", "vtt", "json"],
+                 value="text",
+                 label="Output Format",
+             )
+             beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
+             btn = gr.Button("Transcribe", variant="primary")
+         with gr.Column():
+             out = gr.Textbox(
+                 label="Result", lines=20, max_lines=30, show_copy_button=True
+             )
+
+     btn.click(
+         fn=transcribe_audio,
+         inputs=[upload, record, fmt, beam],
+         outputs=out,
+         api_name="predict",
+     )
+
+ if __name__ == "__main__":
+     iface.launch()
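
Because the Transcribe button is wired up with api_name="predict", the Space can also be called programmatically. A minimal sketch using gradio_client follows; the Space id and audio filename are placeholders, not taken from this commit:

    # call_space.py – query the /predict endpoint of the deployed Space
    from gradio_client import Client, handle_file

    client = Client("kingabzpro/Faster-Urdu-ASR")  # hypothetical Space id; replace with the real one
    result = client.predict(
        handle_file("sample_urdu.wav"),  # uploaded audio (first gr.Audio input)
        None,                            # recorded audio (microphone input, unused here)
        "srt",                           # output format: text | srt | vtt | json
        5,                               # beam size
        api_name="/predict",
    )
    print(result)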
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ faster-whisper==1.1.1
+ torch  # app.py imports torch for the CUDA check; not preinstalled in Gradio Spaces
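
For local testing outside Spaces, where gradio is not provided by the SDK runtime, something along these lines should work (the gradio pin mirrors sdk_version in the README front matter; the exact torch build depends on your platform):

    pip install "faster-whisper==1.1.1" "gradio==5.35.0" torch
    python app.py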