different models
Browse files
app.py
CHANGED
|
@@ -157,49 +157,48 @@ def save_transcription(transcription):
|
|
| 157 |
f.write(transcription)
|
| 158 |
return file_path
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
try:
|
| 162 |
-
if
|
| 163 |
-
model = WhisperModel(
|
| 164 |
-
|
| 165 |
-
elif
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
| 167 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 168 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
| 169 |
)
|
| 170 |
model.to(device)
|
| 171 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 172 |
-
|
| 173 |
"automatic-speech-recognition",
|
| 174 |
model=model,
|
| 175 |
tokenizer=processor.tokenizer,
|
| 176 |
feature_extractor=processor.feature_extractor,
|
| 177 |
-
max_new_tokens=128,
|
| 178 |
chunk_length_s=30,
|
| 179 |
batch_size=batch_size,
|
| 180 |
return_timestamps=True,
|
| 181 |
torch_dtype=torch_dtype,
|
| 182 |
device=device,
|
| 183 |
)
|
| 184 |
-
elif model_choice == "openai/whisper-large-v3":
|
| 185 |
-
model_id = "openai/whisper-large-v3"
|
| 186 |
-
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 187 |
-
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
| 188 |
-
)
|
| 189 |
-
model.to(device)
|
| 190 |
-
processor = AutoProcessor.from_pretrained(model_id)
|
| 191 |
-
pipe = pipeline(
|
| 192 |
-
"automatic-speech-recognition",
|
| 193 |
-
model=model,
|
| 194 |
-
tokenizer=processor.tokenizer,
|
| 195 |
-
feature_extractor=processor.feature_extractor,
|
| 196 |
-
torch_dtype=torch_dtype,
|
| 197 |
-
device=device,
|
| 198 |
-
)
|
| 199 |
else:
|
| 200 |
-
raise ValueError("Invalid
|
| 201 |
-
|
| 202 |
-
# Rest of the code remains the same
|
| 203 |
|
| 204 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
| 205 |
audio_path = download_audio(input_source, download_method)
|
|
@@ -268,7 +267,9 @@ iface = gr.Interface(
|
|
| 268 |
fn=transcribe_audio,
|
| 269 |
inputs=[
|
| 270 |
gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
|
| 271 |
-
gr.Dropdown(choices=["faster-
|
|
|
|
|
|
|
| 272 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
| 273 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
| 274 |
gr.Number(label="Start Time (seconds)", value=0),
|
|
@@ -280,15 +281,20 @@ iface = gr.Interface(
|
|
| 280 |
gr.Textbox(label="Transcription", lines=10),
|
| 281 |
gr.File(label="Download Transcription")
|
| 282 |
],
|
| 283 |
-
title="Multi-
|
| 284 |
-
description="Transcribe audio using multiple models.",
|
| 285 |
examples=[
|
| 286 |
-
["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
|
| 287 |
-
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "
|
| 288 |
-
["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
|
| 289 |
],
|
| 290 |
cache_examples=False,
|
| 291 |
live=True
|
| 292 |
)
|
| 293 |
|
| 294 |
-
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
f.write(transcription)
|
| 158 |
return file_path
|
| 159 |
|
| 160 |
+
|
| 161 |
+
def get_model_options(pipeline_type):
    """Return the model identifiers offered for the given pipeline type.

    Args:
        pipeline_type: Name of the transcription pipeline. The recognised
            values are "faster-batched", "faster-sequenced" and
            "transformers".

    Returns:
        A new list of model id strings for the pipeline, or an empty list
        when the pipeline type is not one of the recognised values.
    """
    # Both faster-whisper pipelines expose the same set of checkpoints.
    faster_whisper_models = (
        "cstr/whisper-large-v3-turbo-int8_float32",
        "deepdml/faster-whisper-large-v3-turbo-ct2",
        "Systran/faster-whisper-large-v3",
        "GalaktischeGurke/primeline-whisper-large-v3-german-ct2",
    )
    transformers_models = (
        "openai/whisper-large-v3",
        "openai/whisper-large-v3-turbo",
        "primeline/whisper-large-v3-german",
    )
    models_by_pipeline = (
        ("faster-batched", faster_whisper_models),
        ("faster-sequenced", faster_whisper_models),
        ("transformers", transformers_models),
    )

    # Equality scan (rather than a dict lookup) so unhashable inputs still
    # fall through to the empty result instead of raising.
    for known_type, model_ids in models_by_pipeline:
        if pipeline_type == known_type:
            return list(model_ids)
    return []
|
| 170 |
+
|
| 171 |
+
def update_model_dropdown(pipeline_type):
    """Build the update that refreshes the model dropdown for a pipeline type.

    Args:
        pipeline_type: Pipeline name currently selected in the UI.

    Returns:
        A Gradio update carrying the model choices for ``pipeline_type`` and
        the first of them as the selected value. When the pipeline type has
        no models, the choices are empty and the value is ``None``.
    """
    # Look the options up once; the list is reused for choices and default.
    options = get_model_options(pipeline_type)
    # An unrecognised pipeline type yields no options; indexing [0] on that
    # empty list would raise IndexError, so fall back to no selection.
    default_model = options[0] if options else None
    # gr.update is the generic form of the component-specific update helper;
    # it does not depend on the per-component ``Dropdown.update`` classmethod.
    return gr.update(choices=options, value=default_model)
|
| 173 |
+
|
| 174 |
+
def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
| 175 |
try:
|
| 176 |
+
if pipeline_type == "faster-batched":
|
| 177 |
+
model = WhisperModel(model_id, device="auto", compute_type=dtype)
|
| 178 |
+
pipeline = BatchedInferencePipeline(model=model)
|
| 179 |
+
elif pipeline_type == "faster-sequenced":
|
| 180 |
+
model = WhisperModel(model_id)
|
| 181 |
+
pipeline = model.transcribe
|
| 182 |
+
elif pipeline_type == "transformers":
|
| 183 |
+
torch_dtype = torch.float16 if dtype == "float16" else torch.float32
|
| 184 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 185 |
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
| 186 |
)
|
| 187 |
model.to(device)
|
| 188 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 189 |
+
pipeline = pipeline(
|
| 190 |
"automatic-speech-recognition",
|
| 191 |
model=model,
|
| 192 |
tokenizer=processor.tokenizer,
|
| 193 |
feature_extractor=processor.feature_extractor,
|
|
|
|
| 194 |
chunk_length_s=30,
|
| 195 |
batch_size=batch_size,
|
| 196 |
return_timestamps=True,
|
| 197 |
torch_dtype=torch_dtype,
|
| 198 |
device=device,
|
| 199 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
else:
|
| 201 |
+
raise ValueError("Invalid pipeline type")
|
|
|
|
|
|
|
| 202 |
|
| 203 |
if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
|
| 204 |
audio_path = download_audio(input_source, download_method)
|
|
|
|
| 267 |
fn=transcribe_audio,
|
| 268 |
inputs=[
|
| 269 |
gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
|
| 270 |
+
gr.Dropdown(choices=["faster-batched", "faster-sequenced", "transformers"], label="Pipeline Type", value="faster-batched"),
|
| 271 |
+
gr.Dropdown(label="Model", choices=get_model_options("faster-batched"), value=get_model_options("faster-batched")[0]),
|
| 272 |
+
gr.Dropdown(choices=["int8", "float16", "float32"], label="Data Type", value="int8"),
|
| 273 |
gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
|
| 274 |
gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
|
| 275 |
gr.Number(label="Start Time (seconds)", value=0),
|
|
|
|
| 281 |
gr.Textbox(label="Transcription", lines=10),
|
| 282 |
gr.File(label="Download Transcription")
|
| 283 |
],
|
| 284 |
+
title="Multi-Pipeline Transcription",
|
| 285 |
+
description="Transcribe audio using multiple pipelines and models.",
|
| 286 |
examples=[
|
| 287 |
+
["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", 0, None, False],
|
| 288 |
+
["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
|
| 289 |
+
["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, False]
|
| 290 |
],
|
| 291 |
cache_examples=False,
|
| 292 |
live=True
|
| 293 |
)
|
| 294 |
|
| 295 |
+
# Keep the "Model" dropdown in sync with the selected "Pipeline Type".
# Indices 1 and 2 follow the order of the ``inputs`` list passed to
# gr.Interface above (Pipeline Type, then Model).
# NOTE(review): uses ``input_components``, the attribute gr.Interface keeps its
# instantiated input components in - confirm against the pinned Gradio version.
pipeline_type_dropdown = iface.input_components[1]
model_dropdown = iface.input_components[2]

# Event listeners have to be registered inside a Blocks context, so re-enter
# the interface before attaching the change handler.
with iface:
    pipeline_type_dropdown.change(
        update_model_dropdown,
        inputs=[pipeline_type_dropdown],
        outputs=[model_dropdown],
    )

# Launch last: launch() blocks when run as a script, so anything placed after
# it would not take effect while the app is being served.
iface.launch()
|