g1
Browse files
app.py
CHANGED
|
@@ -95,37 +95,42 @@ def download_youtube_audio(url, method_choice):
|
|
| 95 |
|
| 96 |
def yt_dlp_method(url):
|
| 97 |
"""
|
| 98 |
-
Downloads YouTube audio using yt-dlp.
|
| 99 |
-
|
| 100 |
Args:
|
| 101 |
url (str): The YouTube URL.
|
| 102 |
-
|
| 103 |
Returns:
|
| 104 |
str: Path to the downloaded audio file.
|
| 105 |
"""
|
| 106 |
logging.info("Using yt-dlp method")
|
|
|
|
|
|
|
| 107 |
ydl_opts = {
|
| 108 |
'format': 'bestaudio/best',
|
|
|
|
| 109 |
'postprocessors': [{
|
| 110 |
'key': 'FFmpegExtractAudio',
|
| 111 |
'preferredcodec': 'mp3',
|
| 112 |
'preferredquality': '192',
|
| 113 |
}],
|
| 114 |
-
'
|
|
|
|
| 115 |
}
|
| 116 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 117 |
info = ydl.extract_info(url, download=True)
|
| 118 |
-
output_file =
|
|
|
|
| 119 |
logging.info(f"Downloaded YouTube audio: {output_file}")
|
| 120 |
return output_file
|
| 121 |
|
| 122 |
def pytube_method(url):
|
| 123 |
"""
|
| 124 |
-
Downloads audio using pytube.
|
| 125 |
-
|
| 126 |
Args:
|
| 127 |
url (str): The YouTube URL.
|
| 128 |
-
|
| 129 |
Returns:
|
| 130 |
str: Path to the downloaded audio file.
|
| 131 |
"""
|
|
@@ -133,7 +138,8 @@ def pytube_method(url):
|
|
| 133 |
from pytube import YouTube
|
| 134 |
yt = YouTube(url)
|
| 135 |
audio_stream = yt.streams.filter(only_audio=True).first()
|
| 136 |
-
|
|
|
|
| 137 |
base, ext = os.path.splitext(out_file)
|
| 138 |
new_file = base + '.mp3'
|
| 139 |
os.rename(out_file, new_file)
|
|
@@ -396,10 +402,10 @@ loaded_models = {}
|
|
| 396 |
|
| 397 |
def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
| 398 |
"""
|
| 399 |
-
Transcribes audio from a given
|
| 400 |
|
| 401 |
Args:
|
| 402 |
-
input_source (str
|
| 403 |
pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
|
| 404 |
model_id (str): The ID of the model to use.
|
| 405 |
dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
|
|
@@ -424,32 +430,22 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 424 |
if verbose:
|
| 425 |
yield verbose_messages, "", None
|
| 426 |
|
| 427 |
-
#
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
return
|
| 437 |
-
elif isinstance(input_source, str) and os.path.exists(input_source):
|
| 438 |
-
# Input source is a local file path
|
| 439 |
-
audio_path = input_source
|
| 440 |
-
is_temp_file = False
|
| 441 |
-
elif hasattr(input_source, 'name'):
|
| 442 |
-
# Input source is an uploaded file object
|
| 443 |
-
audio_path = input_source.name
|
| 444 |
-
is_temp_file = False
|
| 445 |
-
else:
|
| 446 |
-
yield "No valid audio source provided.", "", None
|
| 447 |
return
|
| 448 |
|
| 449 |
# Convert start_time and end_time to float or None
|
| 450 |
start_time = float(start_time) if start_time else None
|
| 451 |
end_time = float(end_time) if end_time else None
|
| 452 |
|
|
|
|
| 453 |
if start_time is not None or end_time is not None:
|
| 454 |
audio_path = trim_audio(audio_path, start_time, end_time)
|
| 455 |
is_temp_file = True # The trimmed audio is a temporary file
|
|
@@ -463,6 +459,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 463 |
model_or_pipeline = loaded_models[model_key]
|
| 464 |
logging.info("Loaded model from cache")
|
| 465 |
else:
|
|
|
|
| 466 |
if pipeline_type == "faster-batched":
|
| 467 |
model = WhisperModel(model_id, device=device, compute_type=dtype)
|
| 468 |
model_or_pipeline = BatchedInferencePipeline(model=model)
|
|
@@ -495,6 +492,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 495 |
raise ValueError("Invalid pipeline type")
|
| 496 |
loaded_models[model_key] = model_or_pipeline # Cache the model or pipeline
|
| 497 |
|
|
|
|
| 498 |
start_time_perf = time.time()
|
| 499 |
if pipeline_type == "faster-batched":
|
| 500 |
segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
|
|
@@ -505,6 +503,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 505 |
segments = result["chunks"]
|
| 506 |
end_time_perf = time.time()
|
| 507 |
|
|
|
|
| 508 |
transcription_time = end_time_perf - start_time_perf
|
| 509 |
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
| 510 |
|
|
@@ -516,6 +515,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 516 |
if verbose:
|
| 517 |
yield verbose_messages + metrics_output, "", None
|
| 518 |
|
|
|
|
| 519 |
transcription = ""
|
| 520 |
|
| 521 |
for segment in segments:
|
|
@@ -527,6 +527,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
| 527 |
if verbose:
|
| 528 |
yield verbose_messages + metrics_output, transcription, None
|
| 529 |
|
|
|
|
| 530 |
transcription_file = save_transcription(transcription)
|
| 531 |
yield verbose_messages + metrics_output, transcription, transcription_file
|
| 532 |
|
|
|
|
| 95 |
|
| 96 |
def yt_dlp_method(url):
    """
    Download the audio track of a YouTube video using yt-dlp.

    The audio is extracted to MP3 (192 kbps via FFmpeg) inside a fresh
    temporary directory created for this call.

    Args:
        url (str): The YouTube URL.

    Returns:
        str: Path to the downloaded audio file.
    """
    logging.info("Using yt-dlp method")
    download_dir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(download_dir, '%(id)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
        'no_warnings': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # prepare_filename() reports the pre-conversion name; swap the
        # extension for the .mp3 produced by the FFmpeg post-processor.
        root, _ = os.path.splitext(ydl.prepare_filename(info))
        output_file = root + '.mp3'
        logging.info(f"Downloaded YouTube audio: {output_file}")
        return output_file
|
| 126 |
|
| 127 |
def pytube_method(url):
|
| 128 |
"""
|
| 129 |
+
Downloads audio from a YouTube URL using pytube and saves it to a temporary file.
|
| 130 |
+
|
| 131 |
Args:
|
| 132 |
url (str): The YouTube URL.
|
| 133 |
+
|
| 134 |
Returns:
|
| 135 |
str: Path to the downloaded audio file.
|
| 136 |
"""
|
|
|
|
| 138 |
from pytube import YouTube
|
| 139 |
yt = YouTube(url)
|
| 140 |
audio_stream = yt.streams.filter(only_audio=True).first()
|
| 141 |
+
temp_dir = tempfile.mkdtemp()
|
| 142 |
+
out_file = audio_stream.download(output_path=temp_dir)
|
| 143 |
base, ext = os.path.splitext(out_file)
|
| 144 |
new_file = base + '.mp3'
|
| 145 |
os.rename(out_file, new_file)
|
|
|
|
| 402 |
|
| 403 |
def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
| 404 |
"""
|
| 405 |
+
Transcribes audio from a given URL using the specified pipeline and model.
|
| 406 |
|
| 407 |
Args:
|
| 408 |
+
input_source (str): URL of the audio.
|
| 409 |
pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
|
| 410 |
model_id (str): The ID of the model to use.
|
| 411 |
dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
|
|
|
|
| 430 |
if verbose:
|
| 431 |
yield verbose_messages, "", None
|
| 432 |
|
| 433 |
+
# Input source is expected to be a URL
|
| 434 |
+
if not input_source or not input_source.strip():
|
| 435 |
+
yield "No audio URL provided.", "", None
|
| 436 |
+
return
|
| 437 |
+
|
| 438 |
+
# Download the audio from the URL
|
| 439 |
+
audio_path, is_temp_file = download_audio(input_source, download_method)
|
| 440 |
+
if not audio_path or audio_path.startswith("Error"):
|
| 441 |
+
yield f"Error downloading audio: {audio_path}", "", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
return
|
| 443 |
|
| 444 |
# Convert start_time and end_time to float or None
|
| 445 |
start_time = float(start_time) if start_time else None
|
| 446 |
end_time = float(end_time) if end_time else None
|
| 447 |
|
| 448 |
+
# Trim the audio if start or end times are provided
|
| 449 |
if start_time is not None or end_time is not None:
|
| 450 |
audio_path = trim_audio(audio_path, start_time, end_time)
|
| 451 |
is_temp_file = True # The trimmed audio is a temporary file
|
|
|
|
| 459 |
model_or_pipeline = loaded_models[model_key]
|
| 460 |
logging.info("Loaded model from cache")
|
| 461 |
else:
|
| 462 |
+
# Load the appropriate model or pipeline based on the pipeline type
|
| 463 |
if pipeline_type == "faster-batched":
|
| 464 |
model = WhisperModel(model_id, device=device, compute_type=dtype)
|
| 465 |
model_or_pipeline = BatchedInferencePipeline(model=model)
|
|
|
|
| 492 |
raise ValueError("Invalid pipeline type")
|
| 493 |
loaded_models[model_key] = model_or_pipeline # Cache the model or pipeline
|
| 494 |
|
| 495 |
+
# Perform the transcription
|
| 496 |
start_time_perf = time.time()
|
| 497 |
if pipeline_type == "faster-batched":
|
| 498 |
segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
|
|
|
|
| 503 |
segments = result["chunks"]
|
| 504 |
end_time_perf = time.time()
|
| 505 |
|
| 506 |
+
# Calculate metrics
|
| 507 |
transcription_time = end_time_perf - start_time_perf
|
| 508 |
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
| 509 |
|
|
|
|
| 515 |
if verbose:
|
| 516 |
yield verbose_messages + metrics_output, "", None
|
| 517 |
|
| 518 |
+
# Compile the transcription text
|
| 519 |
transcription = ""
|
| 520 |
|
| 521 |
for segment in segments:
|
|
|
|
| 527 |
if verbose:
|
| 528 |
yield verbose_messages + metrics_output, transcription, None
|
| 529 |
|
| 530 |
+
# Save the transcription to a file
|
| 531 |
transcription_file = save_transcription(transcription)
|
| 532 |
yield verbose_messages + metrics_output, transcription, transcription_file
|
| 533 |
|