#!/usr/bin/env python3
## Imports
from __future__ import unicode_literals

import csv
import os
import re
import urllib.request
from pathlib import Path

import cv2
import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
import spaces
from moviepy import CompositeVideoClip, TextClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
from pydub import AudioSegment

## Download the font used to render the subtitles
urllib.request.urlretrieve(
    "https://github.com/Jameshskelton/fonts/raw/refs/heads/main/P052-Roman.ttf",
    "P052-Roman.ttf",
)
def parse_time_to_srt(t):
    """Convert a time value (plain seconds, MM:SS, or HH:MM:SS) to the SRT format HH:MM:SS,mmm."""
    s = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", s):
        total_ms = int(round(float(s) * 1000))
    else:
        parts = s.split(':')
        if len(parts) == 2:
            mm, ss = parts
            sec = float(ss)
            total_ms = int(round((int(mm) * 60 + sec) * 1000))
        elif len(parts) == 3:
            hh, mm, ss = parts
            sec = float(ss)
            total_ms = int(round(((int(hh) * 3600) + (int(mm) * 60) + sec) * 1000))
        else:
            raise ValueError(f"Unrecognized time format: {s}")
    hours = total_ms // 3_600_000
    rem = total_ms % 3_600_000
    minutes = rem // 60_000
    rem = rem % 60_000
    seconds = rem // 1000
    millis = rem % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
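
# A few illustrative conversions (example inputs assumed, not from the original):
#   parse_time_to_srt("83.5")    -> "00:01:23,500"   (plain seconds)
#   parse_time_to_srt("01:23.5") -> "00:01:23,500"   (MM:SS)
#   parse_time_to_srt("1:02:03") -> "01:02:03,000"   (HH:MM:SS)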

def map_position_to_tag(pos):
    """Map a free-form position description to an ASS alignment override tag ({\\anN})."""
    if not pos:
        return ""
    s = str(pos).strip().lower()
    m = re.search(r"\\?an([1-9])", s)
    if m:
        return "{\\an" + m.group(1) + "}"
    if "top left" in s or ("top" in s and "left" in s):
        return "{\\an7}"
    if "top right" in s or ("top" in s and "right" in s):
        return "{\\an9}"
    if "bottom left" in s or ("bottom" in s and "left" in s):
        return "{\\an1}"
    if "bottom right" in s or ("bottom" in s and "right" in s):
        return "{\\an3}"
    if "top" in s:
        return "{\\an8}"
    if "middle" in s or "center" in s or "centre" in s:
        return "{\\an5}"
    if "bottom" in s:
        return "{\\an2}"
    return ""
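
# Illustrative mappings (example inputs assumed, not from the original):
#   map_position_to_tag("bottom right") -> "{\an3}"
#   map_position_to_tag("top")          -> "{\an8}"
#   map_position_to_tag("an5")          -> "{\an5}"  (explicit alignment codes pass through)
#   map_position_to_tag("")             -> ""        (no tag; player default position)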

def looks_like_header(row):
    """Heuristically detect whether the first CSV row is a header rather than data."""
    joined = ",".join(c.strip().lower() for c in row[:4])
    header_words = ["position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption"]
    return any(w in joined for w in header_words)
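
# Illustrative checks (example rows assumed, not from the original):
#   looks_like_header(["position", "start", "end", "text"]) -> True
#   looks_like_header(["bottom", "0.0", "2.5", "Hi"])       -> False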

def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a CSV of (position, start, end, text) rows into an SRT subtitle file."""
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        rows = [row for row in reader if any(cell.strip() for cell in row)]
    if not rows:
        raise ValueError("CSV is empty.")
    start_index = 1 if looks_like_header(rows[0]) else 0
    normalized = []
    for i, row in enumerate(rows[start_index:], start=start_index + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        position, start, end, text = row[0].strip(), row[1].strip(), row[2].strip(), row[3]
        normalized.append((position, start, end, text))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(normalized, start=1):
            start_srt = parse_time_to_srt(start)
            end_srt = parse_time_to_srt(end)
            pos_tag = map_position_to_tag(position)
            final_text = f"{pos_tag}{text}" if pos_tag else text
            out.write(f"{idx}\n")
            out.write(f"{start_srt} --> {end_srt}\n")
            out.write(f"{final_text}\n\n")
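
# Illustrative round trip (example file contents assumed, not from the original):
#   subs.csv:
#     position,start,end,text
#     bottom,0.0,2.5,Hello world
#   produces subs.srt:
#     1
#     00:00:00,000 --> 00:00:02,500
#     {\an2}Hello world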

def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        # Set channels to 1 (mono)
        audio = audio.set_channels(1)
        # Set frame rate (sample rate) to 16000 Hz
        audio = audio.set_frame_rate(16000)
        audio.export(output_path, format="wav")  # Export as WAV or desired format
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        print(f"Error converting audio: {e}")
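
# Example usage (hypothetical paths, not from the original); Parakeet expects 16 kHz mono input:
#   convert_audio_to_mono_16khz("raw_audio.mp3", "audio_16k_mono.wav")
# In-place conversion also works, since pydub loads the whole file before exporting:
#   convert_audio_to_mono_16khz("audio.wav", "audio.wav")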

# Load the Parakeet model once at module import time, outside the GPU-decorated
# function, so it is instantiated a single time rather than on every request
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")

@spaces.GPU
def subtitle_video(input_file):
    # ------------------------------------------------------------------------------------------------------------------------------
    # Params:
    # ------------------------------------------------------------------------------------------------------------------------------
    #   name: str, name of the directory inside the experiments folder where files are stored
    #   input_file: str, path to the uploaded video file for MoviePy to caption
    # ------------------------------------------------------------------------------------------------------------------------------
    # Returns: the captioned video (experiments/<name>/output.mp4), a DataFrame of the
    # transcribed segments, and the path to the generated SRT file
    # ------------------------------------------------------------------------------------------------------------------------------
    ## Create the experiment directory if it does not already exist
    name = 'run'
    os.makedirs(f'experiments/{name}', exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in experiments/{name}')
    # Copy the uploaded clip and its audio track into the experiment directory
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav')
    # Use the Parakeet model preloaded at module level
    model = asr_model
    # Convert the audio to the mono, 16 kHz format Parakeet expects
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')
    # Transcribe the audio with segment-level timestamps
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)
    # Dump the timestamped segments into a DataFrame
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)
    # Save CSV and SRT files
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")
    # Read the first frame of the copied video as a sanity check (the frame itself is unused)
    vidcap = cv2.VideoCapture(f"experiments/{name}/{input_file.split('/')[-1]}")
    success, image = vidcap.read()
    # Instantiate the MoviePy subtitle generator with TextClip, subtitles, and SubtitlesClip
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')
    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)
    # Composite the subtitles over the original video and render the result
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")
    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
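
# Example direct call, bypassing the Gradio UI (hypothetical path, not from the original):
#   out_path, segments_df, srt_path = subtitle_video("my_video.mp4")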

with gr.Blocks() as demo:
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    gr.on(
        triggers=[run_button.click],
        fn=subtitle_video,
        inputs=[input_video],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )

if __name__ == "__main__":
    demo.launch(share=True)