## Imports from __future__ import unicode_literals import cv2 import pandas as pd from moviepy import VideoFileClip from moviepy.video.tools.subtitles import SubtitlesClip import os import spaces from moviepy.video.tools.subtitles import SubtitlesClip from moviepy.video.io.VideoFileClip import VideoFileClip from moviepy import CompositeVideoClip from moviepy import TextClip import nemo.collections.asr as nemo_asr import gradio as gr #!/usr/bin/env python3 import csv, re, sys from pathlib import Path import urllib.request urllib.request.urlretrieve("https://github.com/Jameshskelton/fonts/raw/refs/heads/main/P052-Roman.ttf", "P052-Roman.ttf") def parse_time_to_srt(t): s = str(t).strip() if re.fullmatch(r"\d+(\.\d+)?", s): total_ms = int(round(float(s) * 1000)) else: parts = s.split(':') if len(parts) == 2: mm, ss = parts sec = float(ss) total_ms = int(round((int(mm) * 60 + sec) * 1000)) elif len(parts) == 3: hh, mm, ss = parts sec = float(ss) total_ms = int(round(((int(hh) * 3600) + (int(mm) * 60) + sec) * 1000)) else: raise ValueError(f"Unrecognized time format: {s}") hours = total_ms // 3_600_000 rem = total_ms % 3_600_000 minutes = rem // 60_000 rem = rem % 60_000 seconds = rem // 1000 millis = rem % 1000 return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}" def map_position_to_tag(pos): if not pos: return "" s = str(pos).strip().lower() m = re.search(r"\\?an([1-9])", s) if m: return "{\\an" + m.group(1) + "}" if "top left" in s or ("top" in s and "left" in s): return "{\\an7}" if "top right" in s or ("top" in s and "right" in s): return "{\\an9}" if "bottom left" in s or ("bottom" in s and "left" in s): return "{\\an1}" if "bottom right" in s or ("bottom" in s and "right" in s): return "{\\an3}" if "top" in s: return "{\\an8}" if "middle" in s or "center" in s or "centre" in s: return "{\\an5}" if "bottom" in s: return "{\\an2}" return "" def looks_like_header(row): joined = ",".join(c.strip().lower() for c in row[:4]) header_words = ["position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption"] return any(w in joined for w in header_words) def csv_to_srt(csv_path: Path, srt_path: Path): with open(f'{csv_path}',"r", encoding="utf-8-sig", newline="") as f: reader = csv.reader(f) rows = [row for row in reader if any(cell.strip() for cell in row)] if not rows: raise ValueError("CSV is empty.") start_index = 1 if looks_like_header(rows[0]) else 0 normalized = [] for i, row in enumerate(rows[start_index:], start=start_index+1): if len(row) < 4: raise ValueError(f"Row {i} has fewer than 4 columns: {row}") position, start, end, text = row[0].strip(), row[1].strip(), row[2].strip(), row[3] normalized.append((position, start, end, text)) with open(f"{srt_path}", "w", encoding="utf-8") as out: for idx, (position, start, end, text) in enumerate(normalized, start=1): start_srt = parse_time_to_srt(start) end_srt = parse_time_to_srt(end) pos_tag = map_position_to_tag(position) final_text = f"{pos_tag}{text}" if pos_tag else text out.write(f"{idx}\n") out.write(f"{start_srt} --> {end_srt}\n") out.write(f"{final_text}\n\n") from pydub import AudioSegment def convert_audio_to_mono_16khz(input_path, output_path): """ Converts an audio file to mono and resamples it to 16 kHz. Args: input_path (str): The path to the input audio file. output_path (str): The path to save the converted audio file. """ try: audio = AudioSegment.from_file(input_path) # Set channels to 1 (mono) audio = audio.set_channels(1) # Set frame rate (sample rate) to 16000 Hz audio = audio.set_frame_rate(16000) audio.export(output_path, format="wav") # Export as WAV or desired format print(f"Audio converted successfully to mono, 16kHz at: {output_path}") except Exception as e: print(f"Error converting audio: {e}") asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3") @spaces.GPU def subtitle_video(input_file): # ------------------------------------------------------------------------------------------------------------------------------ # Params: # ------------------------------------------------------------------------------------------------------------------------------ #. name: str, name of directory to store files in in experiments folder # audio_file: str, path to extracted audio file for Whisper # input_file: str, path to video file for MoviePy to caption # output: str, destination of final output video file # lang: str, language # uploaded_vid: str, path to uploaded video file if download is False # #-------------------------------------------------------------------------------------------------------------------------------- # Returns: An annotated video with translated captions into english, saved to name/output #-------------------------------------------------------------------------------------------------------------------------------- ## First, this checks if your expermiment name is taken. If not, it will create the directory. ## Otherwise, we will be prompted to retry with a new name name = 'run' try: os.mkdir(f'experiments/{name}') print('Starting AutoCaptioning...') print(f'Results will be stored in experiments/{name}') except: None # Use local clip if not downloading from youtube my_clip = VideoFileClip(input_file) my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}") my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav', codec="mp3") # Instantiate parakeet model model = asr_model # convert to format parakeet can interpret convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav') # transcribe audio output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True) # Convert audio to text with timestamps, dump into dataframe df = pd.DataFrame(output[0].timestamp['segment']) df['text'] = df['segment'] df = df.drop(['start_offset', 'end_offset', 'segment'],axis = 1) # save csv and srt files df.to_csv(f'experiments/{name}/subs.csv') csv_to_srt(f"experiments/{name}/subs.csv",f"experiments/{name}/subs.srt") # Capture video vidcap = cv2.VideoCapture(f'''experiments/{name}/{input_file}''') success, image = vidcap.read() # Instantiate MoviePy subtitle generator with TextClip, subtitles, and SubtitlesClip generator = lambda txt: TextClip( "./P052-Roman.ttf", text = txt, font_size = int(my_clip.w/50), stroke_width=1, color= "white", stroke_color="black", size = (my_clip.w, my_clip.h), vertical_align = 'bottom', horizontal_align = 'center', method='caption') subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator) video = VideoFileClip(input_file) final = CompositeVideoClip([video, subs]) final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac") return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt" with gr.Blocks() as demo: gr.Markdown("