#!/usr/bin/env python3
## Imports
from __future__ import unicode_literals
import csv
import os
import re
import urllib.request
from pathlib import Path

import cv2
import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
import spaces
from moviepy import CompositeVideoClip, TextClip, VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
from pydub import AudioSegment

# Fetch the font used to render the subtitle text
urllib.request.urlretrieve("https://github.com/Jameshskelton/fonts/raw/refs/heads/main/P052-Roman.ttf", "P052-Roman.ttf")
def parse_time_to_srt(t):
    s = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", s):
        total_ms = int(round(float(s) * 1000))
    else:
        parts = s.split(':')
        if len(parts) == 2:
            mm, ss = parts
            sec = float(ss)
            total_ms = int(round((int(mm) * 60 + sec) * 1000))
        elif len(parts) == 3:
            hh, mm, ss = parts
            sec = float(ss)
            total_ms = int(round(((int(hh) * 3600) + (int(mm) * 60) + sec) * 1000))
        else:
            raise ValueError(f"Unrecognized time format: {s}")
    hours = total_ms // 3_600_000
    rem = total_ms % 3_600_000
    minutes = rem // 60_000
    rem = rem % 60_000
    seconds = rem // 1000
    millis = rem % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
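
# Quick sanity checks for parse_time_to_srt (illustrative only, not part of the app flow):
#   parse_time_to_srt("83.5")         -> "00:01:23,500"
#   parse_time_to_srt("1:23.5")       -> "00:01:23,500"
#   parse_time_to_srt("01:02:03.004") -> "01:02:03,004"
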
def map_position_to_tag(pos):
    if not pos:
        return ""
    s = str(pos).strip().lower()
    m = re.search(r"\\?an([1-9])", s)
    if m:
        return "{\\an" + m.group(1) + "}"
    if "top left" in s or ("top" in s and "left" in s):
        return "{\\an7}"
    if "top right" in s or ("top" in s and "right" in s):
        return "{\\an9}"
    if "bottom left" in s or ("bottom" in s and "left" in s):
        return "{\\an1}"
    if "bottom right" in s or ("bottom" in s and "right" in s):
        return "{\\an3}"
    if "top" in s:
        return "{\\an8}"
    if "middle" in s or "center" in s or "centre" in s:
        return "{\\an5}"
    if "bottom" in s:
        return "{\\an2}"
    return ""
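
# Example mappings (illustrative): in this pipeline the position column is usually a
# numeric index or empty, in which case no alignment tag is emitted.
#   map_position_to_tag("top left") -> "{\an7}"
#   map_position_to_tag("bottom")   -> "{\an2}"
#   map_position_to_tag("")         -> ""
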
def looks_like_header(row):
    joined = ",".join(c.strip().lower() for c in row[:4])
    header_words = ["position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption"]
    return any(w in joined for w in header_words)
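
# Example (illustrative): the header row pandas writes below ("", start, end, text) is detected:
#   looks_like_header(["", "start", "end", "text"]) -> True
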
def csv_to_srt(csv_path: Path, srt_path: Path):
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        rows = [row for row in reader if any(cell.strip() for cell in row)]
    if not rows:
        raise ValueError("CSV is empty.")
    start_index = 1 if looks_like_header(rows[0]) else 0
    normalized = []
    for i, row in enumerate(rows[start_index:], start=start_index + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        position, start, end, text = row[0].strip(), row[1].strip(), row[2].strip(), row[3]
        normalized.append((position, start, end, text))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(normalized, start=1):
            start_srt = parse_time_to_srt(start)
            end_srt = parse_time_to_srt(end)
            pos_tag = map_position_to_tag(position)
            final_text = f"{pos_tag}{text}" if pos_tag else text
            out.write(f"{idx}\n")
            out.write(f"{start_srt} --> {end_srt}\n")
            out.write(f"{final_text}\n\n")
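
# Illustrative round trip, assuming the column layout written by df.to_csv below
# (index, start, end, text):
#   CSV row:  0,0.08,2.56,Hello and welcome.
#   SRT cue:  1
#             00:00:00,080 --> 00:00:02,560
#             Hello and welcome.
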
def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        # Set channels to 1 (mono)
        audio = audio.set_channels(1)
        # Set frame rate (sample rate) to 16000 Hz
        audio = audio.set_frame_rate(16000)
        audio.export(output_path, format="wav")  # Export as WAV or desired format
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        print(f"Error converting audio: {e}")
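
# Parakeet checkpoints are trained on 16 kHz mono audio, hence the conversion above.
# Illustrative call (hypothetical paths):
#   convert_audio_to_mono_16khz("input.mp3", "input_16k.wav")
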
# Load the Parakeet TDT 0.6B v3 multilingual ASR model once at startup
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
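
# On ZeroGPU Spaces, a GPU is attached only while a function decorated with
# spaces.GPU runs; this listing imports `spaces`, so the decorator is presumably
# intended on the transcription entry point (an assumption, not shown in the original).
@spaces.GPU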
def subtitle_video(input_file):
    # ------------------------------------------------------------------------------
    # Params:
    # ------------------------------------------------------------------------------
    #   input_file: str, path to the uploaded video file for MoviePy to caption
    #
    # Intermediate files (the copied video, the extracted audio for Parakeet, the
    # subtitle CSV/SRT pair, and the final captioned video) are written to the
    # experiments/<name> run directory.
    # ------------------------------------------------------------------------------
    # Returns: the path to the captioned output video, the subtitle dataframe,
    #          and the path to the generated SRT file
    # ------------------------------------------------------------------------------
    ## Create the run directory for this experiment if it does not already exist
    name = 'run'
    os.makedirs(f'experiments/{name}', exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in experiments/{name}')
    # Copy the uploaded clip into the run directory and extract its audio track
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    # Write the audio as plain PCM WAV so pydub and NeMo can read it
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav', codec="pcm_s16le")
    # Instantiate parakeet model
    model = asr_model
    # Convert to a format parakeet can interpret
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')
    # Transcribe the audio with segment-level timestamps
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)
    # Convert audio to text with timestamps, dump into dataframe
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)
    # Save CSV and SRT files
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")
    # Capture video (read a reference frame with OpenCV from the copied clip)
    vidcap = cv2.VideoCapture(f"experiments/{name}/{input_file.split('/')[-1]}")
    success, image = vidcap.read()
    # Instantiate MoviePy subtitle generator with TextClip, subtitles, and SubtitlesClip
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')
    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")
    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
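
# The captioning function can also be exercised outside of Gradio, e.g. (hypothetical path):
#   video_path, subs_df, srt_path = subtitle_video("my_video.mp4")
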
with gr.Blocks() as demo:
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    gr.on(
        triggers=[run_button.click],
        fn=subtitle_video,
        inputs=[
            input_video,
        ],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )
if __name__ == "__main__":
    demo.launch(share=True)