#!/usr/bin/env python3
## Imports
from __future__ import unicode_literals

import csv
import os
import re
import urllib.request
from pathlib import Path

import cv2
import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
import spaces
from moviepy import CompositeVideoClip, TextClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
from pydub import AudioSegment

## Download the font used to render the subtitles
urllib.request.urlretrieve(
    "https://github.com/Jameshskelton/fonts/raw/refs/heads/main/P052-Roman.ttf",
    "P052-Roman.ttf",
)
def parse_time_to_srt(t):
    """Convert a time value (plain seconds, MM:SS, or HH:MM:SS) to the SRT format HH:MM:SS,mmm."""
    s = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", s):
        total_ms = int(round(float(s) * 1000))
    else:
        parts = s.split(':')
        if len(parts) == 2:
            mm, ss = parts
            sec = float(ss)
            total_ms = int(round((int(mm) * 60 + sec) * 1000))
        elif len(parts) == 3:
            hh, mm, ss = parts
            sec = float(ss)
            total_ms = int(round(((int(hh) * 3600) + (int(mm) * 60) + sec) * 1000))
        else:
            raise ValueError(f"Unrecognized time format: {s}")
    hours = total_ms // 3_600_000
    rem = total_ms % 3_600_000
    minutes = rem // 60_000
    rem = rem % 60_000
    seconds = rem // 1000
    millis = rem % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
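
# A few illustrative conversions (example inputs assumed, not from the original):
#   parse_time_to_srt("83.5")    -> "00:01:23,500"   (plain seconds)
#   parse_time_to_srt("01:23.5") -> "00:01:23,500"   (MM:SS)
#   parse_time_to_srt("1:02:03") -> "01:02:03,000"   (HH:MM:SS)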

def map_position_to_tag(pos):
    """Map a free-form position description to an ASS alignment override tag ({\\anN})."""
    if not pos:
        return ""
    s = str(pos).strip().lower()
    m = re.search(r"\\?an([1-9])", s)
    if m:
        return "{\\an" + m.group(1) + "}"
    if "top left" in s or ("top" in s and "left" in s):
        return "{\\an7}"
    if "top right" in s or ("top" in s and "right" in s):
        return "{\\an9}"
    if "bottom left" in s or ("bottom" in s and "left" in s):
        return "{\\an1}"
    if "bottom right" in s or ("bottom" in s and "right" in s):
        return "{\\an3}"
    if "top" in s:
        return "{\\an8}"
    if "middle" in s or "center" in s or "centre" in s:
        return "{\\an5}"
    if "bottom" in s:
        return "{\\an2}"
    return ""
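
# Illustrative mappings (example inputs assumed, not from the original):
#   map_position_to_tag("bottom right") -> "{\an3}"
#   map_position_to_tag("top")          -> "{\an8}"
#   map_position_to_tag("an5")          -> "{\an5}"  (explicit alignment codes pass through)
#   map_position_to_tag("")             -> ""        (no tag; player default position)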

def looks_like_header(row):
    """Heuristically detect whether the first CSV row is a header rather than data."""
    joined = ",".join(c.strip().lower() for c in row[:4])
    header_words = ["position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption"]
    return any(w in joined for w in header_words)
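
# Illustrative checks (example rows assumed, not from the original):
#   looks_like_header(["position", "start", "end", "text"]) -> True
#   looks_like_header(["bottom", "0.0", "2.5", "Hi"])       -> False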

def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a CSV of (position, start, end, text) rows into an SRT subtitle file."""
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        rows = [row for row in reader if any(cell.strip() for cell in row)]
    if not rows:
        raise ValueError("CSV is empty.")
    start_index = 1 if looks_like_header(rows[0]) else 0
    normalized = []
    for i, row in enumerate(rows[start_index:], start=start_index + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        position, start, end, text = row[0].strip(), row[1].strip(), row[2].strip(), row[3]
        normalized.append((position, start, end, text))
    with open(srt_path, "w", encoding="utf-8") as out:
        for idx, (position, start, end, text) in enumerate(normalized, start=1):
            start_srt = parse_time_to_srt(start)
            end_srt = parse_time_to_srt(end)
            pos_tag = map_position_to_tag(position)
            final_text = f"{pos_tag}{text}" if pos_tag else text
            out.write(f"{idx}\n")
            out.write(f"{start_srt} --> {end_srt}\n")
            out.write(f"{final_text}\n\n")
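
# Illustrative round trip (example file contents assumed, not from the original):
#   subs.csv:
#     position,start,end,text
#     bottom,0.0,2.5,Hello world
#   produces subs.srt:
#     1
#     00:00:00,000 --> 00:00:02,500
#     {\an2}Hello world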

def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Converts an audio file to mono and resamples it to 16 kHz.

    Args:
        input_path (str): The path to the input audio file.
        output_path (str): The path to save the converted audio file.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        # Set channels to 1 (mono)
        audio = audio.set_channels(1)
        # Set frame rate (sample rate) to 16000 Hz
        audio = audio.set_frame_rate(16000)
        audio.export(output_path, format="wav")  # Export as WAV or desired format
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        print(f"Error converting audio: {e}")
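
# Example usage (hypothetical paths, not from the original); Parakeet expects 16 kHz mono input:
#   convert_audio_to_mono_16khz("raw_audio.mp3", "audio_16k_mono.wav")
# In-place conversion also works, since pydub loads the whole file before exporting:
#   convert_audio_to_mono_16khz("audio.wav", "audio.wav")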

# Load the Parakeet model once at module import time, outside the GPU-decorated
# function, so it is instantiated a single time rather than on every request
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")

@spaces.GPU
def subtitle_video(input_file):
    # ------------------------------------------------------------------------------------------------------------------------------
    # Params:
    # ------------------------------------------------------------------------------------------------------------------------------
    #   name: str, name of the directory inside the experiments folder where files are stored
    #   input_file: str, path to the uploaded video file for MoviePy to caption
    # ------------------------------------------------------------------------------------------------------------------------------
    # Returns: the captioned video (experiments/<name>/output.mp4), a DataFrame of the
    # transcribed segments, and the path to the generated SRT file
    # ------------------------------------------------------------------------------------------------------------------------------
    ## Create the experiment directory if it does not already exist
    name = 'run'
    os.makedirs(f'experiments/{name}', exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in experiments/{name}')
    # Copy the uploaded clip and its audio track into the experiment directory
    my_clip = VideoFileClip(input_file)
    my_clip.write_videofile(f"experiments/{name}/{input_file.split('/')[-1]}")
    my_clip.audio.write_audiofile(f'experiments/{name}/audio_file.wav')
    # Use the Parakeet model preloaded at module level
    model = asr_model
    # Convert the audio to the mono, 16 kHz format Parakeet expects
    convert_audio_to_mono_16khz(f'experiments/{name}/audio_file.wav', f'experiments/{name}/audio_file.wav')
    # Transcribe the audio with segment-level timestamps
    output = model.transcribe([f'experiments/{name}/audio_file.wav'], timestamps=True)
    # Dump the timestamped segments into a DataFrame
    df = pd.DataFrame(output[0].timestamp['segment'])
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)
    # Save CSV and SRT files
    df.to_csv(f'experiments/{name}/subs.csv')
    csv_to_srt(f"experiments/{name}/subs.csv", f"experiments/{name}/subs.srt")
    # Read the first frame of the copied video as a sanity check (the frame itself is unused)
    vidcap = cv2.VideoCapture(f"experiments/{name}/{input_file.split('/')[-1]}")
    success, image = vidcap.read()
    # Instantiate the MoviePy subtitle generator with TextClip, subtitles, and SubtitlesClip
    generator = lambda txt: TextClip(
        "./P052-Roman.ttf",
        text=txt,
        font_size=int(my_clip.w / 50),
        stroke_width=1,
        color="white",
        stroke_color="black",
        size=(my_clip.w, my_clip.h),
        vertical_align='bottom',
        horizontal_align='center',
        method='caption')
    subs = SubtitlesClip(f"experiments/{name}/subs.srt", make_textclip=generator)
    # Composite the subtitles over the original video and render the result
    video = VideoFileClip(input_file)
    final = CompositeVideoClip([video, subs])
    final.write_videofile(f'experiments/{name}/output.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")
    return f'experiments/{name}/output.mp4', df, f"experiments/{name}/subs.srt"
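
# Example direct call, bypassing the Gradio UI (hypothetical path, not from the original):
#   out_path, segments_df, srt_path = subtitle_video("my_video.mp4")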

with gr.Blocks() as demo:
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label='Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label='Output Video')
        output_subs = gr.Dataframe(label='Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label='Download subtitles as SRT file')
    gr.on(
        triggers=[run_button.click],
        fn=subtitle_video,
        inputs=[input_video],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )

if __name__ == "__main__":
    demo.launch(share=True)