# app.py - BlinkCaptions-like Auto Subtitle Generator for Hugging Face Spaces
# This Gradio app mimics Vozo.ai's BlinkCaptions: AI-powered one-click dynamic captions
# with high accuracy, Hinglish support, styles, emojis/highlights, and FFmpeg burning.
# Key steps (Blink-like):
# 1. Input handling: Upload video or YouTube URL download.
# 2. Audio extraction & validation (duration <10min, has audio).
# 3. Whisper transcription: Auto-detect Hinglish (no forced language), timestamps for sync.
#    - Lyrics mode: word-level timestamps for music-like precision.
#    - Trim: Skip very short segments (<0.5s).
# 4. Enhancements: Word emphasis (e.g., wrap "wow" in bold/color tags).
# 5. Translation: Optional to 120+ langs via deep-translator (stable, fixes googletrans errors).
# 6. ASS subtitle creation: Styled with fonts/colors/sizes/positions/animations/emojis.
# 7. Burn to video: FFmpeg overlays HD output, no watermark.
# 8. UI: Simple, free, viral-ready for Reels/YouTube.
# Deploy: Save as app.py, add requirements.txt, create HF Space (Gradio SDK).
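# For reference, a minimal requirements.txt matching the imports below is sketched here
# (versions unpinned; pin as needed for your Space):
#     gradio
#     transformers
#     torch
#     ffmpeg-python
#     yt-dlp
#     deep-translator
# The Space also needs the ffmpeg binary itself (e.g. listed in packages.txt).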
import os
import re
import tempfile
import gradio as gr
from transformers import pipeline
import torch
import ffmpeg
from yt_dlp import YoutubeDL
from deep_translator import GoogleTranslator # Stable replacement for googletrans
# Model options (lighter for speed)
MODEL_CHOICES = {
    "Base (fast)": "openai/whisper-base",
    "Small": "openai/whisper-small",
    "Medium": "openai/whisper-medium",
    "Large-v3 (accurate, multilingual)": "openai/whisper-large-v3",
}
# Style options (200+ simulated via combos; Blink-like: Montserrat/Arial, colors, etc.)
FONTS = ["Arial", "Montserrat"] # Add more if custom fonts available
COLORS = ["white", "yellow", "black"]
SIZES = ["small (24)", "medium (32)", "large (40)"]
POSITIONS = ["bottom", "top"]
LANGUAGES = ["en", "hi", "fr", "es"] # Sample; extend to 120+ with deep-translator
def download_youtube(url, progress=gr.Progress()):
    """Download a YouTube video with yt-dlp (Blink-like: social input support)."""
    progress(0, desc="Downloading video...")
    # Use a persistent temp dir: a TemporaryDirectory context manager would delete the
    # file before the caller could use it. Write directly into it instead of os.chdir.
    tmpdir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best[height<=720]/best',  # HD but not too heavy
        'outtmpl': os.path.join(tmpdir, '%(title)s.%(ext)s'),
        'noplaylist': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    files = [f for f in os.listdir(tmpdir) if f.endswith(('.mp4', '.mkv', '.webm'))]
    if files:
        video_path = os.path.join(tmpdir, files[0])
        progress(1, desc="Download complete!")
        return video_path
    raise ValueError("Download failed!")
def extract_audio(video_path):
    """Extract audio from video using FFmpeg (prep for Whisper)."""
    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
    stream = ffmpeg.input(video_path)
    stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ac=1, ar=16000)  # 16 kHz mono PCM, Whisper's expected format
    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    return audio_path
def get_video_duration(video_path):
    """Get duration in seconds via FFmpeg probe (used for the length limit check)."""
    probe = ffmpeg.probe(video_path)
    # The container-level duration is more reliable than the first stream's metadata.
    return float(probe['format'].get('duration', 0))
def has_audio(video_path):
    """Check whether the video has an audio stream."""
    probe = ffmpeg.probe(video_path)
    return any(s['codec_type'] == 'audio' for s in probe['streams'])
def transcribe_audio(audio_path, model_name, lyrics_mode, progress=gr.Progress()):
    """Whisper transcription (Blink-like: high accuracy, Hinglish auto-detect, lyrics mode)."""
    progress(0, desc="Loading Whisper model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        chunk_length_s=30,
        batch_size=8,
    )
    # Transcribe task; leaving the language unset lets Whisper auto-detect Hindi/English mixing.
    generate_kwargs = {"task": "transcribe"}
    # Lyrics mode asks the pipeline for word-level timestamps; otherwise segment-level.
    return_timestamps = "word" if lyrics_mode else True
    progress(0.5, desc="Transcribing...")
    result = pipe(audio_path, return_timestamps=return_timestamps, generate_kwargs=generate_kwargs)
    # The transformers ASR pipeline returns 'chunks', each {'text': ..., 'timestamp': (start, end)}.
    # Normalize to {'text', 'start', 'end'} and drop very short segments (<0.5s).
    segments = []
    for chunk in result.get('chunks', []):
        start, end = chunk.get('timestamp', (None, None))
        if start is None or end is None:
            continue
        if (end - start) > 0.5:
            segments.append({'text': chunk['text'], 'start': start, 'end': end})
    progress(1, desc="Transcription complete!")
    return segments  # List of {'text': , 'start': , 'end': }
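# Example of a normalized segment as consumed by the subtitle builders below
# (illustrative values only):
#   {'text': ' Hello world!', 'start': 1.5, 'end': 3.2}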
def translate_text(text, target_lang):
    """Optional translation (Blink-like: 120+ languages via deep-translator)."""
    if target_lang == "en":  # No translation needed
        return text
    try:
        # Instantiate per call for stability (source language auto-detected)
        translator = GoogleTranslator(source='auto', target=target_lang)
        return translator.translate(text)
    except Exception:
        return text  # Fall back to the original text on error
def create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress=gr.Progress()):
    """Create ASS subtitles (styled like Blink: fonts/colors/emojis/highlights/animations)."""
    progress(0, desc="Generating styled subtitles...")
    # Map sizes
    size_map = {"small (24)": 24, "medium (32)": 32, "large (40)": 40}
    fontsize = size_map[size]
    # ASS colours are &HBBGGRR& (blue-green-red order), so yellow is &H00FFFF&
    color_map = {"white": "&HFFFFFF&", "yellow": "&H00FFFF&", "black": "&H000000&"}
    subtitle_color = color_map[color]
    # Position: bottom/top alignment (ASS numpad-style alignment codes)
    alignment = 2 if position == "bottom" else 8

    def format_time(seconds):
        """Format seconds as an ASS timestamp, H:MM:SS.cc (centiseconds)."""
        h = int(seconds // 3600)
        m = int((seconds % 3600) // 60)
        s = int(seconds % 60)
        cs = int((seconds % 1) * 100)
        return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

    # ASS header (V4+ Styles for Blink-like customizations)
    ass_content = """[Script Info]
Title: Generated Subtitles
ScriptType: v4.00+

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{fontsize},{subtitle_color},&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,{alignment},10,10,10,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(font=font, fontsize=fontsize, subtitle_color=subtitle_color, alignment=alignment)
    # Build events with enhancements
    for seg in segments:
        start = format_time(seg['start'])
        end = format_time(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)
        # Word emphasis/highlights (case-insensitive match; a replacement function avoids
        # re.sub treating the ASS override tags as backslash escapes)
        for word in emphasis_words:
            pattern = re.compile(re.escape(word), re.IGNORECASE)
            text = pattern.sub(
                lambda m: f"{{\\b1\\c&H0000FF&}}{m.group(0)}{{\\b0\\c{subtitle_color}}}",  # Red bold, then restore style colour
                text,
                count=1,  # Limit to the first occurrence
            )
        # Add emoji example (Blink-like: one-click emojis)
        if "!" in text:
            text += " 😎"  # Simple auto-add
        # Basic fade-in/out animation (Blink dynamic)
        text = f"{{\\fad(200,200)}}{text}"
        # Dialogue event
        ass_content += f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}\n"
    progress(1, desc="ASS file ready!")
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ass', delete=False, encoding='utf-8') as f:
        f.write(ass_content)
    return f.name
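# For illustration, a Dialogue line produced above might look like this
# (hypothetical segment from 1.5s to 3.2s, default style, auto emoji on "!"):
#   Dialogue: 0,0:00:01.50,0:00:03.20,Default,,0,0,0,,{\fad(200,200)}Hello world! 😎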
def burn_subtitles(video_path, ass_path, progress=gr.Progress()):
    """Burn ASS subtitles into the video with FFmpeg (Blink-like: HD export, no watermark)."""
    progress(0, desc="Burning subtitles...")
    output_path = video_path.rsplit('.', 1)[0] + '_subtitled.mp4'
    # Apply the subtitles filter to the video stream only and carry the original audio through;
    # no force_style override, so the styles chosen in the UI are respected.
    source = ffmpeg.input(video_path)
    video = source.video.filter('subtitles', ass_path)
    audio = source.audio
    stream = ffmpeg.output(video, audio, output_path, vcodec='libx264', acodec='aac', preset='fast')  # HD
    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    progress(1, desc="Video ready!")
    return output_path
def main_process(
    video_file, yt_url, model_choice, lyrics_mode, target_lang,
    font, color, size, position, emphasis_words_str,
    progress=gr.Progress()
):
    """Main pipeline (Blink one-click flow)."""
    if video_file is None and not yt_url:
        raise gr.Error("Upload a video or enter a YouTube URL!")
    if video_file:
        # gr.File may return a path string or a tempfile wrapper depending on the Gradio version
        video_path = video_file if isinstance(video_file, str) else video_file.name
    else:
        video_path = download_youtube(yt_url, progress)
    # Validation (Blink error handling)
    duration = get_video_duration(video_path)
    if duration > 600:  # 10 min
        raise gr.Error("Video too long! Limit to 10 minutes for demo.")
    if not has_audio(video_path):
        raise gr.Error("No audio detected!")
    # Extract audio
    audio_path = extract_audio(video_path)
    # Transcribe
    model_name = MODEL_CHOICES[model_choice]
    segments = transcribe_audio(audio_path, model_name, lyrics_mode, progress)
    if not segments:
        raise gr.Error("No speech detected!")
    # Emphasis words
    emphasis_words = [w.strip() for w in emphasis_words_str.split(',') if w.strip()] if emphasis_words_str else []
    # Create styled ASS
    ass_path = create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress)
    # Burn
    output_video = burn_subtitles(video_path, ass_path, progress)
    # Also save an SRT for download (simple conversion from segments)
    srt_content = ""
    for i, seg in enumerate(segments, 1):
        start = f"{int(seg['start']//3600):02d}:{int((seg['start']%3600)//60):02d}:{int(seg['start']%60):02d},{int((seg['start']%1)*1000):03d}"
        end = f"{int(seg['end']//3600):02d}:{int((seg['end']%3600)//60):02d}:{int(seg['end']%60):02d},{int((seg['end']%1)*1000):03d}"
        text = translate_text(seg['text'].strip(), target_lang)
        srt_content += f"{i}\n{start} --> {end}\n{text}\n\n"
    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False, encoding='utf-8') as f:
        f.write(srt_content)
        srt_path = f.name
    # Preview thumbnail (grab one frame at the 1-second mark with FFmpeg)
    thumb_path = video_path.rsplit('.', 1)[0] + '_thumb.jpg'
    ffmpeg.input(video_path, ss=1).output(thumb_path, vframes=1).run(quiet=True, overwrite_output=True)
    return output_video, srt_path, thumb_path
# Gradio UI (Blink simple: upload, selectors, progress, downloads)
with gr.Blocks(title="BlinkCaptions Clone - Free Auto Subtitles") as demo:
    gr.Markdown("# 🚀 BlinkCaptions-like AI Subtitle Generator\nGenerate dynamic Hinglish captions with styles! Viral-ready for Reels/YouTube.")
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(label="Upload Video (MP4, <10min)", file_types=[".mp4", ".mkv", ".webm"])
            yt_input = gr.Textbox(label="Or YouTube URL", placeholder="https://youtube.com/watch?v=...")
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(choices=list(MODEL_CHOICES.keys()), value="Large-v3 (accurate, multilingual)", label="Model")
            lyrics_mode = gr.Checkbox(label="Lyrics Mode (for music)", value=False)
            target_lang = gr.Dropdown(choices=LANGUAGES, value="en", label="Translate To")
    with gr.Row():
        font = gr.Dropdown(choices=FONTS, value="Arial", label="Font")
        color = gr.Dropdown(choices=COLORS, value="white", label="Color")
        size = gr.Dropdown(choices=SIZES, value="medium (32)", label="Size")
        position = gr.Dropdown(choices=POSITIONS, value="bottom", label="Position")
    emphasis_input = gr.Textbox(label="Emphasize Words (comma-separated, e.g., wow,amazing)", placeholder="For highlights!")
    process_btn = gr.Button("Generate Captions!", variant="primary")
    with gr.Row():
        output_video = gr.File(label="Download Video with Subtitles")
        output_srt = gr.File(label="Download SRT Transcript")
        preview = gr.Image(label="Preview Thumbnail")
    # Progress is reported from inside main_process
    process_btn.click(
        main_process,
        inputs=[video_input, yt_input, model_choice, lyrics_mode, target_lang, font, color, size, position, emphasis_input],
        outputs=[output_video, output_srt, preview],
    )
    gr.Markdown("**Free & No Login!** Optimized for speed (CUDA auto). Built with ❤️ for creators.")
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)  # share=True isn't needed on HF Spaces