# app.py - BlinkCaptions-like Auto Subtitle Generator for Hugging Face Spaces
# This Gradio app mimics Vozo.ai's BlinkCaptions: AI-powered one-click dynamic captions
# with high accuracy, Hinglish support, styles, emojis/highlights, and FFmpeg burning.
# Key steps (Blink-like):
# 1. Input handling: Upload video or YouTube URL download.
# 2. Audio extraction & validation (duration <10min, has audio).
# 3. Whisper transcription: Auto-detect Hinglish (language=None), timestamps for sync.
# - Lyrics mode: word-level timestamps (return_timestamps="word") for music-like precision.
# - Trim: Skip short/silent segments (<0.5s).
# 4. Enhancements: Word emphasis (e.g., wrap "wow" in bold/color tags).
# 5. Translation: Optional to 100+ languages via googletrans (argostranslate is an offline alternative).
# 6. ASS subtitle creation: Styled with fonts/colors/sizes/positions/animations/emojis.
# 7. Burn to video: FFmpeg overlays HD output, no watermark.
# 8. UI: Simple, free, viral-ready for Reels/YouTube.
# Deploy: Save as app.py, add requirements.txt, create HF Space (Gradio SDK).
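#
# A minimal requirements.txt sketch matching the imports below (unpinned on
# purpose; pin versions once you know what your Space's image provides):
#   gradio
#   transformers
#   torch
#   ffmpeg-python
#   yt-dlp
#   googletrans==4.0.0rc1
# plus the ffmpeg binary itself (e.g., list "ffmpeg" in packages.txt so the
# Space installs it via apt).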
import os
import tempfile
import gradio as gr
from transformers import pipeline
import torch
import ffmpeg
from yt_dlp import YoutubeDL
from googletrans import Translator  # lightweight fallback; argostranslate's 100+ offline packs are heavy to pre-install
# Note: For argostranslate, uncomment below and pre-install packs in HF Space Dockerfile if needed.
# from argostranslate import package, translate
# package.update_package_index() # Run once
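# A hedged sketch of the argostranslate route (package/translate APIs as
# documented by argostranslate; the en->hi pack is just an example choice):
# available = package.get_available_packages()
# pkg = next(p for p in available if p.from_code == "en" and p.to_code == "hi")
# package.install_from_path(pkg.download())
# print(translate.translate("Hello world", "en", "hi"))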
# Model options (lighter for speed)
MODEL_CHOICES = {
"Base (fast)": "openai/whisper-base",
"Small": "openai/whisper-small",
"Medium": "openai/whisper-medium",
"Large-v3 (accurate, multilingual)": "openai/whisper-large-v3"
}
# Style options (Blink-like presets built from font/color/size/position combos; extend the lists for more)
FONTS = ["Arial", "Montserrat"] # Add more if custom fonts available
COLORS = ["white", "yellow", "black"]
SIZES = ["small (24)", "medium (32)", "large (40)"]
POSITIONS = ["bottom", "top"]
LANGUAGES = ["en", "hi", "fr", "es"] # Sample; extend to 120+ with googletrans
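# googletrans exposes its full code->name map, so the sample list above can be
# widened without hardcoding (a sketch using googletrans's LANGUAGES dict):
# from googletrans import LANGUAGES as GOOGLE_LANGS
# LANGUAGES = sorted(GOOGLE_LANGS.keys())  # ~100+ ISO codes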
translator = Translator() # googletrans instance
def download_youtube(url, progress=gr.Progress()):
    """Download a YouTube video with yt-dlp (Blink-like: social input support)."""
    progress(0, desc="Downloading video...")
    # Use a persistent temp dir: a TemporaryDirectory context manager would
    # delete the downloaded file as soon as this function returns.
    tmpdir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best[height<=720]/best',  # HD but not too heavy
        'outtmpl': os.path.join(tmpdir, '%(title)s.%(ext)s'),
        'noplaylist': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    files = [f for f in os.listdir(tmpdir) if f.endswith(('.mp4', '.mkv', '.webm'))]
    if files:
        progress(1, desc="Download complete!")
        return os.path.join(tmpdir, files[0])
    raise ValueError("Download failed!")
def extract_audio(video_path):
"""Extract audio from video using FFmpeg (prep for Whisper)."""
audio_path = video_path.rsplit('.', 1)[0] + '.wav'
stream = ffmpeg.input(video_path)
stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ac=1, ar=16000) # Whisper format
ffmpeg.run(stream, overwrite_output=True, quiet=True)
return audio_path
def get_video_duration(video_path):
    """Get duration in seconds via FFmpeg probe (for the length-limit check)."""
    probe = ffmpeg.probe(video_path)
    # The 'format' section reports duration reliably; per-stream duration is
    # missing for some containers.
    return float(probe['format'].get('duration', 0))
def has_audio(video_path):
"""Check if video has audio stream."""
probe = ffmpeg.probe(video_path)
return any(s['codec_type'] == 'audio' for s in probe['streams'])
def transcribe_audio(audio_path, model_name, lyrics_mode, progress=gr.Progress()):
    """Whisper transcription (Blink-like: 98%+ accuracy, Hinglish auto-detect, lyrics mode)."""
    progress(0, desc="Loading Whisper model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        chunk_length_s=30,
        batch_size=8,
    )
    # task=transcribe with language=None lets Whisper auto-detect, which is
    # what handles Hindi/English code-switching (Hinglish).
    generate_kwargs = {"task": "transcribe", "language": None}
    # Word-level timestamps are requested via return_timestamps="word" on the
    # pipeline call (not a generate_kwargs flag); use them in lyrics mode.
    timestamps = "word" if lyrics_mode else True
    progress(0.5, desc="Transcribing...")
    result = pipe(audio_path, return_timestamps=timestamps, generate_kwargs=generate_kwargs)
    # The pipeline returns chunks as {'text': ..., 'timestamp': (start, end)};
    # flatten to start/end keys and trim segments shorter than 0.5 s.
    segments = []
    for chunk in result.get('chunks', []):
        start, end = chunk['timestamp']
        if end is None:  # the final chunk can have an open-ended timestamp
            continue
        if (end - start) > 0.5:
            segments.append({'text': chunk['text'], 'start': start, 'end': end})
    progress(1, desc="Transcription complete!")
    return segments  # List of {'text': ..., 'start': ..., 'end': ...}
def translate_text(text, target_lang):
    """Optional translation (Blink-like: 100+ languages via googletrans)."""
    if target_lang == "en":  # "en" is treated as "keep the original text"
        return text
    return translator.translate(text, dest=target_lang).text
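# ASS "Dialogue" lines expect H:MM:SS.cc timestamps (centisecond precision),
# not raw seconds, so a small helper converts Whisper's float seconds. A
# minimal sketch; the helper name is ours, not from any library.
def format_ass_time(seconds):
    """Format float seconds as the H:MM:SS.cc timestamp ASS requires."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    return f"{hours}:{minutes:02d}:{secs:05.2f}"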
def create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress=gr.Progress()):
"""Create ASS subtitles (styled like Blink: fonts/colors/emojis/highlights/animations)."""
progress(0, desc="Generating styled subtitles...")
# Map sizes
size_map = {"small (24)": 24, "medium (32)": 32, "large (40)": 40}
fontsize = size_map[size]
    # Colors as ASS BGR hex: note yellow is &H00FFFF& in BGR (&HFFFF00& would be cyan)
    color_map = {"white": "&HFFFFFF&", "yellow": "&H00FFFF&", "black": "&H000000&"}
subtitle_color = color_map[color]
# Position: bottom/top alignment
alignment = 2 if position == "bottom" else 8 # ASS alignment codes
# ASS header (V4+ Styles for Blink-like customizations)
ass_content = """[Script Info]
Title: Generated Subtitles
ScriptType: v4.00+
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{fontsize},{subtitle_color},&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,{alignment},10,10,10,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(font=font, fontsize=fontsize, subtitle_color=subtitle_color, alignment=alignment)
    # Build events with enhancements
    for seg in segments:
        start = format_ass_time(seg['start'])
        end = format_ass_time(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)
        # Word emphasis/highlights (e.g., make "wow" pop with bold + red)
        for word in emphasis_words:
            if word.lower() in text.lower():
                # \c takes a BGR value, so red is &H0000FF&; reset bold/colour after the word
                text = text.replace(word, "{\\b1\\c&H0000FF&}" + word + "{\\b0\\c" + subtitle_color + "}")
        # Add emoji example (Blink-like: one-click emojis)
        if "!" in text:
            text += " 🔥"  # Simple auto-add
        # Basic fade-in/out animation (Blink dynamic)
        text = f"{{\\fad(200,200)}}{text}"
        # Dialogue event
        ass_content += f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}\n"
progress(1, desc="ASS file ready!")
with tempfile.NamedTemporaryFile(mode='w', suffix='.ass', delete=False) as f:
f.write(ass_content)
return f.name
def burn_subtitles(video_path, ass_path, progress=gr.Progress()):
    """Burn the ASS file into the video with FFmpeg (Blink-like: HD export, no watermark)."""
    progress(0, desc="Burning subtitles...")
    output_path = video_path.rsplit('.', 1)[0] + '_subtitled.mp4'
    # Apply the subtitles filter to the video stream only and map the original
    # audio explicitly; filtering the combined stream would drop the audio
    # track. No force_style override here, since it would clobber the user's
    # chosen styles already baked into the ASS file.
    inp = ffmpeg.input(video_path)
    video = inp.video.filter('subtitles', ass_path)
    stream = ffmpeg.output(video, inp.audio, output_path, vcodec='libx264', acodec='aac', preset='fast')  # HD
    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    progress(1, desc="Video ready!")
    return output_path
def main_process(
video_file, yt_url, model_choice, lyrics_mode, target_lang,
font, color, size, position, emphasis_words_str,
progress=gr.Progress()
):
"""Main pipeline (Blink one-click flow)."""
    if video_file is None and not yt_url:  # the URL textbox yields "", not None
        raise gr.Error("Upload a video or enter a YouTube URL!")
    if video_file:
        # gr.File may hand back a tempfile wrapper or a plain path depending on the Gradio version
        video_path = video_file.name if hasattr(video_file, 'name') else video_file
    else:
        video_path = download_youtube(yt_url, progress)
# Validation (Blink error handling)
duration = get_video_duration(video_path)
if duration > 600: # 10 min
raise gr.Error("Video too long! Limit to 10 minutes for demo.")
if not has_audio(video_path):
raise gr.Error("No audio detected!")
# Extract audio
audio_path = extract_audio(video_path)
# Transcribe
model_name = MODEL_CHOICES[model_choice]
segments = transcribe_audio(audio_path, model_name, lyrics_mode, progress)
if not segments:
raise gr.Error("No speech detected!")
    # Emphasis words (strip whitespace so "wow, amazing" works as well)
    emphasis_words = [w.strip() for w in emphasis_words_str.split(',') if w.strip()] if emphasis_words_str else []
# Create styled ASS
ass_path = create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress)
# Burn
output_video = burn_subtitles(video_path, ass_path, progress)
# Also save SRT for download (simple conversion from segments)
srt_content = ""
for i, seg in enumerate(segments, 1):
start = f"{int(seg['start']//3600):02d}:{int((seg['start']%3600)//60):02d}:{int(seg['start']%60):02d},{int((seg['start']%1)*1000):03d}"
end = f"{int(seg['end']//3600):02d}:{int((seg['end']%3600)//60):02d}:{int(seg['end']%60):02d},{int((seg['end']%1)*1000):03d}"
text = translate_text(seg['text'].strip(), target_lang)
srt_content += f"{i}\n{start} --> {end}\n{text}\n\n"
with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False) as f:
f.write(srt_content)
srt_path = f.name
    # Preview thumbnail (grab a single frame at t=1s with FFmpeg)
    thumb_path = video_path.rsplit('.', 1)[0] + '_thumb.jpg'
    ffmpeg.input(video_path, ss=1).output(thumb_path, vframes=1).run(overwrite_output=True, quiet=True)
return output_video, srt_path, thumb_path
# Gradio UI (Blink simple: upload, selectors, progress, downloads)
with gr.Blocks(title="BlinkCaptions Clone - Free Auto Subtitles") as demo:
gr.Markdown("# π BlinkCaptions-like AI Subtitle Generator\nGenerate dynamic Hinglish captions with styles! Viral-ready for Reels/YouTube.")
with gr.Row():
with gr.Column(scale=1):
video_input = gr.File(label="Upload Video (MP4, <10min)", file_types=[".mp4", ".mkv", ".webm"])
yt_input = gr.Textbox(label="Or YouTube URL", placeholder="https://youtube.com/watch?v=...")
with gr.Column(scale=1):
model_choice = gr.Dropdown(choices=list(MODEL_CHOICES.keys()), value="Large-v3 (accurate, multilingual)", label="Model")
lyrics_mode = gr.Checkbox(label="Lyrics Mode (for music)", value=False)
target_lang = gr.Dropdown(choices=LANGUAGES, value="en", label="Translate To")
with gr.Row():
font = gr.Dropdown(choices=FONTS, value="Arial", label="Font")
color = gr.Dropdown(choices=COLORS, value="white", label="Color")
size = gr.Dropdown(choices=SIZES, value="medium (32)", label="Size")
position = gr.Dropdown(choices=POSITIONS, value="bottom", label="Position")
emphasis_input = gr.Textbox(label="Emphasize Words (comma-separated, e.g., wow,amazing)", placeholder="For highlights!")
process_btn = gr.Button("Generate Captions!", variant="primary")
with gr.Row():
output_video = gr.File(label="Download Video with Subtitles")
output_srt = gr.File(label="Download SRT Transcript")
preview = gr.Image(label="Preview Thumbnail")
# Progress integrated in fn
process_btn.click(
main_process,
inputs=[video_input, yt_input, model_choice, lyrics_mode, target_lang, font, color, size, position, emphasis_input],
outputs=[output_video, output_srt, preview]
)
gr.Markdown("**Free & No Login!** Optimized for speed (CUDA auto). Built with β€οΈ for creators.")
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)  # HF Spaces serves on 0.0.0.0:7860; share=True isn't needed there