Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py - BlinkCaptions-like Auto Subtitle Generator for Hugging Face Spaces
|
| 2 |
+
# This Gradio app mimics Vozo.ai's BlinkCaptions: AI-powered one-click dynamic captions
|
| 3 |
+
# with high accuracy, Hinglish support, styles, emojis/highlights, and FFmpeg burning.
|
| 4 |
+
# Key steps (Blink-like):
|
| 5 |
+
# 1. Input handling: Upload video or YouTube URL download.
|
| 6 |
+
# 2. Audio extraction & validation (duration <10min, has audio).
|
| 7 |
+
# 3. Whisper transcription: Auto-detect Hinglish (language=None), timestamps for sync.
|
| 8 |
+
# - Lyrics mode: Enable word_timestamps for music-like precision.
|
| 9 |
+
# - Trim: Skip short/silent segments (<0.5s).
|
| 10 |
+
# 4. Enhancements: Word emphasis (e.g., wrap "wow" in bold/color tags).
|
| 11 |
+
# 5. Translation: Optional, via googletrans (argostranslate is a possible offline alternative; see the import notes below).
|
| 12 |
+
# 6. ASS subtitle creation: Styled with fonts/colors/sizes/positions/animations/emojis.
|
| 13 |
+
# 7. Burn to video: FFmpeg overlays HD output, no watermark.
|
| 14 |
+
# 8. UI: Simple, free, viral-ready for Reels/YouTube.
|
| 15 |
+
# Deploy: Save as app.py, add requirements.txt, create HF Space (Gradio SDK).
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import tempfile
|
| 19 |
+
import gradio as gr
|
| 20 |
+
from transformers import pipeline
|
| 21 |
+
import torch
|
| 22 |
+
import ffmpeg
|
| 23 |
+
from yt_dlp import YoutubeDL
|
| 24 |
+
from googletrans import Translator # Fallback to googletrans for simplicity (argos heavy for 120+ langs)
|
| 25 |
+
# Note: For argostranslate, uncomment below and pre-install packs in HF Space Dockerfile if needed.
|
| 26 |
+
# from argostranslate import package, translate
|
| 27 |
+
# package.update_package_index() # Run once
|
| 28 |
+
|
| 29 |
+
# Model options (lighter for speed)
|
| 30 |
+
# UI label -> Hugging Face model id for each selectable Whisper checkpoint.
MODEL_CHOICES = {
    label: f"openai/whisper-{variant}"
    for label, variant in (
        ("Base (fast)", "base"),
        ("Small", "small"),
        ("Medium", "medium"),
        ("Large-v3 (accurate, multilingual)", "large-v3"),
    )
}

# Caption style options offered in the UI dropdowns.
FONTS = [
    "Arial",
    "Montserrat",
]  # Add more if custom fonts are available
COLORS = [
    "white",
    "yellow",
    "black",
]
# Each size label embeds its point size, e.g. "small (24)".
SIZES = [
    f"{label} ({points})"
    for label, points in (("small", 24), ("medium", 32), ("large", 40))
]
POSITIONS = [
    "bottom",
    "top",
]
# Sample of translation targets; extend with more googletrans language codes.
LANGUAGES = [
    "en",
    "hi",
    "fr",
    "es",
]
|
| 43 |
+
|
| 44 |
+
# Module-level googletrans client, created once at import time and used by translate_text().
translator = Translator() # googletrans instance
|
| 45 |
+
|
| 46 |
+
def download_youtube(url, progress=gr.Progress()):
    """Download a YouTube video with yt-dlp and return the local file path.

    The video is saved into a fresh temporary directory that is NOT removed
    on return, so the returned path stays valid for the rest of the pipeline.
    (The previous version returned a path inside a ``TemporaryDirectory`` that
    was deleted as soon as the function exited, and it changed the process
    working directory to that soon-to-be-deleted folder.)

    Args:
        url: Video URL understood by yt-dlp.
        progress: Gradio progress callback.

    Returns:
        Path to the downloaded ``.mp4``/``.mkv``/``.webm`` file.

    Raises:
        ValueError: If yt-dlp finished without producing a video file.
    """
    import shutil

    progress(0, desc="Downloading video...")
    download_dir = tempfile.mkdtemp(prefix="yt_download_")
    ydl_opts = {
        'format': 'best[height<=720]/best',  # HD but not too heavy
        # Write straight into download_dir (no os.chdir) and name the file by
        # video id so odd characters in titles never reach the filesystem.
        'outtmpl': os.path.join(download_dir, '%(id)s.%(ext)s'),
        'noplaylist': True,
    }

    video_path = None
    try:
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        for name in sorted(os.listdir(download_dir)):
            if name.endswith(('.mp4', '.mkv', '.webm')):
                video_path = os.path.join(download_dir, name)
                break
    finally:
        # Only keep the directory when it actually holds a usable video.
        if video_path is None:
            shutil.rmtree(download_dir, ignore_errors=True)

    if video_path is None:
        raise ValueError("Download failed!")

    progress(1, desc="Download complete!")
    return video_path
|
| 64 |
+
|
| 65 |
+
def extract_audio(video_path):
    """Write the video's audio track to a sibling WAV file and return its path.

    The output is 16 kHz, mono, 16-bit PCM, the format fed to Whisper.
    """
    base_name = video_path.rsplit('.', 1)[0]
    wav_path = base_name + '.wav'
    job = ffmpeg.input(video_path).output(
        wav_path, acodec='pcm_s16le', ac=1, ar=16000
    )
    job.run(overwrite_output=True, quiet=True)
    return wav_path
|
| 72 |
+
|
| 73 |
+
def get_video_duration(video_path):
    """Return the media duration in seconds, or 0 if it cannot be determined.

    The container-level ``format.duration`` is preferred; per-stream durations
    are only a fallback because many containers (e.g. MKV/WebM) omit the
    ``duration`` field on individual streams, which made the previous
    ``streams[0]['duration']`` lookup raise ``KeyError``.
    """
    probe = ffmpeg.probe(video_path)
    candidates = [probe.get('format', {}).get('duration')]
    candidates.extend(s.get('duration') for s in probe.get('streams', []))
    for value in candidates:
        try:
            return float(value)
        except (TypeError, ValueError):
            # Missing field (None) or a non-numeric placeholder such as "N/A".
            continue
    return 0
|
| 77 |
+
|
| 78 |
+
def has_audio(video_path):
    """Return True when ffprobe reports at least one audio stream."""
    streams = ffmpeg.probe(video_path)['streams']
    for stream in streams:
        if stream['codec_type'] == 'audio':
            return True
    return False
|
| 82 |
+
|
| 83 |
+
def transcribe_audio(audio_path, model_name, lyrics_mode, progress=gr.Progress()):
    """Transcribe an audio file with a Whisper ASR pipeline.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_name: Hugging Face model id (a value of ``MODEL_CHOICES``).
        lyrics_mode: When true, request word-level timestamps instead of
            segment-level ones.
        progress: Gradio progress callback.

    Returns:
        List of ``{'text': str, 'start': float, 'end': float}`` dicts in the
        order produced by the pipeline.
    """
    progress(0, desc="Loading Whisper model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        chunk_length_s=30,
        batch_size=8,
    )

    # Transcribe (not translate) and let Whisper auto-detect the language so
    # mixed Hindi/English speech is handled.
    generate_kwargs = {"task": "transcribe", "language": None}
    # Word-level timing is selected through the pipeline's return_timestamps
    # argument; "word_timestamps" is not a generate() option.
    timestamp_mode = "word" if lyrics_mode else True

    progress(0.5, desc="Transcribing...")
    result = pipe(
        audio_path,
        return_timestamps=timestamp_mode,
        generate_kwargs=generate_kwargs,
    )

    segments = []
    for chunk in result.get('chunks', []):
        # The pipeline reports timing as chunk['timestamp'] == (start, end).
        start, end = chunk.get('timestamp') or (None, None)
        if start is None:
            continue
        if end is None:
            # The end of a chunk may be reported as None; the real end is not
            # known here, so assume a short fixed display time instead of
            # dropping the text.
            end = start + 2.0
        # Skip very short segment-level chunks (<= 0.5s). Individual words are
        # routinely shorter than that, so the trim is not applied in lyrics
        # mode, where it would discard most of the transcript.
        if not lyrics_mode and (end - start) <= 0.5:
            continue
        segments.append({'text': chunk.get('text', ''), 'start': start, 'end': end})

    progress(1, desc="Transcription complete!")
    return segments
|
| 110 |
+
|
| 111 |
+
def translate_text(text, target_lang):
    """Translate ``text`` to ``target_lang`` using the shared googletrans client.

    A target of "en" is treated as "no translation": the text is returned
    unchanged without contacting the translator.
    """
    needs_translation = target_lang != "en"
    if not needs_translation:
        return text
    translated = translator.translate(text, dest=target_lang)
    return translated.text
|
| 116 |
+
|
| 117 |
+
def create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress=gr.Progress()):
    """Write a styled ASS subtitle file for the segments and return its path.

    Args:
        segments: Iterable of dicts with 'text', 'start' and 'end' (seconds).
        font: Font name for the default style.
        color: One of "white", "yellow", "black".
        size: One of "small (24)", "medium (32)", "large (40)".
        position: "bottom" for bottom-centre captions; anything else is top-centre.
        emphasis_words: Words to render bold and highlighted (case-insensitive).
        target_lang: Language code passed to ``translate_text`` for each segment.
        progress: Gradio progress callback.

    Returns:
        Path to a UTF-8 ``.ass`` file. The caller is responsible for deleting it.

    Raises:
        KeyError: If ``size`` or ``color`` is not one of the supported values.
    """
    progress(0, desc="Generating styled subtitles...")

    size_map = {"small (24)": 24, "medium (32)": 32, "large (40)": 40}
    fontsize = size_map[size]

    # ASS colours are written blue-green-red, so yellow (R+G) is 00FFFF.
    bgr_map = {"white": "FFFFFF", "yellow": "00FFFF", "black": "000000"}
    bgr = bgr_map[color]
    subtitle_color = "&H00" + bgr  # style fields use &HAABBGGRR
    restore_colour_tag = "\\c&H" + bgr + "&"  # override tags use &HBBGGRR&

    # ASS alignment codes: 2 = bottom-centre, 8 = top-centre.
    alignment = 2 if position == "bottom" else 8

    ass_content = """[Script Info]
Title: Generated Subtitles
ScriptType: v4.00+
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{fontsize},{subtitle_color},&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,{alignment},10,10,10,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(font=font, fontsize=fontsize, subtitle_color=subtitle_color, alignment=alignment)

    events = []
    for seg in segments:
        start = _ass_timestamp(seg['start'])
        end = _ass_timestamp(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)
        # A raw line break would terminate the Dialogue line; ASS uses \N.
        text = text.replace("\r", "").replace("\n", "\\N")

        text = _highlight_words(text, emphasis_words, restore_colour_tag)

        # Simple automatic emoji on exclamations.
        if "!" in text:
            text += " 😎"

        # 200 ms fade-in and fade-out.
        text = "{\\fad(200,200)}" + text

        events.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}\n")

    ass_content += "".join(events)

    progress(1, desc="ASS file ready!")
    # Explicit UTF-8: the text can contain emoji and non-Latin scripts.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ass', delete=False, encoding='utf-8') as f:
        f.write(ass_content)
    return f.name


def _ass_timestamp(seconds):
    """Format seconds as an ASS ``H:MM:SS.cc`` (centisecond) timestamp."""
    total_cs = max(0, int(round(seconds * 100)))
    hours, rest = divmod(total_cs, 360000)
    minutes, rest = divmod(rest, 6000)
    secs, centis = divmod(rest, 100)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centis:02d}"


def _highlight_words(text, words, restore_colour_tag):
    """Wrap each occurrence of the given words in bold + highlight override tags."""
    import re

    cleaned = [w.strip() for w in words if w and w.strip()]
    if not cleaned:
        return text
    # One pass over the text with a single alternation (longest first) so tags
    # inserted for one word can never be matched again by another word.
    cleaned.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(w) for w in cleaned), re.IGNORECASE)
    # NOTE(review): &HFF0000& is blue in ASS BGR order; confirm the intended
    # highlight colour.
    open_tag = "{\\b1\\c&HFF0000&}"
    # Restore both weight and colour so the rest of the line is unaffected.
    close_tag = "{\\b0" + restore_colour_tag + "}"
    return pattern.sub(lambda m: open_tag + m.group(0) + close_tag, text)
|
| 169 |
+
|
| 170 |
+
def burn_subtitles(video_path, ass_path, progress=gr.Progress()):
    """Render the ASS subtitles onto the video and return the output path.

    The output is written next to the input as ``<name>_subtitled.mp4``
    (H.264 video, AAC audio).

    Two defects of the previous version are fixed here:
    * ``force_style`` pinned the font and colour, silently overriding the
      style chosen by the user and stored in the ASS file.
    * The filter was applied to the whole input, so only the filtered video
      stream reached the output and the audio track was lost. The audio
      stream is now passed through explicitly.
    """
    progress(0, desc="Burning subtitles...")
    output_path = video_path.rsplit('.', 1)[0] + '_subtitled.mp4'

    source = ffmpeg.input(video_path)
    captioned = source.video.filter('subtitles', ass_path)

    streams = [captioned]
    # Mapping a non-existent audio stream makes FFmpeg fail, so only add it
    # when the input actually has one.
    if has_audio(video_path):
        streams.append(source.audio)

    job = ffmpeg.output(*streams, output_path, vcodec='libx264', acodec='aac', preset='fast')
    ffmpeg.run(job, overwrite_output=True, quiet=True)

    progress(1, desc="Video ready!")
    return output_path
|
| 183 |
+
|
| 184 |
+
def main_process(
    video_file, yt_url, model_choice, lyrics_mode, target_lang,
    font, color, size, position, emphasis_words_str,
    progress=gr.Progress()
):
    """Run the full captioning pipeline for one uploaded or downloaded video.

    Steps: resolve the input video, validate it (max 10 minutes, must have
    audio), extract audio, transcribe, build styled ASS subtitles, burn them
    into the video, and write an SRT transcript plus a preview thumbnail.

    Args:
        video_file: Uploaded file from ``gr.File`` (a path string or an object
            with a ``.name`` path), or None.
        yt_url: YouTube URL text; used only when no file was uploaded.
        model_choice: Key of ``MODEL_CHOICES``.
        lyrics_mode: Request word-level timestamps.
        target_lang: Language code for subtitle translation.
        font, color, size, position: Caption style selections.
        emphasis_words_str: Comma-separated words to highlight.
        progress: Gradio progress callback.

    Returns:
        Tuple ``(subtitled_video_path, srt_path, thumbnail_path)``.

    Raises:
        gr.Error: For missing input, over-long video, no audio, or no speech.
    """
    # An empty Textbox yields "" rather than None, so test truthiness.
    yt_url = (yt_url or "").strip()
    if not video_file and not yt_url:
        raise gr.Error("Upload a video or enter a YouTube URL!")

    if video_file:
        # gr.File may hand over a plain path string or a file-like wrapper.
        video_path = getattr(video_file, "name", video_file)
    else:
        video_path = download_youtube(yt_url, progress)

    # Validation
    duration = get_video_duration(video_path)
    if duration > 600:  # 10 min
        raise gr.Error("Video too long! Limit to 10 minutes for demo.")
    if not has_audio(video_path):
        raise gr.Error("No audio detected!")

    # Extract audio and transcribe; the WAV is only needed for this step.
    audio_path = extract_audio(video_path)
    try:
        model_name = MODEL_CHOICES[model_choice]
        segments = transcribe_audio(audio_path, model_name, lyrics_mode, progress)
    finally:
        if audio_path != video_path:
            _remove_quietly(audio_path)
    if not segments:
        raise gr.Error("No speech detected!")

    # "wow, amazing," -> ["wow", "amazing"]: drop blanks and padding.
    emphasis_words = [
        word.strip()
        for word in (emphasis_words_str or "").split(',')
        if word.strip()
    ]

    # Create styled ASS and burn it in; the ASS file is temporary.
    ass_path = create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress)
    try:
        output_video = burn_subtitles(video_path, ass_path, progress)
    finally:
        _remove_quietly(ass_path)

    # Also save an SRT transcript for download.
    srt_entries = []
    for index, seg in enumerate(segments, 1):
        start = _srt_timestamp(seg['start'])
        end = _srt_timestamp(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)
        srt_entries.append(f"{index}\n{start} --> {end}\n{text}\n\n")
    # Explicit UTF-8 so non-Latin transcripts are written correctly.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False, encoding='utf-8') as f:
        f.write("".join(srt_entries))
        srt_path = f.name

    # Preview thumbnail: grab one frame at 1s, or at the start of shorter clips.
    thumb_path = video_path.rsplit('.', 1)[0] + '_thumb.jpg'
    thumb_offset = 1 if duration > 1 else 0
    ffmpeg.input(video_path, ss=thumb_offset).output(thumb_path, vframes=1).run(
        overwrite_output=True, quiet=True
    )

    return output_video, srt_path, thumb_path


def _srt_timestamp(seconds):
    """Format seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    total_ms = max(0, int(round(seconds * 1000)))
    hours, rest = divmod(total_ms, 3600000)
    minutes, rest = divmod(rest, 60000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _remove_quietly(path):
    """Best-effort removal of a temporary file; a failure is not fatal."""
    try:
        os.remove(path)
    except OSError:
        pass
|
| 240 |
+
|
| 241 |
+
# Gradio UI (Blink simple: upload, selectors, progress, downloads)
|
| 242 |
+
with gr.Blocks(title="BlinkCaptions Clone - Free Auto Subtitles") as demo:
    # Page heading.
    gr.Markdown("# 🚀 BlinkCaptions-like AI Subtitle Generator\nGenerate dynamic Hinglish captions with styles! Viral-ready for Reels/YouTube.")

    # Row 1: video source on the left, transcription options on the right.
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(
                label="Upload Video (MP4, <10min)",
                file_types=[".mp4", ".mkv", ".webm"],
            )
            yt_input = gr.Textbox(
                label="Or YouTube URL",
                placeholder="https://youtube.com/watch?v=...",
            )
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                label="Model",
                choices=list(MODEL_CHOICES),
                value="Large-v3 (accurate, multilingual)",
            )
            lyrics_mode = gr.Checkbox(value=False, label="Lyrics Mode (for music)")
            target_lang = gr.Dropdown(label="Translate To", choices=LANGUAGES, value="en")

    # Row 2: caption styling.
    with gr.Row():
        font = gr.Dropdown(label="Font", choices=FONTS, value="Arial")
        color = gr.Dropdown(label="Color", choices=COLORS, value="white")
        size = gr.Dropdown(label="Size", choices=SIZES, value="medium (32)")
        position = gr.Dropdown(label="Position", choices=POSITIONS, value="bottom")

    emphasis_input = gr.Textbox(
        label="Emphasize Words (comma-separated, e.g., wow,amazing)",
        placeholder="For highlights!",
    )

    process_btn = gr.Button("Generate Captions!", variant="primary")

    # Row 3: results.
    with gr.Row():
        output_video = gr.File(label="Download Video with Subtitles")
        output_srt = gr.File(label="Download SRT Transcript")
        preview = gr.Image(label="Preview Thumbnail")

    # Wire the button to the pipeline; the order of inputs matches the
    # positional parameters of main_process.
    process_btn.click(
        fn=main_process,
        inputs=[
            video_input,
            yt_input,
            model_choice,
            lyrics_mode,
            target_lang,
            font,
            color,
            size,
            position,
            emphasis_input,
        ],
        outputs=[output_video, output_srt, preview],
    )

    gr.Markdown("**Free & No Login!** Optimized for speed (CUDA auto). Built with ❤️ for creators.")
|
| 277 |
+
|
| 278 |
+
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 (for HF Spaces). share=True is
    # dropped: Gradio does not support share links on Spaces, and when run
    # locally it would open a public tunnel to this machine.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|