Realmeas committed (verified)
Commit b847dcb · 1 parent: 1bd8ed9

Create app.py

Files changed (1): app.py (+279, -0)
app.py ADDED
@@ -0,0 +1,279 @@
# app.py - BlinkCaptions-like Auto Subtitle Generator for Hugging Face Spaces
# This Gradio app mimics Vozo.ai's BlinkCaptions: AI-powered one-click dynamic captions
# with high accuracy, Hinglish support, styles, emojis/highlights, and FFmpeg burning.
# Key steps (Blink-like):
# 1. Input handling: Upload a video or download from a YouTube URL.
# 2. Audio extraction & validation (duration <10 min, has audio).
# 3. Whisper transcription: Auto-detect Hinglish (language=None), timestamps for sync.
#    - Lyrics mode: Enable word-level timestamps for music-like precision.
#    - Trim: Skip short/silent segments (<0.5s).
# 4. Enhancements: Word emphasis (e.g., wrap "wow" in bold/color tags).
# 5. Translation: Optional, via googletrans by default (argostranslate is an alternative
#    for 120+ langs; pre-install its packs if you switch).
# 6. ASS subtitle creation: Styled with fonts/colors/sizes/positions/animations/emojis.
# 7. Burn to video: FFmpeg overlays subtitles on an HD output, no watermark.
# 8. UI: Simple, free, viral-ready for Reels/YouTube.
# Deploy: Save as app.py, add requirements.txt, create an HF Space (Gradio SDK).

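# A plausible requirements.txt for this app (package names match the imports below;
# the pinned googletrans version is an assumption -- pin whatever combination works
# in your Space, and note FFmpeg itself comes via packages.txt/apt on HF Spaces):
#   gradio
#   transformers
#   torch
#   ffmpeg-python
#   yt-dlp
#   googletrans==3.1.0a0
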
import os
import tempfile
import gradio as gr
from transformers import pipeline
import torch
import ffmpeg
from yt_dlp import YoutubeDL
from googletrans import Translator  # Fallback to googletrans for simplicity (argostranslate is heavy for 120+ langs)
# Note: For argostranslate, uncomment below and pre-install packs in the HF Space Dockerfile if needed.
# from argostranslate import package, translate
# package.update_package_index()  # Run once

# Model options (lighter models for speed)
MODEL_CHOICES = {
    "Base (fast)": "openai/whisper-base",
    "Small": "openai/whisper-small",
    "Medium": "openai/whisper-medium",
    "Large-v3 (accurate, multilingual)": "openai/whisper-large-v3",
}

# Style options (200+ simulated via combos; Blink-like: Montserrat/Arial, colors, etc.)
FONTS = ["Arial", "Montserrat"]  # Add more if custom fonts are available
COLORS = ["white", "yellow", "black"]
SIZES = ["small (24)", "medium (32)", "large (40)"]
POSITIONS = ["bottom", "top"]
LANGUAGES = ["en", "hi", "fr", "es"]  # Sample; extend toward 120+ with googletrans

translator = Translator()  # googletrans instance

def download_youtube(url, progress=gr.Progress()):
    """Download a YouTube video using yt-dlp (Blink-like: social input support)."""
    progress(0, desc="Downloading video...")
    # Use a persistent temp dir: a TemporaryDirectory context manager would delete
    # the file before the returned path is ever used.
    tmpdir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best[height<=720]/best',  # HD but not too heavy
        'outtmpl': os.path.join(tmpdir, '%(title)s.%(ext)s'),  # avoids os.chdir side effects
        'noplaylist': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    files = [f for f in os.listdir(tmpdir) if f.endswith(('.mp4', '.mkv', '.webm'))]
    if files:
        progress(1, desc="Download complete!")
        return os.path.join(tmpdir, files[0])
    raise ValueError("Download failed!")

def extract_audio(video_path):
    """Extract audio from video using FFmpeg (prep for Whisper)."""
    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
    stream = ffmpeg.input(video_path)
    stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ac=1, ar=16000)  # Whisper format: 16 kHz mono PCM
    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    return audio_path

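# Roughly the equivalent CLI, for debugging the extraction outside Python:
#   ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ac 1 -ar 16000 input.wav
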
def get_video_duration(video_path):
    """Get duration in seconds using ffprobe (for the length-limit check)."""
    probe = ffmpeg.probe(video_path)
    # Read the container-level duration; per-stream duration fields are omitted
    # by some formats, so probe['streams'][0]['duration'] is unreliable.
    return float(probe['format'].get('duration', 0))

def has_audio(video_path):
    """Check if the video has an audio stream."""
    probe = ffmpeg.probe(video_path)
    return any(s['codec_type'] == 'audio' for s in probe['streams'])

def transcribe_audio(audio_path, model_name, lyrics_mode, progress=gr.Progress()):
    """Whisper transcription (Blink-like: high accuracy, Hinglish auto-detect, lyrics mode)."""
    progress(0, desc="Loading Whisper model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        # Word-level timestamps give lyrics-like precision; segment-level is faster.
        return_timestamps="word" if lyrics_mode else True,
        chunk_length_s=30,
        batch_size=8,
    )

    # Force transcription (not translation); language=None auto-detects the Hindi/English mix.
    generate_kwargs = {"task": "transcribe", "language": None}

    progress(0.5, desc="Transcribing...")
    result = pipe(audio_path, generate_kwargs=generate_kwargs)

    # The pipeline returns chunks as {'text': ..., 'timestamp': (start, end)};
    # normalize to {'text', 'start', 'end'} and trim short/silent segments (<0.5s).
    # In lyrics mode each chunk is a single word, so skip trimming there.
    segments = []
    for chunk in result.get('chunks', []):
        start, end = chunk['timestamp']
        if end is None:  # Whisper occasionally leaves the final end timestamp open
            end = start + 0.5
        if lyrics_mode or (end - start) >= 0.5:
            segments.append({'text': chunk['text'], 'start': start, 'end': end})

    progress(1, desc="Transcription complete!")
    return segments  # List of {'text': str, 'start': float, 'end': float}

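# Example of a normalized segment in segment mode (hypothetical values):
#   {'text': ' Hello doston, welcome back!', 'start': 0.0, 'end': 2.4}
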
def translate_text(text, target_lang):
    """Optional translation (Blink-like: 120+ langs)."""
    if target_lang == "en":  # Default: no translation
        return text
    return translator.translate(text, dest=target_lang).text

def format_ass_time(seconds):
    """Format seconds as ASS time (H:MM:SS.cc, centisecond precision)."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    cs = int((seconds % 1) * 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress=gr.Progress()):
    """Create ASS subtitles (styled like Blink: fonts/colors/emojis/highlights/animations)."""
    progress(0, desc="Generating styled subtitles...")

    # Map sizes
    size_map = {"small (24)": 24, "medium (32)": 32, "large (40)": 40}
    fontsize = size_map[size]

    # Colors as ASS hex, which is BGR order (&HBBGGRR&): yellow RGB FFFF00 -> &H00FFFF&
    color_map = {"white": "&HFFFFFF&", "yellow": "&H00FFFF&", "black": "&H000000&"}
    subtitle_color = color_map[color]

    # Position: bottom/top (ASS numpad-style alignment codes)
    alignment = 2 if position == "bottom" else 8

    # ASS header (V4+ Styles for Blink-like customizations)
    ass_content = """[Script Info]
Title: Generated Subtitles
ScriptType: v4.00+

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{fontsize},{subtitle_color},&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,{alignment},10,10,10,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(font=font, fontsize=fontsize, subtitle_color=subtitle_color, alignment=alignment)

    # Build events with enhancements
    for seg in segments:
        start = format_ass_time(seg['start'])
        end = format_ass_time(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)

        # Word emphasis/highlights (e.g., make "wow" pop in bold red, then restore the base style)
        for word in emphasis_words:
            if word.lower() in text.lower():
                text = text.replace(word, f"{{\\b1\\c&H0000FF&}}{word}{{\\b0\\c{subtitle_color}}}")

        # Add emoji example (Blink-like: one-click emojis)
        if "!" in text:
            text += " 😎"  # Simple auto-add

        # Basic fade-in/out animation (Blink dynamic)
        text = f"{{\\fad(200,200)}}{text}"

        # Dialogue event
        ass_content += f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}\n"

    progress(1, desc="ASS file ready!")
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ass', delete=False, encoding='utf-8') as f:
        f.write(ass_content)
    return f.name

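# Worked example: format_ass_time(83.5) -> "0:01:23.50", so an emphasized, faded
# event comes out roughly like this (hypothetical text, white base color):
#   Dialogue: 0,0:01:23.50,0:01:25.10,Default,,0,0,0,,{\fad(200,200)}That was {\b1\c&H0000FF&}wow{\b0\c&HFFFFFF&}! 😎
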
def burn_subtitles(video_path, ass_path, progress=gr.Progress()):
    """Burn ASS into the video using FFmpeg (Blink-like: HD export, no watermark)."""
    progress(0, desc="Burning subtitles...")
    output_path = video_path.rsplit('.', 1)[0] + '_subtitled.mp4'

    # Apply the subtitles filter to the video stream only and map the original audio
    # back in; filtering the combined stream would drop audio from the output.
    # The ASS file already carries the user's style, so no force_style override here.
    inp = ffmpeg.input(video_path)
    video = inp.video.filter('subtitles', ass_path)
    stream = ffmpeg.output(video, inp.audio, output_path, vcodec='libx264', acodec='aac', preset='fast')  # HD

    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    progress(1, desc="Video ready!")
    return output_path

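# Roughly the equivalent CLI, for debugging the burn step outside Python:
#   ffmpeg -i input.mp4 -vf "subtitles=subs.ass" -c:v libx264 -preset fast -c:a aac output.mp4
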
def main_process(
    video_file, yt_url, model_choice, lyrics_mode, target_lang,
    font, color, size, position, emphasis_words_str,
    progress=gr.Progress()
):
    """Main pipeline (Blink one-click flow)."""
    if video_file is None and not yt_url:  # the URL textbox yields "" when empty, not None
        raise gr.Error("Upload a video or enter a YouTube URL!")

    if video_file:
        # gr.File may hand back a filepath string or a tempfile wrapper, depending on the Gradio version
        video_path = video_file if isinstance(video_file, str) else video_file.name
    else:
        video_path = download_youtube(yt_url, progress)

    # Validation (Blink error handling)
    duration = get_video_duration(video_path)
    if duration > 600:  # 10 min
        raise gr.Error("Video too long! Limit to 10 minutes for demo.")
    if not has_audio(video_path):
        raise gr.Error("No audio detected!")

    # Extract audio
    audio_path = extract_audio(video_path)

    # Transcribe
    model_name = MODEL_CHOICES[model_choice]
    segments = transcribe_audio(audio_path, model_name, lyrics_mode, progress)
    if not segments:
        raise gr.Error("No speech detected!")

    # Emphasis words (strip stray whitespace around the commas)
    emphasis_words = [w.strip() for w in emphasis_words_str.split(',') if w.strip()] if emphasis_words_str else []

    # Create styled ASS
    ass_path = create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress)

    # Burn
    output_video = burn_subtitles(video_path, ass_path, progress)

    # Also save an SRT for download (simple conversion from segments)
    srt_content = ""
    for i, seg in enumerate(segments, 1):
        start = f"{int(seg['start']//3600):02d}:{int((seg['start']%3600)//60):02d}:{int(seg['start']%60):02d},{int((seg['start']%1)*1000):03d}"
        end = f"{int(seg['end']//3600):02d}:{int((seg['end']%3600)//60):02d}:{int(seg['end']%60):02d},{int((seg['end']%1)*1000):03d}"
        text = translate_text(seg['text'].strip(), target_lang)
        srt_content += f"{i}\n{start} --> {end}\n{text}\n\n"
    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False, encoding='utf-8') as f:
        f.write(srt_content)
        srt_path = f.name

    # Preview thumbnail (simple FFmpeg frame grab at the 1s mark)
    thumb_path = video_path.rsplit('.', 1)[0] + '_thumb.jpg'
    ffmpeg.input(video_path, ss=1).output(thumb_path, vframes=1).run(overwrite_output=True, quiet=True)

    return output_video, srt_path, thumb_path

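# Example SRT entry as written above (hypothetical values):
#   1
#   00:00:00,000 --> 00:00:02,400
#   Hello doston, welcome back!
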
# Gradio UI (Blink simple: upload, selectors, progress, downloads)
with gr.Blocks(title="BlinkCaptions Clone - Free Auto Subtitles") as demo:
    gr.Markdown("# 🚀 BlinkCaptions-like AI Subtitle Generator\nGenerate dynamic Hinglish captions with styles! Viral-ready for Reels/YouTube.")

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(label="Upload Video (MP4, <10min)", file_types=[".mp4", ".mkv", ".webm"])
            yt_input = gr.Textbox(label="Or YouTube URL", placeholder="https://youtube.com/watch?v=...")
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(choices=list(MODEL_CHOICES.keys()), value="Large-v3 (accurate, multilingual)", label="Model")
            lyrics_mode = gr.Checkbox(label="Lyrics Mode (for music)", value=False)
            target_lang = gr.Dropdown(choices=LANGUAGES, value="en", label="Translate To")

    with gr.Row():
        font = gr.Dropdown(choices=FONTS, value="Arial", label="Font")
        color = gr.Dropdown(choices=COLORS, value="white", label="Color")
        size = gr.Dropdown(choices=SIZES, value="medium (32)", label="Size")
        position = gr.Dropdown(choices=POSITIONS, value="bottom", label="Position")

    emphasis_input = gr.Textbox(label="Emphasize Words (comma-separated, e.g., wow,amazing)", placeholder="For highlights!")

    process_btn = gr.Button("Generate Captions!", variant="primary")

    with gr.Row():
        output_video = gr.File(label="Download Video with Subtitles")
        output_srt = gr.File(label="Download SRT Transcript")
        preview = gr.Image(label="Preview Thumbnail")

    # Progress is reported inside the fn via gr.Progress()
    process_btn.click(
        main_process,
        inputs=[video_input, yt_input, model_choice, lyrics_mode, target_lang, font, color, size, position, emphasis_input],
        outputs=[output_video, output_srt, preview]
    )

    gr.Markdown("**Free & No Login!** Optimized for speed (CUDA auto). Built with ❤️ for creators.")

if __name__ == "__main__":
    # For HF Spaces; share=True is unnecessary there, since the Space itself exposes the app
    demo.launch(server_name="0.0.0.0", server_port=7860)