# app.py - BlinkCaptions-like Auto Subtitle Generator for Hugging Face Spaces
# This Gradio app mimics Vozo.ai's BlinkCaptions: AI-powered one-click dynamic captions
# with high accuracy, Hinglish support, styles, emojis/highlights, and FFmpeg burning.
# Key steps (Blink-like):
# 1. Input handling: Upload video or YouTube URL download.
# 2. Audio extraction & validation (duration <10min, has audio).
# 3. Whisper transcription: auto language detection (handles Hinglish), timestamps for sync.
#    - Lyrics mode: word-level timestamps (return_timestamps="word") for music-like precision.
#    - Trim: Skip short/silent segments (<0.5s).
# 4. Enhancements: Word emphasis (e.g., wrap "wow" in bold/color tags).
# 5. Translation: Optional, via googletrans here (argostranslate is a heavier offline alternative).
# 6. ASS subtitle creation: Styled with fonts/colors/sizes/positions/animations/emojis.
# 7. Burn to video: FFmpeg overlays HD output, no watermark.
# 8. UI: Simple, free, viral-ready for Reels/YouTube.
# Deploy: Save as app.py, add requirements.txt, create HF Space (Gradio SDK).
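# A plausible requirements.txt for this Space (a sketch; versions are assumptions, pin as needed):
#   gradio
#   transformers
#   torch
#   ffmpeg-python
#   yt-dlp
#   googletrans==4.0.0rc1
# The ffmpeg binary itself must also be present (add "ffmpeg" to packages.txt on HF Spaces if missing).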

import os
import tempfile
import gradio as gr
from transformers import pipeline
import torch
import ffmpeg
from yt_dlp import YoutubeDL
from googletrans import Translator  # Fallback to googletrans for simplicity (argos heavy for 120+ langs)
# Note: For argostranslate, uncomment below and pre-install packs in HF Space Dockerfile if needed.
# from argostranslate import package, translate
# package.update_package_index()  # Run once

# Model options (lighter for speed)
MODEL_CHOICES = {
    "Base (fast)": "openai/whisper-base",
    "Small": "openai/whisper-small",
    "Medium": "openai/whisper-medium",
    "Large-v3 (accurate, multilingual)": "openai/whisper-large-v3"
}

# Style options (200+ simulated via combos; Blink-like: Montserrat/Arial, colors, etc.)
FONTS = ["Arial", "Montserrat"]  # Add more if custom fonts available
COLORS = ["white", "yellow", "black"]
SIZES = ["small (24)", "medium (32)", "large (40)"]
POSITIONS = ["bottom", "top"]
LANGUAGES = ["en", "hi", "fr", "es"]  # Sample; extend to 120+ with googletrans

translator = Translator()  # googletrans instance

def download_youtube(url, progress=gr.Progress()):
    """Download a YouTube video using yt-dlp (Blink-like: social input support)."""
    progress(0, desc="Downloading video...")
    # Use a persistent temp dir: a `with TemporaryDirectory()` block would delete
    # the downloaded file before the caller could use it.
    tmpdir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best[height<=720]/best',  # HD but not too heavy
        'outtmpl': os.path.join(tmpdir, '%(title)s.%(ext)s'),  # avoid os.chdir side effects
        'noplaylist': True,
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    files = [f for f in os.listdir(tmpdir) if f.endswith(('.mp4', '.mkv', '.webm'))]
    if files:
        video_path = os.path.join(tmpdir, files[0])
        progress(1, desc="Download complete!")
        return video_path
    raise ValueError("Download failed!")

def extract_audio(video_path):
    """Extract audio from video using FFmpeg (prep for Whisper)."""
    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
    stream = ffmpeg.input(video_path)
    stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ac=1, ar=16000)  # Whisper format
    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    return audio_path

def get_video_duration(video_path):
    """Get duration in seconds via FFmpeg probe (for the length-limit check)."""
    probe = ffmpeg.probe(video_path)
    # Duration is reported at the container level; individual streams don't always carry it.
    return float(probe['format'].get('duration', 0))

def has_audio(video_path):
    """Check if video has audio stream."""
    probe = ffmpeg.probe(video_path)
    return any(s['codec_type'] == 'audio' for s in probe['streams'])

def transcribe_audio(audio_path, model_name, lyrics_mode, progress=gr.Progress()):
    """Whisper transcription (Blink-like: high accuracy, Hinglish auto-detect, lyrics mode)."""
    progress(0, desc="Loading Whisper model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        chunk_length_s=30,
        batch_size=8,
    )

    # Transcribe task; omit "language" so Whisper auto-detects mixed Hindi/English.
    generate_kwargs = {"task": "transcribe"}
    # The transformers pipeline takes word-level timing as return_timestamps="word"
    # at call time; "word_timestamps" is not a valid generate kwarg here.
    timestamps = "word" if lyrics_mode else True

    progress(0.5, desc="Transcribing...")
    result = pipe(audio_path, return_timestamps=timestamps, generate_kwargs=generate_kwargs)

    # Chunks come back as {'text': ..., 'timestamp': (start, end)}; normalize to
    # {'text', 'start', 'end'} and trim short/near-silent segments (<0.5s).
    segments = []
    for chunk in result.get('chunks', []):
        start, end = chunk.get('timestamp', (None, None))
        if start is None or end is None:  # the final chunk can lack an end time
            continue
        if (end - start) > 0.5:
            segments.append({'text': chunk['text'], 'start': start, 'end': end})

    progress(1, desc="Transcription complete!")
    return segments  # list of {'text', 'start', 'end'}

def translate_text(text, target_lang):
    """Optional translation (Blink-like: 120+ langs)."""
    if target_lang == "en":  # No translate
        return text
    return translator.translate(text, dest=target_lang).text
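
# Timestamp helpers (minimal sketches added for the writers below): ASS Dialogue
# events use H:MM:SS.cc (centiseconds) and SRT uses HH:MM:SS,mmm, so the raw float
# seconds from Whisper have to be converted before writing either file.
def format_ass_time(seconds):
    """Format float seconds as ASS H:MM:SS.cc."""
    cs = int(round(seconds * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def format_srt_time(seconds):
    """Format float seconds as SRT HH:MM:SS,mmm."""
    ms = int(round(seconds * 1000))
    h, rem = divmod(ms, 3600000)
    m, rem = divmod(rem, 60000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"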

def create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress=gr.Progress()):
    """Create ASS subtitles (styled like Blink: fonts/colors/emojis/highlights/animations)."""
    progress(0, desc="Generating styled subtitles...")
    
    # Map sizes
    size_map = {"small (24)": 24, "medium (32)": 32, "large (40)": 40}
    fontsize = size_map[size]
    
    # Colours in ASS are &HBBGGRR& (BGR order), so yellow (RGB FFFF00) is &H00FFFF&
    color_map = {"white": "&HFFFFFF&", "yellow": "&H00FFFF&", "black": "&H000000&"}
    subtitle_color = color_map[color]
    
    # Position: bottom/top alignment
    alignment = 2 if position == "bottom" else 8  # ASS alignment codes
    
    # ASS header (V4+ Styles for Blink-like customizations)
    ass_content = """[Script Info]
Title: Generated Subtitles
ScriptType: v4.00+
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},{fontsize},{subtitle_color},&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,2,2,{alignment},10,10,10,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(font=font, fontsize=fontsize, subtitle_color=subtitle_color, alignment=alignment)
    
    # Build events with enhancements
    for seg in segments:
        start = format_ass_time(seg['start'])  # ASS wants H:MM:SS.cc, not raw centiseconds
        end = format_ass_time(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)

        # Word emphasis/highlights (e.g., make "wow" pop with bold + red; red in
        # ASS BGR order is &H0000FF&). Reset bold and colour after the word.
        for word in emphasis_words:
            if word.lower() in text.lower():
                text = text.replace(word, f"{{\\b1\\c&H0000FF&}}{word}{{\\b0\\c{subtitle_color}}}")

        # Add emoji example (Blink-like: one-click emojis)
        if "!" in text:
            text += " 😎"  # Simple auto-add

        # Basic fade-in/out animation (Blink dynamic)
        text = f"{{\\fad(200,200)}}{text}"

        # Dialogue event
        ass_content += f"Dialogue: 0,{start},{end},Default,,0,0,0,,{text}\n"
    
    progress(1, desc="ASS file ready!")
    with tempfile.NamedTemporaryFile(mode='w', suffix='.ass', delete=False) as f:
        f.write(ass_content)
        return f.name
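
# Illustrative output: with the defaults above, each generated event looks roughly like
#   Dialogue: 0,0:00:01.25,0:00:03.80,Default,,0,0,0,,{\fad(200,200)}Hello world! 😎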

def burn_subtitles(video_path, ass_path, progress=gr.Progress()):
    """Burn ASS into the video using FFmpeg (Blink-like: HD export, no watermark)."""
    progress(0, desc="Burning subtitles...")
    output_path = video_path.rsplit('.', 1)[0] + '_subtitled.mp4'

    # Apply the subtitles filter to the video stream only and map the original
    # audio explicitly; filtering the whole input node would drop the audio track.
    # Styling lives in the ASS file itself, so no force_style override is needed.
    inp = ffmpeg.input(video_path)
    video = inp.video.filter('subtitles', ass_path)
    stream = ffmpeg.output(video, inp.audio, output_path, vcodec='libx264', acodec='aac', preset='fast')  # HD

    ffmpeg.run(stream, overwrite_output=True, quiet=True)
    progress(1, desc="Video ready!")
    return output_path
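
# For reference, the filter graph above corresponds roughly to this CLI invocation
# (a sketch; exact stream mapping and quoting may differ):
#   ffmpeg -i input.mp4 -vf "subtitles=subs.ass" -c:v libx264 -preset fast -c:a aac input_subtitled.mp4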

def main_process(
    video_file, yt_url, model_choice, lyrics_mode, target_lang,
    font, color, size, position, emphasis_words_str,
    progress=gr.Progress()
):
    """Main pipeline (Blink one-click flow)."""
    if video_file is None and not yt_url:  # an empty Textbox yields "", not None
        raise gr.Error("Upload a video or enter a YouTube URL!")
    
    video_path = None
    if video_file:
        video_path = video_file.name
    elif yt_url:
        video_path = download_youtube(yt_url, progress)
    
    # Validation (Blink error handling)
    duration = get_video_duration(video_path)
    if duration > 600:  # 10 min
        raise gr.Error("Video too long! Limit to 10 minutes for demo.")
    if not has_audio(video_path):
        raise gr.Error("No audio detected!")
    
    # Extract audio
    audio_path = extract_audio(video_path)
    
    # Transcribe
    model_name = MODEL_CHOICES[model_choice]
    segments = transcribe_audio(audio_path, model_name, lyrics_mode, progress)
    if not segments:
        raise gr.Error("No speech detected!")
    
    # Emphasis words (trim whitespace, drop empties)
    emphasis_words = [w.strip() for w in emphasis_words_str.split(',') if w.strip()] if emphasis_words_str else []
    
    # Create styled ASS
    ass_path = create_ass_subtitles(segments, font, color, size, position, emphasis_words, target_lang, progress)
    
    # Burn
    output_video = burn_subtitles(video_path, ass_path, progress)
    
    # Also save an SRT for download (simple conversion from segments)
    srt_content = ""
    for i, seg in enumerate(segments, 1):
        start = format_srt_time(seg['start'])
        end = format_srt_time(seg['end'])
        text = translate_text(seg['text'].strip(), target_lang)
        srt_content += f"{i}\n{start} --> {end}\n{text}\n\n"
    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False) as f:
        f.write(srt_content)
        srt_path = f.name
    
    # Preview thumbnail (simple FFmpeg frame grab at t=1s)
    thumb_path = video_path.rsplit('.', 1)[0] + '_thumb.jpg'
    ffmpeg.input(video_path, ss=1).output(thumb_path, vframes=1).run(overwrite_output=True, quiet=True)
    
    return output_video, srt_path, thumb_path

# Gradio UI (Blink simple: upload, selectors, progress, downloads)
with gr.Blocks(title="BlinkCaptions Clone - Free Auto Subtitles") as demo:
    gr.Markdown("# πŸš€ BlinkCaptions-like AI Subtitle Generator\nGenerate dynamic Hinglish captions with styles! Viral-ready for Reels/YouTube.")
    
    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.File(label="Upload Video (MP4, <10min)", file_types=[".mp4", ".mkv", ".webm"])
            yt_input = gr.Textbox(label="Or YouTube URL", placeholder="https://youtube.com/watch?v=...")
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(choices=list(MODEL_CHOICES.keys()), value="Large-v3 (accurate, multilingual)", label="Model")
            lyrics_mode = gr.Checkbox(label="Lyrics Mode (for music)", value=False)
            target_lang = gr.Dropdown(choices=LANGUAGES, value="en", label="Translate To")
    
    with gr.Row():
        font = gr.Dropdown(choices=FONTS, value="Arial", label="Font")
        color = gr.Dropdown(choices=COLORS, value="white", label="Color")
        size = gr.Dropdown(choices=SIZES, value="medium (32)", label="Size")
        position = gr.Dropdown(choices=POSITIONS, value="bottom", label="Position")
    
    emphasis_input = gr.Textbox(label="Emphasize Words (comma-separated, e.g., wow,amazing)", placeholder="For highlights!")
    
    process_btn = gr.Button("Generate Captions!", variant="primary")
    
    with gr.Row():
        output_video = gr.File(label="Download Video with Subtitles")
        output_srt = gr.File(label="Download SRT Transcript")
        preview = gr.Image(label="Preview Thumbnail")
    
    # Progress integrated in fn
    process_btn.click(
        main_process,
        inputs=[video_input, yt_input, model_choice, lyrics_mode, target_lang, font, color, size, position, emphasis_input],
        outputs=[output_video, output_srt, preview]
    )
    
    gr.Markdown("**Free & No Login!** Optimized for speed (CUDA auto). Built with ❀️ for creators.")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)  # HF Spaces exposes the app itself; share=True is ignored there