youssef
committed on
Commit
·
b841197
1
Parent(s):
11484b5
use cuda for ffmpeg
Browse files
- Dockerfile +2 -0
- src/app.py +16 -9
- src/video_processor/processor.py +65 -21
Dockerfile
CHANGED
|
@@ -23,6 +23,8 @@ RUN apt-get update && \
|
|
| 23 |
liblzma-dev \
|
| 24 |
# gradio dependencies \
|
| 25 |
ffmpeg \
|
|
|
|
|
|
|
| 26 |
&& apt-get clean \
|
| 27 |
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
|
|
|
|
| 23 |
liblzma-dev \
|
| 24 |
# gradio dependencies \
|
| 25 |
ffmpeg \
|
| 26 |
+
# NVIDIA Video Codec SDK \
|
| 27 |
+
libnvidia-encode-12-3 \
|
| 28 |
&& apt-get clean \
|
| 29 |
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
|
src/app.py
CHANGED
|
@@ -70,18 +70,21 @@ def on_process(video):
|
|
| 70 |
|
| 71 |
# Process segments and show progress
|
| 72 |
segments = []
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
for i, segment in enumerate(analyzer.process_video(video)):
|
| 76 |
-
segment_start = time.time()
|
| 77 |
segments.append(segment)
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
|
| 81 |
progress = int((i + 1) / total_segments * 100)
|
| 82 |
-
|
|
|
|
| 83 |
remaining_segments = total_segments - (i + 1)
|
| 84 |
-
estimated_remaining = remaining_segments *
|
| 85 |
|
| 86 |
# Format current segments
|
| 87 |
formatted_desc = "### Video Analysis by Segments:\n\n"
|
|
@@ -90,8 +93,9 @@ def on_process(video):
|
|
| 90 |
|
| 91 |
yield [
|
| 92 |
f"Processing segments... {progress}% complete\n" +
|
| 93 |
-
f"Segment {i+1}/{total_segments}
|
| 94 |
-
f"
|
|
|
|
| 95 |
f"Estimated time remaining: {estimated_remaining:.2f}s",
|
| 96 |
formatted_desc,
|
| 97 |
gr.update(visible=True)
|
|
@@ -101,7 +105,10 @@ def on_process(video):
|
|
| 101 |
yield [
|
| 102 |
f"Processing complete!\n" +
|
| 103 |
f"Total processing time: {total_time:.2f}s\n" +
|
| 104 |
-
f"Average
|
|
|
|
|
|
|
|
|
|
| 105 |
formatted_desc,
|
| 106 |
gr.update(visible=True)
|
| 107 |
]
|
|
|
|
| 70 |
|
| 71 |
# Process segments and show progress
|
| 72 |
segments = []
|
| 73 |
+
total_ffmpeg_time = 0
|
| 74 |
+
total_inference_time = 0
|
| 75 |
|
| 76 |
for i, segment in enumerate(analyzer.process_video(video)):
|
|
|
|
| 77 |
segments.append(segment)
|
| 78 |
+
|
| 79 |
+
# Update timing totals
|
| 80 |
+
total_ffmpeg_time += segment['processing_times']['ffmpeg']
|
| 81 |
+
total_inference_time += segment['processing_times']['inference']
|
| 82 |
|
| 83 |
progress = int((i + 1) / total_segments * 100)
|
| 84 |
+
avg_ffmpeg_time = total_ffmpeg_time / (i + 1)
|
| 85 |
+
avg_inference_time = total_inference_time / (i + 1)
|
| 86 |
remaining_segments = total_segments - (i + 1)
|
| 87 |
+
estimated_remaining = remaining_segments * (avg_ffmpeg_time + avg_inference_time)
|
| 88 |
|
| 89 |
# Format current segments
|
| 90 |
formatted_desc = "### Video Analysis by Segments:\n\n"
|
|
|
|
| 93 |
|
| 94 |
yield [
|
| 95 |
f"Processing segments... {progress}% complete\n" +
|
| 96 |
+
f"Segment {i+1}/{total_segments}\n" +
|
| 97 |
+
f"FFmpeg processing: {segment['processing_times']['ffmpeg']:.2f}s (avg: {avg_ffmpeg_time:.2f}s)\n" +
|
| 98 |
+
f"Model inference: {segment['processing_times']['inference']:.2f}s (avg: {avg_inference_time:.2f}s)\n" +
|
| 99 |
f"Estimated time remaining: {estimated_remaining:.2f}s",
|
| 100 |
formatted_desc,
|
| 101 |
gr.update(visible=True)
|
|
|
|
| 105 |
yield [
|
| 106 |
f"Processing complete!\n" +
|
| 107 |
f"Total processing time: {total_time:.2f}s\n" +
|
| 108 |
+
f"Average per segment:\n" +
|
| 109 |
+
f" - FFmpeg: {total_ffmpeg_time/total_segments:.2f}s\n" +
|
| 110 |
+
f" - Inference: {total_inference_time/total_segments:.2f}s\n" +
|
| 111 |
+
f" - Total: {(total_ffmpeg_time + total_inference_time)/total_segments:.2f}s",
|
| 112 |
formatted_desc,
|
| 113 |
gr.update(visible=True)
|
| 114 |
]
|
src/video_processor/processor.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 3 |
-
from typing import List, Dict
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import subprocess
|
| 7 |
import json
|
| 8 |
import tempfile
|
|
|
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
|
@@ -44,7 +45,7 @@ class VideoAnalyzer:
|
|
| 44 |
raise RuntimeError("CUDA is required but not available!")
|
| 45 |
|
| 46 |
logger.info("Initializing VideoAnalyzer")
|
| 47 |
-
self.model_path = "HuggingFaceTB/SmolVLM2-
|
| 48 |
logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
|
| 49 |
|
| 50 |
# Load processor and model
|
|
@@ -53,6 +54,7 @@ class VideoAnalyzer:
|
|
| 53 |
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 54 |
self.model_path,
|
| 55 |
torch_dtype=torch.bfloat16,
|
|
|
|
| 56 |
_attn_implementation="flash_attention_2"
|
| 57 |
).to(DEVICE)
|
| 58 |
logger.info(f"Model loaded on device: {self.model.device}")
|
|
@@ -101,20 +103,19 @@ Be specific about visual details but stay concise."""}
|
|
| 101 |
)
|
| 102 |
return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
|
| 103 |
|
| 104 |
-
def process_video(self, video_path: str, segment_length: int = 10) ->
|
| 105 |
try:
|
| 106 |
# Create temp directory for segments
|
| 107 |
temp_dir = tempfile.mkdtemp()
|
| 108 |
-
segments_info = []
|
| 109 |
|
| 110 |
# Get video duration
|
| 111 |
duration = get_video_duration_seconds(video_path)
|
| 112 |
-
|
| 113 |
-
total_segments = int(duration / segment_length)
|
| 114 |
logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")
|
| 115 |
|
| 116 |
# Process video in segments
|
| 117 |
for segment_idx in range(total_segments):
|
|
|
|
| 118 |
start_time = segment_idx * segment_length
|
| 119 |
end_time = min(start_time + segment_length, duration)
|
| 120 |
|
|
@@ -122,40 +123,83 @@ Be specific about visual details but stay concise."""}
|
|
| 122 |
if start_time >= duration:
|
| 123 |
break
|
| 124 |
|
| 125 |
-
# Create segment
|
| 126 |
segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
|
| 127 |
cmd = [
|
| 128 |
"ffmpeg",
|
| 129 |
-
"-y",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
"-i", video_path,
|
| 131 |
-
"-ss", str(start_time),
|
| 132 |
-
"-t", str(end_time - start_time), # Duration
|
| 133 |
-
"-c:v", "
|
| 134 |
-
"-preset", "
|
| 135 |
-
"-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
segment_path
|
| 137 |
]
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
# Analyze segment
|
|
|
|
| 141 |
description = self.analyze_segment(segment_path, start_time)
|
|
|
|
| 142 |
|
| 143 |
# Add segment info with timestamp
|
| 144 |
-
|
| 145 |
-
"timestamp": format_duration(start_time),
|
| 146 |
-
"description": description
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
# Clean up segment file
|
| 150 |
os.remove(segment_path)
|
| 151 |
|
| 152 |
-
logger.info(
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
# Clean up temp directory
|
| 155 |
os.rmdir(temp_dir)
|
| 156 |
|
| 157 |
-
return segments_info
|
| 158 |
-
|
| 159 |
except Exception as e:
|
| 160 |
logger.error(f"Error processing video: {str(e)}", exc_info=True)
|
| 161 |
raise
|
|
|
|
| 1 |
import torch
|
| 2 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 3 |
+
from typing import List, Dict, Generator
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import subprocess
|
| 7 |
import json
|
| 8 |
import tempfile
|
| 9 |
+
import time
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 45 |
raise RuntimeError("CUDA is required but not available!")
|
| 46 |
|
| 47 |
logger.info("Initializing VideoAnalyzer")
|
| 48 |
+
self.model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
|
| 49 |
logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
|
| 50 |
|
| 51 |
# Load processor and model
|
|
|
|
| 54 |
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 55 |
self.model_path,
|
| 56 |
torch_dtype=torch.bfloat16,
|
| 57 |
+
device_map=DEVICE,
|
| 58 |
_attn_implementation="flash_attention_2"
|
| 59 |
).to(DEVICE)
|
| 60 |
logger.info(f"Model loaded on device: {self.model.device}")
|
|
|
|
| 103 |
)
|
| 104 |
return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
|
| 105 |
|
| 106 |
+
def process_video(self, video_path: str, segment_length: int = 10) -> Generator[Dict, None, None]:
|
| 107 |
try:
|
| 108 |
# Create temp directory for segments
|
| 109 |
temp_dir = tempfile.mkdtemp()
|
|
|
|
| 110 |
|
| 111 |
# Get video duration
|
| 112 |
duration = get_video_duration_seconds(video_path)
|
| 113 |
+
total_segments = (int(duration) + segment_length - 1) // segment_length
|
|
|
|
| 114 |
logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")
|
| 115 |
|
| 116 |
# Process video in segments
|
| 117 |
for segment_idx in range(total_segments):
|
| 118 |
+
segment_start_time = time.time()
|
| 119 |
start_time = segment_idx * segment_length
|
| 120 |
end_time = min(start_time + segment_length, duration)
|
| 121 |
|
|
|
|
| 123 |
if start_time >= duration:
|
| 124 |
break
|
| 125 |
|
| 126 |
+
# Create segment - Optimized ffmpeg settings
|
| 127 |
segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
|
| 128 |
cmd = [
|
| 129 |
"ffmpeg",
|
| 130 |
+
"-y", # Overwrite output files
|
| 131 |
+
"-hwaccel", "cuda", # Use CUDA hardware acceleration
|
| 132 |
+
"-hwaccel_output_format", "cuda", # Keep frames in GPU memory
|
| 133 |
+
"-threads", "0", # Use all available CPU threads
|
| 134 |
+
"-thread_type", "frame", # Frame-level multi-threading
|
| 135 |
"-i", video_path,
|
| 136 |
+
"-ss", str(start_time), # Seek position
|
| 137 |
+
"-t", str(end_time - start_time), # Duration
|
| 138 |
+
"-c:v", "h264_nvenc", # Use NVIDIA hardware encoder
|
| 139 |
+
"-preset", "p1", # Lowest latency preset for NVENC
|
| 140 |
+
"-tune", "ll", # Low latency tuning
|
| 141 |
+
"-rc", "vbr", # Variable bitrate mode
|
| 142 |
+
"-cq", "28", # Quality-based VBR
|
| 143 |
+
"-b:v", "0", # Let VBR control bitrate
|
| 144 |
+
"-vf", "scale_cuda=640:-2", # GPU-accelerated scaling
|
| 145 |
+
"-an", # Remove audio
|
| 146 |
segment_path
|
| 147 |
]
|
| 148 |
+
|
| 149 |
+
ffmpeg_start = time.time()
|
| 150 |
+
try:
|
| 151 |
+
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
| 152 |
+
logger.debug(f"FFmpeg output: {result.stderr}")
|
| 153 |
+
except subprocess.CalledProcessError as e:
|
| 154 |
+
logger.error(f"FFmpeg error: {e.stderr}")
|
| 155 |
+
# Fallback to CPU if GPU encoding fails
|
| 156 |
+
logger.warning("Falling back to CPU encoding")
|
| 157 |
+
cmd = [
|
| 158 |
+
"ffmpeg",
|
| 159 |
+
"-y",
|
| 160 |
+
"-threads", "0",
|
| 161 |
+
"-i", video_path,
|
| 162 |
+
"-ss", str(start_time),
|
| 163 |
+
"-t", str(end_time - start_time),
|
| 164 |
+
"-c:v", "libx264",
|
| 165 |
+
"-preset", "ultrafast",
|
| 166 |
+
"-tune", "fastdecode",
|
| 167 |
+
"-crf", "28",
|
| 168 |
+
"-vf", "scale=640:-2",
|
| 169 |
+
"-an",
|
| 170 |
+
"-pix_fmt", "yuv420p",
|
| 171 |
+
segment_path
|
| 172 |
+
]
|
| 173 |
+
subprocess.run(cmd, check=True, capture_output=True)
|
| 174 |
+
ffmpeg_time = time.time() - ffmpeg_start
|
| 175 |
|
| 176 |
# Analyze segment
|
| 177 |
+
inference_start = time.time()
|
| 178 |
description = self.analyze_segment(segment_path, start_time)
|
| 179 |
+
inference_time = time.time() - inference_start
|
| 180 |
|
| 181 |
# Add segment info with timestamp
|
| 182 |
+
yield {
|
| 183 |
+
"timestamp": format_duration(int(start_time)),
|
| 184 |
+
"description": description,
|
| 185 |
+
"processing_times": {
|
| 186 |
+
"ffmpeg": ffmpeg_time,
|
| 187 |
+
"inference": inference_time,
|
| 188 |
+
"total": time.time() - segment_start_time
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
|
| 192 |
# Clean up segment file
|
| 193 |
os.remove(segment_path)
|
| 194 |
|
| 195 |
+
logger.info(
|
| 196 |
+
f"Segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s) - "
|
| 197 |
+
f"FFmpeg: {ffmpeg_time:.2f}s, Inference: {inference_time:.2f}s"
|
| 198 |
+
)
|
| 199 |
|
| 200 |
# Clean up temp directory
|
| 201 |
os.rmdir(temp_dir)
|
| 202 |
|
|
|
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
logger.error(f"Error processing video: {str(e)}", exc_info=True)
|
| 205 |
raise
|