Spaces:

becteur92
/

smollvm

Paused

App Files Community

youssef committed on Feb 23

Commit

b841197

1 Parent(s): 11484b5

use cuda for ffmpeg

Browse files

Files changed (3) hide show

Dockerfile +2 -0
src/app.py +16 -9
src/video_processor/processor.py +65 -21

Dockerfile CHANGED Viewed

@@ -23,6 +23,8 @@ RUN apt-get update && \
     liblzma-dev \
     # gradio dependencies \
     ffmpeg \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

     liblzma-dev \
     # gradio dependencies \
     ffmpeg \
+    # NVIDIA Video Codec SDK \
+    libnvidia-encode-12-3 \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

src/app.py CHANGED Viewed

@@ -70,18 +70,21 @@ def on_process(video):
         # Process segments and show progress
         segments = []
-        total_processing_time = 0
         for i, segment in enumerate(analyzer.process_video(video)):
-            segment_start = time.time()
             segments.append(segment)
-            segment_time = time.time() - segment_start
-            total_processing_time += segment_time
             progress = int((i + 1) / total_segments * 100)
-            avg_time_per_segment = total_processing_time / (i + 1)
             remaining_segments = total_segments - (i + 1)
-            estimated_remaining = remaining_segments * avg_time_per_segment
             # Format current segments
             formatted_desc = "### Video Analysis by Segments:\n\n"
@@ -90,8 +93,9 @@ def on_process(video):
             yield [
                 f"Processing segments... {progress}% complete\n" +
-                f"Segment {i+1}/{total_segments} processed in {segment_time:.2f}s\n" +
-                f"Average time per segment: {avg_time_per_segment:.2f}s\n" +
                 f"Estimated time remaining: {estimated_remaining:.2f}s",
                 formatted_desc,
                 gr.update(visible=True)
@@ -101,7 +105,10 @@ def on_process(video):
         yield [
             f"Processing complete!\n" +
             f"Total processing time: {total_time:.2f}s\n" +
-            f"Average time per segment: {total_processing_time/total_segments:.2f}s",
             formatted_desc,
             gr.update(visible=True)
         ]

         # Process segments and show progress
         segments = []
+        total_ffmpeg_time = 0
+        total_inference_time = 0
         for i, segment in enumerate(analyzer.process_video(video)):
             segments.append(segment)
+            # Update timing totals
+            total_ffmpeg_time += segment['processing_times']['ffmpeg']
+            total_inference_time += segment['processing_times']['inference']
             progress = int((i + 1) / total_segments * 100)
+            avg_ffmpeg_time = total_ffmpeg_time / (i + 1)
+            avg_inference_time = total_inference_time / (i + 1)
             remaining_segments = total_segments - (i + 1)
+            estimated_remaining = remaining_segments * (avg_ffmpeg_time + avg_inference_time)
             # Format current segments
             formatted_desc = "### Video Analysis by Segments:\n\n"
             yield [
                 f"Processing segments... {progress}% complete\n" +
+                f"Segment {i+1}/{total_segments}\n" +
+                f"FFmpeg processing: {segment['processing_times']['ffmpeg']:.2f}s (avg: {avg_ffmpeg_time:.2f}s)\n" +
+                f"Model inference: {segment['processing_times']['inference']:.2f}s (avg: {avg_inference_time:.2f}s)\n" +
                 f"Estimated time remaining: {estimated_remaining:.2f}s",
                 formatted_desc,
                 gr.update(visible=True)
         yield [
             f"Processing complete!\n" +
             f"Total processing time: {total_time:.2f}s\n" +
+            f"Average per segment:\n" +
+            f"  - FFmpeg: {total_ffmpeg_time/total_segments:.2f}s\n" +
+            f"  - Inference: {total_inference_time/total_segments:.2f}s\n" +
+            f"  - Total: {(total_ffmpeg_time + total_inference_time)/total_segments:.2f}s",
             formatted_desc,
             gr.update(visible=True)
         ]

src/video_processor/processor.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
-from typing import List, Dict
 import logging
 import os
 import subprocess
 import json
 import tempfile
 logger = logging.getLogger(__name__)
@@ -44,7 +45,7 @@ class VideoAnalyzer:
             raise RuntimeError("CUDA is required but not available!")
         logger.info("Initializing VideoAnalyzer")
-        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
         logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
         # Load processor and model
@@ -53,6 +54,7 @@ class VideoAnalyzer:
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
             _attn_implementation="flash_attention_2"
         ).to(DEVICE)
         logger.info(f"Model loaded on device: {self.model.device}")
@@ -101,20 +103,19 @@ Be specific about visual details but stay concise."""}
         )
         return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
-    def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
         try:
             # Create temp directory for segments
             temp_dir = tempfile.mkdtemp()
-            segments_info = []
             # Get video duration
             duration = get_video_duration_seconds(video_path)
-            segments_processed = 0
-            total_segments = int(duration / segment_length)
             logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")
             # Process video in segments
             for segment_idx in range(total_segments):
                 start_time = segment_idx * segment_length
                 end_time = min(start_time + segment_length, duration)
@@ -122,40 +123,83 @@ Be specific about visual details but stay concise."""}
                 if start_time >= duration:
                     break
-                # Create segment
                 segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
                 cmd = [
                     "ffmpeg",
-                    "-y",
                     "-i", video_path,
-                    "-ss", str(start_time),
-                    "-t", str(end_time - start_time),  # Duration of this segment
-                    "-c:v", "libx264",
-                    "-preset", "ultrafast",
-                    "-pix_fmt", "yuv420p",
                     segment_path
                 ]
-                subprocess.run(cmd, check=True)
                 # Analyze segment
                 description = self.analyze_segment(segment_path, start_time)
                 # Add segment info with timestamp
-                segments_info.append({
-                    "timestamp": format_duration(start_time),
-                    "description": description
-                })
                 # Clean up segment file
                 os.remove(segment_path)
-                logger.info(f"Processed segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s)")
             # Clean up temp directory
             os.rmdir(temp_dir)
-            return segments_info
         except Exception as e:
             logger.error(f"Error processing video: {str(e)}", exc_info=True)
             raise

 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
+from typing import List, Dict, Generator
 import logging
 import os
 import subprocess
 import json
 import tempfile
+import time
 logger = logging.getLogger(__name__)
             raise RuntimeError("CUDA is required but not available!")
         logger.info("Initializing VideoAnalyzer")
+        self.model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
         logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
         # Load processor and model
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
+            device_map=DEVICE,
             _attn_implementation="flash_attention_2"
         ).to(DEVICE)
         logger.info(f"Model loaded on device: {self.model.device}")
         )
         return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
+    def process_video(self, video_path: str, segment_length: int = 10) -> Generator[Dict, None, None]:
         try:
             # Create temp directory for segments
             temp_dir = tempfile.mkdtemp()
             # Get video duration
             duration = get_video_duration_seconds(video_path)
+            total_segments = (int(duration) + segment_length - 1) // segment_length
             logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")
             # Process video in segments
             for segment_idx in range(total_segments):
+                segment_start_time = time.time()
                 start_time = segment_idx * segment_length
                 end_time = min(start_time + segment_length, duration)
                 if start_time >= duration:
                     break
+                # Create segment - Optimized ffmpeg settings
                 segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
                 cmd = [
                     "ffmpeg",
+                    "-y",  # Overwrite output files
+                    "-hwaccel", "cuda",  # Use CUDA hardware acceleration
+                    "-hwaccel_output_format", "cuda",  # Keep frames in GPU memory
+                    "-threads", "0",  # Use all available CPU threads
+                    "-thread_type", "frame",  # Frame-level multi-threading
                     "-i", video_path,
+                    "-ss", str(start_time),  # Seek position
+                    "-t", str(end_time - start_time),  # Duration
+                    "-c:v", "h264_nvenc",  # Use NVIDIA hardware encoder
+                    "-preset", "p1",  # Lowest latency preset for NVENC
+                    "-tune", "ll",  # Low latency tuning
+                    "-rc", "vbr",  # Variable bitrate mode
+                    "-cq", "28",  # Quality-based VBR
+                    "-b:v", "0",  # Let VBR control bitrate
+                    "-vf", "scale_cuda=640:-2",  # GPU-accelerated scaling
+                    "-an",  # Remove audio
                     segment_path
                 ]
+                ffmpeg_start = time.time()
+                try:
+                    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+                    logger.debug(f"FFmpeg output: {result.stderr}")
+                except subprocess.CalledProcessError as e:
+                    logger.error(f"FFmpeg error: {e.stderr}")
+                    # Fallback to CPU if GPU encoding fails
+                    logger.warning("Falling back to CPU encoding")
+                    cmd = [
+                        "ffmpeg",
+                        "-y",
+                        "-threads", "0",
+                        "-i", video_path,
+                        "-ss", str(start_time),
+                        "-t", str(end_time - start_time),
+                        "-c:v", "libx264",
+                        "-preset", "ultrafast",
+                        "-tune", "fastdecode",
+                        "-crf", "28",
+                        "-vf", "scale=640:-2",
+                        "-an",
+                        "-pix_fmt", "yuv420p",
+                        segment_path
+                    ]
+                    subprocess.run(cmd, check=True, capture_output=True)
+                ffmpeg_time = time.time() - ffmpeg_start
                 # Analyze segment
+                inference_start = time.time()
                 description = self.analyze_segment(segment_path, start_time)
+                inference_time = time.time() - inference_start
                 # Add segment info with timestamp
+                yield {
+                    "timestamp": format_duration(int(start_time)),
+                    "description": description,
+                    "processing_times": {
+                        "ffmpeg": ffmpeg_time,
+                        "inference": inference_time,
+                        "total": time.time() - segment_start_time
+                    }
+                }
                 # Clean up segment file
                 os.remove(segment_path)
+                logger.info(
+                    f"Segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s) - "
+                    f"FFmpeg: {ffmpeg_time:.2f}s, Inference: {inference_time:.2f}s"
+                )
             # Clean up temp directory
             os.rmdir(temp_dir)
         except Exception as e:
             logger.error(f"Error processing video: {str(e)}", exc_info=True)
             raise