Spaces:

becteur92
/

smollvm

Paused

App Files Files Community

youssef committed on Feb 23

Commit

0820857

1 Parent(s): fb1b414

test

Browse files

Files changed (2) hide show

src/app.py +19 -5
src/video_processor/processor.py +106 -50

src/app.py CHANGED Viewed

@@ -50,11 +50,25 @@ def on_process(video):
         ]
         logger.info(f"Processing video: {video}")
-        result = analyzer.process_video(video)
-        description = result[0]["description"]
-        # Format output
-        formatted_desc = f"### Analysis:\n{description}"
         yield [
             "Processing complete!",
@@ -76,7 +90,7 @@ def on_process(video):
 # Create Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# SmolVLM Video Analyzer")
-    gr.Markdown("Upload a video to get a detailed analysis of its content.")
     with gr.Row():
         with gr.Column(scale=1):

         ]
         logger.info(f"Processing video: {video}")
+        segments = []
+        duration = analyzer.get_video_duration_seconds(video)
+        total_segments = int(duration / 10)  # Using default 10-second segments
+        # Process video segments
+        for i, segment in enumerate(analyzer.process_video(video)):
+            segments.append(segment)
+            progress = int((i + 1) / total_segments * 100)
+            # Format current segments
+            formatted_desc = "### Video Analysis by Segments:\n\n"
+            for seg in segments:
+                formatted_desc += f"**[{seg['timestamp']}]** {seg['description']}\n\n"
+            yield [
+                f"Processing segments... {progress}% complete",
+                formatted_desc,
+                gr.update(visible=True)
+            ]
         yield [
             "Processing complete!",
 # Create Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# SmolVLM Video Analyzer")
+    gr.Markdown("Upload a video to get a detailed analysis of its content, split into segments with timestamps.")
     with gr.Row():
         with gr.Column(scale=1):

src/video_processor/processor.py CHANGED Viewed

@@ -2,6 +2,10 @@ import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from typing import List, Dict
 import logging
 logger = logging.getLogger(__name__)
@@ -12,6 +16,24 @@ def _grab_best_device(use_gpu=True):
         device = "cpu"
     return device
 DEVICE = _grab_best_device()
 logger.info(f"Using device: {DEVICE}")
@@ -35,60 +57,94 @@ class VideoAnalyzer:
         ).to(DEVICE)
         logger.info(f"Model loaded on device: {self.model.device}")
-    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
-        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
-        try:
-            # Create message for model with detailed system prompt
-            messages = [
-                {
-                    "role": "system",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "You are a detailed video analysis assistant that can understand videos. Your task is to provide comprehensive descriptions including all events, actions, and important details with their timestamps. Focus on being specific and thorough."
-                        }
-                    ]
-                },
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "path": video_path},
-                        {"type": "text", "text": "Please provide a detailed analysis of this video. Include:\n1. All significant actions and events\n2. Temporal information and timestamps\n3. Important visual details and context\n4. Any text or speech content if present\n5. Scene transitions and changes\nBe thorough and specific so the description can be used for detailed searching later."}
-                    ]
-                }
-            ]
-            logger.info(f"Applying chat template - message: {messages}")
-            # Process video using chat template
-            inputs = self.processor.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(DEVICE, dtype=torch.bfloat16)
-            logger.info(f"Generating IDs")
-            # Generate description with increased token limit
-            generated_ids = self.model.generate(
-                **inputs,
-                do_sample=True,
-                temperature=0.7,
-                max_new_tokens=512  # Increased from 100 to get more detailed descriptions
-            )
-            logger.info(f"batch decoding...")
-            description = self.processor.batch_decode(
-                generated_ids,
-                skip_special_tokens=True
-            )[0]
-            return [{
-                "description": description.split("Assistant: ")[-1]  # Remove assistant prefix if present
-            }]
         except Exception as e:
             logger.error(f"Error processing video: {str(e)}", exc_info=True)

 from transformers import AutoProcessor, AutoModelForImageTextToText
 from typing import List, Dict
 import logging
+import os
+import subprocess
+import json
+import tempfile
 logger = logging.getLogger(__name__)
         device = "cpu"
     return device
def get_video_duration_seconds(video_path: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        The container-level duration (the ``format.duration`` field of
        ffprobe's JSON output) converted to ``float``.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status (for example
            because the file is missing or unreadable).
        ValueError: If ffprobe succeeds but its output is not valid JSON or
            carries no usable ``format.duration`` value.
    """
    cmd = [
        "ffprobe",
        # "error" instead of "quiet": stdout stays pure JSON, but a failure
        # still leaves an explanation on stderr that we can surface.
        "-v", "error",
        "-print_format", "json",
        "-show_format",
        video_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Without this check a failed probe yields empty stdout and the caller
    # only ever sees an unrelated-looking JSON decoding error.
    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {video_path!r} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    try:
        info = json.loads(result.stdout)
        return float(info["format"]["duration"])
    except (KeyError, TypeError, ValueError) as err:
        # json.JSONDecodeError is a ValueError subclass, so it lands here too.
        raise ValueError(
            f"ffprobe reported no usable duration for {video_path!r}"
        ) from err
def format_duration(seconds: float) -> str:
    """Format a number of seconds as a zero-padded ``MM:SS`` string.

    Args:
        seconds: Elapsed time in seconds. Fractional values are truncated to
            whole seconds so that float durations can be passed directly.

    Returns:
        The time as ``"MM:SS"``. Minutes are not wrapped into hours, so one
        hour is rendered as ``"60:00"``.
    """
    # int() first: the ":02d" format spec rejects floats outright.
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
 DEVICE = _grab_best_device()
 logger.info(f"Using device: {DEVICE}")
         ).to(DEVICE)
         logger.info(f"Model loaded on device: {self.model.device}")
+    def analyze_segment(self, video_path: str, start_time: float) -> str:
+        """Analyze a single video segment."""
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": """You are a detailed video analysis assistant with expertise in scene description. Your task is to:
+1. Describe the visual content with precise details
+2. Note any significant actions or movements
+3. Describe important objects, people, or elements in the scene
+4. Capture the mood, atmosphere, or emotional content if present
+5. Mention any scene transitions or camera movements
+Be specific and thorough, but focus only on what is visually present in this segment."""}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": video_path},
+                    {"type": "text", "text": """Describe this video segment in detail. Focus on:
+- What objects, people, or elements are visible?
+- What actions or movements are occurring?
+- What is the setting or environment?
+- Are there any notable visual effects or transitions?
+- What is the overall mood or atmosphere?
+Be specific about visual details but stay concise."""}
+                ]
+            }
+        ]
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(DEVICE, dtype=torch.bfloat16)
+        outputs = self.model.generate(
+            **inputs,
+            do_sample=True,
+            temperature=0.7,
+            max_new_tokens=256
+        )
+        return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
+    def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
+        try:
+            # Create temp directory for segments
+            temp_dir = tempfile.mkdtemp()
+            segments_info = []
+            # Get video duration
+            duration = get_video_duration_seconds(video_path)
+            # Process video in segments
+            for start_time in range(0, int(duration), segment_length):
+                end_time = min(start_time + segment_length, duration)
+                # Create segment
+                segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
+                cmd = [
+                    "ffmpeg",
+                    "-y",
+                    "-i", video_path,
+                    "-ss", str(start_time),
+                    "-t", str(segment_length),
+                    "-c:v", "libx264",
+                    "-preset", "ultrafast",
+                    "-pix_fmt", "yuv420p",
+                    segment_path
+                ]
+                subprocess.run(cmd, check=True)
+                # Analyze segment
+                description = self.analyze_segment(segment_path, start_time)
+                # Add segment info with timestamp
+                segments_info.append({
+                    "timestamp": format_duration(start_time),
+                    "description": description
+                })
+                # Clean up segment file
+                os.remove(segment_path)
+            # Clean up temp directory
+            os.rmdir(temp_dir)
+            return segments_info
         except Exception as e:
             logger.error(f"Error processing video: {str(e)}", exc_info=True)