youssef committed
Commit 5f42812 · 1 Parent(s): 97c5040

Initial setup for HF Space

- .github/workflows/sync-to-hub.yml +15 -0
- requirements.txt +8 -0
- src/app.py +31 -0
- src/video_processor/processor.py +81 -0
.github/workflows/sync-to-hub.yml
ADDED
@@ -0,0 +1,15 @@
+name: Sync to Hugging Face Hub
+on:
+  push:
+    branches: [main]
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Push to HF Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push https://bnkd:$HF_TOKEN@huggingface.co/spaces/bnkd/smolvm-demo main
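A caveat for anyone adapting this workflow: actions/checkout@v3 produces a shallow clone without LFS files by default, so a plain git push of it can be rejected; Hugging Face's reference sync workflow adds fetch-depth: 0 and lfs: true to the checkout step and --force to the push. As an alternative to the raw git push, the same sync can be done with the huggingface_hub client. A minimal sketch, assuming the huggingface_hub package is installed and HF_TOKEN is exported; this is not part of the commit:

# Sketch of the same sync via huggingface_hub instead of a raw `git push`.
# Assumes `pip install huggingface_hub` and HF_TOKEN in the environment.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",                 # the checked-out repository root
    repo_id="bnkd/smolvm-demo",      # Space from the push URL above
    repo_type="space",
    ignore_patterns=[".git/*", ".github/*"],
)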
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch==2.1.2
+torchvision==0.16.2
+transformers @ git+https://github.com/huggingface/transformers@v4.49.0-SmolVLM-2
+num2words==0.5.13
+gradio==4.19.2
+av==10.0.0
+numpy==1.24.3
+Pillow==10.0.0
src/app.py
ADDED
@@ -0,0 +1,31 @@
+import gradio as gr
+from video_processor.processor import VideoAnalyzer
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+analyzer = VideoAnalyzer()
+
+def process_video(video_path):
+    """Process video and return description"""
+    try:
+        logger.info(f"Processing video: {video_path}")
+        results = analyzer.process_video(video_path)
+        return results[0]["description"]
+    except Exception as e:
+        logger.error(f"Error processing video: {e}")
+        return str(e)
+
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_video,
+    inputs=gr.Video(label="Upload Video"),
+    outputs=gr.Textbox(label="Video Description"),
+    title="SmolVLM Video Analyzer",
+    description="Upload a video to get a detailed description of its contents."
+)
+
+if __name__ == "__main__":
+    demo.launch()
+
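One hedged note on serving: a single SmolVLM2 generation can take tens of seconds per clip, so on a Space it may be worth enabling Gradio's request queue before launching. A small variant of the launch block above; the explicit bind address is an assumption about the Space's container setup, not part of the commit:

# Variant of the launch block (assumption, not in the commit): queue()
# serializes long-running inference requests so HTTP calls don't time out
# while the model generates.
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)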
src/video_processor/processor.py
ADDED
@@ -0,0 +1,81 @@
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from typing import List, Dict
+import numpy as np
+import logging
+
+logger = logging.getLogger(__name__)
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {DEVICE}")
+
+class VideoAnalyzer:
+    def __init__(self):
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA is required but not available!")
+
+        logger.info("Initializing VideoAnalyzer")
+        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        logger.info(f"Loading model from {self.model_path}")
+
+        cache_dir = "/models"
+        logger.info(f"Using cache directory: {cache_dir}")
+
+        # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            cache_dir=cache_dir,
+            torch_dtype=torch.bfloat16
+        )
+
+        # Load model directly to CUDA
+        device_map = {"": 0}  # Force model to GPU 0
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_path,
+            torch_dtype=torch.bfloat16,
+            device_map=device_map,
+            _attn_implementation="flash_attention_2",
+            cache_dir=cache_dir
+        )
+        logger.info(f"Model loaded on device: {self.model.device}")
+
+    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
+        # NOTE: frame_interval is currently unused; frame sampling is left to the chat template
+        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
+        try:
+            # Create message for model
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": video_path},
+                    {"type": "text", "text": "Describe this video in detail - with all the timestamps and the actions happening in the video. I should be able to understand the video by reading the description, and search for it later."}
+                ]
+            }]
+
+            # Process video using chat template
+            inputs = self.processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(self.model.device)
+
+            # Generate description
+            generated_ids = self.model.generate(
+                **inputs,
+                do_sample=False,
+                max_new_tokens=100
+            )
+            description = self.processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+
+            return [{
+                "description": description
+            }]
+
+        except Exception as e:
+            logger.error(f"Error processing video: {str(e)}", exc_info=True)
+            raise
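One observation on the file above: the model is loaded with _attn_implementation="flash_attention_2", which requires the flash-attn package, and that is not pinned in requirements.txt, so the Space image must provide it separately. For a quick local check of the class, a minimal smoke test might look like this; it assumes a CUDA GPU (the constructor raises otherwise), and "sample.mp4" is a placeholder path, not something from the commit:

# Minimal smoke test for VideoAnalyzer; run from src/ so the
# video_processor package resolves. "sample.mp4" is a placeholder.
from video_processor.processor import VideoAnalyzer

analyzer = VideoAnalyzer()                      # loads SmolVLM2 onto GPU 0
results = analyzer.process_video("sample.mp4")  # [{"description": "..."}]
print(results[0]["description"])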