youssef committed
Commit 5f42812 · 1 Parent(s): 97c5040

Initial setup for HF Space

- .github/workflows/sync-to-hub.yml +15 -0
- requirements.txt +8 -0
- src/app.py +31 -0
- src/video_processor/processor.py +81 -0
.github/workflows/sync-to-hub.yml
ADDED
@@ -0,0 +1,15 @@
+name: Sync to Hugging Face Hub
+on:
+  push:
+    branches: [main]
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Push to HF Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push https://bnkd:$HF_TOKEN@huggingface.co/spaces/bnkd/smolvm-demo main
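A caveat for anyone adapting this workflow: actions/checkout@v3 produces a shallow clone without LFS files by default, so a plain git push of it can be rejected; Hugging Face's reference sync workflow adds fetch-depth: 0 and lfs: true to the checkout step and --force to the push. As an alternative to the raw git push, the same sync can be done with the huggingface_hub client. A minimal sketch, assuming the huggingface_hub package is installed and HF_TOKEN is exported; this is not part of the commit:

# Sketch of the same sync via huggingface_hub instead of a raw `git push`.
# Assumes `pip install huggingface_hub` and HF_TOKEN in the environment.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",                 # the checked-out repository root
    repo_id="bnkd/smolvm-demo",      # Space from the push URL above
    repo_type="space",
    ignore_patterns=[".git/*", ".github/*"],
)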
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch==2.1.2
+torchvision==0.16.2
+transformers @ git+https://github.com/huggingface/transformers@v4.49.0-SmolVLM-2
+num2words==0.5.13
+gradio==4.19.2
+av==10.0.0
+numpy==1.24.3
+Pillow==10.0.0
src/app.py
ADDED
@@ -0,0 +1,31 @@
+import gradio as gr
+from video_processor.processor import VideoAnalyzer
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+analyzer = VideoAnalyzer()
+
+def process_video(video_path):
+    """Process video and return description"""
+    try:
+        logger.info(f"Processing video: {video_path}")
+        results = analyzer.process_video(video_path)
+        return results[0]["description"]
+    except Exception as e:
+        logger.error(f"Error processing video: {e}")
+        return str(e)
+
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_video,
+    inputs=gr.Video(label="Upload Video"),
+    outputs=gr.Textbox(label="Video Description"),
+    title="SmolVLM Video Analyzer",
+    description="Upload a video to get a detailed description of its contents."
+)
+
+if __name__ == "__main__":
+    demo.launch()
+
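One hedged note on serving: a single SmolVLM2 generation can take tens of seconds per clip, so on a Space it may be worth enabling Gradio's request queue before launching. A small variant of the launch block above; the explicit bind address is an assumption about the Space's container setup, not part of the commit:

# Variant of the launch block (assumption, not in the commit): queue()
# serializes long-running inference requests so HTTP calls don't time out
# while the model generates.
if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)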
src/video_processor/processor.py
ADDED
@@ -0,0 +1,81 @@
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from typing import List, Dict
+import numpy as np
+import logging
+
+logger = logging.getLogger(__name__)
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {DEVICE}")
+
+class VideoAnalyzer:
+    def __init__(self):
+        if not torch.cuda.is_available():
+            raise RuntimeError("CUDA is required but not available!")
+
+        logger.info("Initializing VideoAnalyzer")
+        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        logger.info(f"Loading model from {self.model_path}")
+
+        cache_dir = "/models"
+        logger.info(f"Using cache directory: {cache_dir}")
+
+        # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            cache_dir=cache_dir,
+            torch_dtype=torch.bfloat16
+        )
+
+        # Load model directly to CUDA
+        device_map = {"": 0}  # Force model to GPU 0
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            self.model_path,
+            torch_dtype=torch.bfloat16,
+            device_map=device_map,
+            _attn_implementation="flash_attention_2",
+            cache_dir=cache_dir
+        )
+        logger.info(f"Model loaded on device: {self.model.device}")
+
+    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
+        # NOTE: frame_interval is currently unused; frame sampling is left to the chat template
+        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
+        try:
+            # Create message for model
+            messages = [{
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": video_path},
+                    {"type": "text", "text": "Describe this video in detail - with all the timestamps and the actions happening in the video. I should be able to understand the video by reading the description, and search for it later."}
+                ]
+            }]
+
+            # Process video using chat template
+            inputs = self.processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(self.model.device)
+
+            # Generate description
+            generated_ids = self.model.generate(
+                **inputs,
+                do_sample=False,
+                max_new_tokens=100
+            )
+            description = self.processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+
+            return [{
+                "description": description
+            }]
+
+        except Exception as e:
+            logger.error(f"Error processing video: {str(e)}", exc_info=True)
+            raise
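One observation on the file above: the model is loaded with _attn_implementation="flash_attention_2", which requires the flash-attn package, and that is not pinned in requirements.txt, so the Space image must provide it separately. For a quick local check of the class, a minimal smoke test might look like this; it assumes a CUDA GPU (the constructor raises otherwise), and "sample.mp4" is a placeholder path, not something from the commit:

# Minimal smoke test for VideoAnalyzer; run from src/ so the
# video_processor package resolves. "sample.mp4" is a placeholder.
from video_processor.processor import VideoAnalyzer

analyzer = VideoAnalyzer()                      # loads SmolVLM2 onto GPU 0
results = analyzer.process_video("sample.mp4")  # [{"description": "..."}]
print(results[0]["description"])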