Spaces:
Runtime error
Runtime error
# app.py
import torch; torch.classes.__path__ = []  # Neutralizes Streamlit's torch.classes path inspection

import atexit
import base64
import os
import shutil
import socket
import subprocess
import sys
import threading
import time

import cv2
import requests
import streamlit as st
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
# ── Configuration ──────────────────────────────────────────────────────────────
PORT = 8000                                    # local llama-cpp-python server port
BASE_URL = f"http://localhost:{PORT}/v1"       # OpenAI-compatible API root served locally
MODEL_ALIAS = "gpt-4-vision-preview"           # alias the server registers the model under
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"       # HF Hub repo with the GGUF files
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"        # quantized model weights
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"  # multimodal projector (passed as --clip_model_path)
| # ββ Helpers to download & launch server βββββββββββββββββββββββββββββββββββββββββ | |
def download_if_missing(repo_id: str, filename: str):
    """Ensure *filename* exists locally, fetching it from the Hub if absent.

    Downloads into the Hugging Face cache and copies the cached file to the
    current working directory under *filename*.
    """
    if os.path.exists(filename):
        return
    cached_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
    shutil.copy(cached_path, filename)
def ensure_models():
    """Make sure both the model weights and the multimodal projector are on disk."""
    for fname in (MODEL_FILE, PROJ_FILE):
        download_if_missing(REPO_ID, fname)
def start_server():
    """Launch the llama-cpp-python OpenAI-compatible server as a subprocess.

    Blocks until the child logs "Application startup complete.", then returns
    the ``subprocess.Popen`` handle. A daemon thread keeps draining the child's
    stdout afterwards — without it the child would eventually block once the
    OS pipe buffer fills, since stdout is a PIPE that nobody else reads.
    The process is terminated automatically at interpreter exit.

    Raises:
        RuntimeError: if the server exits before reporting successful startup.
    """
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout so we see errors too
        text=True,                 # so line buffering works
        bufsize=1,
    )
    atexit.register(proc.terminate)
    for line in proc.stdout:
        if "Application startup complete." in line:
            # Keep consuming output forever so the child never blocks on a full pipe.
            threading.Thread(target=proc.stdout.read, daemon=True).start()
            return proc
    # stdout reached EOF without the startup banner: the server died early.
    proc.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}")
# ── Boot llama-cpp-python server ────────────────────────────────────────────────
# Download the GGUF files if needed and start the local vision-chat server
# before any UI is rendered; keep a module-level reference so the Popen
# handle stays alive for the lifetime of the app.
ensure_models()
_server_proc = start_server()
# ── Streamlit UI ────────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
# NOTE(review): the title emoji appears mojibake-mangled in this copy
# ("πΈ" looks like a mis-decoded 📸) — verify against the original encoding.
st.title("πΈ Live Camera Captioning with SmolVLM")
st.markdown(
    """
    Use the **slider** below to choose how often (in milliseconds) to
    send a frame to SmolVLM for captioning. The latest caption will
    be overlaid on your video feed.
    """
)
# Captioning cadence in milliseconds (min 100, max 5000, default 3000).
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
# ── Video processor ─────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
    """Overlays a periodically refreshed SmolVLM caption on each video frame.

    Every ``interval_ms`` milliseconds one frame is JPEG/base64-encoded and
    sent to the local OpenAI-compatible server. The HTTP request runs on a
    background daemon thread so ``recv`` never blocks the WebRTC frame
    pipeline (the previous inline ``requests.post`` could stall video for up
    to 10 s per caption). ``self.caption`` is shared between the video thread
    and the worker thread and is guarded by a lock.
    """

    def __init__(self, interval_ms: int):
        self.interval = interval_ms / 1000.0  # seconds between caption requests
        self.last_time = 0.0                  # monotonic-ish wall time of last request
        self.caption = "Waiting for caption..."
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self._lock = threading.Lock()         # guards caption / busy across threads
        self._busy = False                    # True while a request is in flight

    def _request_caption(self, jpeg_b64: str):
        """Background worker: ask the VLM server for a caption and store it."""
        payload = {
            "model": MODEL_ALIAS,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You are a precise imageβcaptioning assistant. "
                        "Identify the main subject, their clothing, posture, and environment."
                    ),
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{jpeg_b64}"}},
                        {"type": "text", "text": "Caption this image in one detailed sentence."},
                    ],
                },
            ],
            "temperature": 0.1,
            "max_tokens": 100,
        }
        try:
            r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
            r.raise_for_status()
            text = r.json()["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Surface the failure on-screen instead of crashing the stream.
            text = f"[Error] {e}"
        with self._lock:
            self.caption = text
            self._busy = False

    def recv(self, frame):
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        with self._lock:
            # Fire a new request only when the interval elapsed AND no request
            # is already in flight (avoids piling up slow HTTP calls).
            due = (now - self.last_time >= self.interval) and not self._busy
            if due:
                self.last_time = now
                self._busy = True
            caption = self.caption  # snapshot under the lock
        if due:
            success, buf = cv2.imencode(".jpg", img)
            if success:
                b64 = base64.b64encode(buf).decode("utf-8")
                threading.Thread(target=self._request_caption, args=(b64,), daemon=True).start()
            else:
                with self._lock:  # encoding failed: release the in-flight flag
                    self._busy = False
        # Overlay the most recent caption near the bottom of the frame.
        y = img.shape[0] - 20
        cv2.putText(img, caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return frame.from_ndarray(img, format="bgr24")
# The interval is baked into the component key, so moving the slider remounts
# the streamer with a fresh processor using the new cadence.
webrtc_streamer(
    key=f"caption_{interval_ms}",
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)