"""Real-time webcam captioning with SmolVLM2 (GGUF, CPU) via llama-cpp-python + Gradio."""

import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
from termcolor import cprint  # NOTE(review): unused in this view; kept in case other code relies on it


# —————————————————————————————————————————
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Llava15-style chat handler carrying SmolVLM2's prompt template."""

    # Jinja2 template: "<|im_start|>Role: <content>\n" per message; a user
    # message whose first part is an image omits the space after the colon;
    # ends with "Assistant:" when a generation prompt is requested.
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )


# —————————————————————————————————————————
# 2) Model & CLIP files — download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"


def ensure_models():
    """Download model + CLIP GGUF files into the HF cache and symlink them locally.

    NOTE(review): ``os.symlink`` needs special privileges on Windows; consider
    ``shutil.copy`` or using the returned cache path directly if that matters.
    """
    if not os.path.exists(MODEL_FILE):
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
    if not os.path.exists(CLIP_FILE):
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)


ensure_models()


def load_llm():
    """Build the Llama instance wired to the SmolVLM2 chat handler."""
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    return Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,  # large context to accommodate image-embedding tokens
        verbose=False,
    )


llm = load_llm()


# —————————————————————————————————————————
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    """Caption a single webcam frame; returns the model's text description.

    Parameters:
        frame: RGB ndarray from Gradio's webcam stream (may be None at startup).
    Returns:
        The stripped caption string, or "" if no frame / no model output.
    """
    if frame is None:  # webcam can emit None before the first real frame
        return ""

    # Gradio streams RGB ndarrays while cv2.imwrite expects BGR — convert so
    # the model sees correct colors (fix: original wrote swapped channels).
    bgr = cv2.cvtColor(frame.copy(), cv2.COLOR_RGB2BGR)

    # Persist the frame so it can be referenced as a file:// URI. delete=False
    # keeps the file alive until AFTER the completion call — a plain
    # `with NamedTemporaryFile()` would unlink it before the model reads it,
    # and the open handle cannot be reopened by cv2 on Windows anyway.
    tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
    try:
        tmp.close()
        cv2.imwrite(tmp.name, bgr)
        uri = Path(tmp.name).absolute().as_uri()

        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]

        # Stateless completion: fresh handler + cleared KV cache per frame so
        # no context bleeds between captions.
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear cached key/values (private API — may break on upgrade)

        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            # NOTE(review): an empty-string stop looks like a markdown-stripped
            # token (possibly "<end_of_utterance>") — confirm against the model card.
            stop=[""],
        )
    finally:
        os.unlink(tmp.name)  # always remove the temp image, even on error

    # Defensive extraction: tolerate a missing/empty choices list instead of
    # raising IndexError on an unexpected response shape.
    choices = resp.get("choices") or []
    if not choices:
        return ""
    return (choices[0].get("message", {}).get("content") or "").strip()


# —————————————————————————————————————————
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")
    # Stream webcam frames into the captioner every 3 s, for up to 10 minutes.
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    demo.launch()