import logging
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# ─────────────────────────────────────────
# 1) Inline definition of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
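
# The Jinja template above renders each turn as "Role: <text / image URI>"
# terminated by <end_of_utterance>, matching the conversation format the
# SmolVLM2 instruct checkpoints expect.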

# ─────────────────────────────────────────
# 2) Model & CLIP files: download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
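# NOTE: the quantized language weights and the mmproj (vision projector) file
# come from two different community repos; hf_hub_download requires these
# exact filenames to exist in each repo.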


def ensure_models():
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")
    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")


ensure_models()
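
# ─────────────────────────────────────────
# 3) Load the model and chat handler once at startup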
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm


llm = load_llm()
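# The module-level load means every Gradio request reuses the same in-memory
# model; only the chat handler is rebuilt per frame (see caption_frame below).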


# ─────────────────────────────────────────
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # Make a writable copy and downscale to the model's input resolution.
    frame = frame.copy()
    frame = cv2.resize(frame, (384, 384))
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # Gradio delivers RGB frames, but cv2.imwrite expects BGR.
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # Save the frame to a temporary file so it can be referenced as a file://
    # URI. The completion call must stay inside this block: the file is
    # deleted as soon as the context manager exits.
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        if not cv2.imwrite(f.name, frame):
            logging.error(f"Failed to write frame to {f.name}")
            return ""
        logging.debug(f"Frame written to temp file: {f.name}")

        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # Build a single self-contained prompt.
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # Stateless completion: swap in a fresh chat handler and clear all
        # cached state so each frame is captioned independently.
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values

        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
    logging.debug(f"LLM raw response: {resp}")

    # Extract the caption text.
    caption = (resp["choices"][0]["message"].get("content") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption
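
# NOTE: kv_cache_clear() is reached through llm._ctx, a private
# llama-cpp-python attribute; if a future release renames it, falling back to
# llm.reset() alone is the conservative fix.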


# ─────────────────────────────────────────
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # Stream webcam frames to the captioner: one frame every 3 s (roughly the
    # CPU inference latency), with each streaming session capped at 600 s.
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()