Spaces:
Runtime error
Runtime error
| import time | |
| import logging | |
| import gradio as gr | |
| import cv2 | |
| import os | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
| from llama_cpp.llama_chat_format import Llava15ChatHandler | |
| import base64 | |
| import gc | |
# ----------------------------------------
# Model configurations: per-size prefixes and repos


def _describe_checkpoint(stem, model_variants):
    """Build the MODELS entry for the checkpoint whose base name is *stem*.

    The decoder and CLIP repositories, as well as both filename prefixes,
    are all derived from *stem*; only the list of decoder quantisation
    variants differs between checkpoints.
    """
    return {
        "model_repo": f"mradermacher/{stem}-GGUF",
        "clip_repo": f"ggml-org/{stem}-GGUF",
        "model_prefix": stem,
        "clip_prefix": f"mmproj-{stem}",
        "model_variants": list(model_variants),
        # Fresh list per entry so entries never share mutable state.
        "clip_variants": ["Q8_0", "f16"],
    }


# Keyed by the size label shown in the UI; insertion order is the order in
# which the sizes are listed.
MODELS = {
    "256M": _describe_checkpoint(
        "SmolVLM2-256M-Video-Instruct", ("Q2_K", "Q8_0", "f16")
    ),
    "500M": _describe_checkpoint(
        "SmolVLM2-500M-Video-Instruct", ("Q2_K", "Q8_0", "f16")
    ),
    "2.2B": _describe_checkpoint(
        "SmolVLM2-2.2B-Instruct", ("Q2_K", "Q4_K_M", "Q8_0", "f16")
    ),
}
# ----------------------------------------
# Cache for loaded model instance
# One slot each for the size label, the decoder file, the CLIP file and the
# loaded model object; every slot starts out as None (nothing loaded yet).
model_cache = dict.fromkeys(("size", "model_file", "clip_file", "llm"))
# Helper to download & symlink weights
def _fetch_weight(repo_id, filename, label):
    """Make the local path *filename* resolve to that file from *repo_id*.

    Does nothing when *filename* already resolves to an existing file.
    Otherwise the file is downloaded with ``hf_hub_download`` and a symlink
    named *filename* is created pointing at the returned path. *label* is
    only used in the log message ("model" or "CLIP").
    """
    if os.path.exists(filename):
        return
    logging.info("Downloading %s file %s from %s...", label, filename, repo_id)
    cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
    # os.path.exists() follows symlinks, so a link whose target has vanished
    # looks "missing" above yet would make os.symlink() raise FileExistsError.
    if os.path.islink(filename) and not os.path.exists(filename):
        os.remove(filename)
    try:
        os.symlink(cached_path, filename)
    except FileExistsError:
        # Another caller created the link in the meantime; that is fine as
        # long as it now resolves.
        if not os.path.exists(filename):
            raise


def ensure_weights(size, model_file, clip_file):
    """Ensure the decoder and CLIP weight files exist in the working directory.

    Args:
        size: Key into ``MODELS`` selecting the repositories to download from.
        model_file: Decoder GGUF filename (also used as the local link name).
        clip_file: CLIP/mmproj GGUF filename (also used as the local link name).

    Returns:
        The ``(model_file, clip_file)`` pair, unchanged.

    Raises:
        KeyError: If *size* is not a key of ``MODELS``.
    """
    cfg = MODELS[size]
    _fetch_weight(cfg['model_repo'], model_file, "model")
    _fetch_weight(cfg['clip_repo'], clip_file, "CLIP")
    return model_file, clip_file
# Custom chat handler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Chat handler that swaps in a SmolVLM2-style prompt template.

    Only ``CHAT_FORMAT`` is overridden. The template emits, per message, the
    capitalised role followed by ``:`` (no trailing space when a user message
    starts with an image part, ``: `` otherwise), then the message content,
    then ``<end_of_utterance>`` and a newline. Content may be either a plain
    string or a list of ``text`` / ``image_url`` parts.
    """

    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        # Guard the [0] lookup so string or empty content never reaches it.
        "{% if message['role']=='user' and message['content'] is not string"
        " and message['content'] and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        # Plain-string content (e.g. the system prompt) must be emitted as a
        # whole; iterating it part-by-part would walk its characters and
        # render nothing.
        "{% if message['content'] is string %}{{ message['content'] }}"
        "{% elif message['content'] is iterable %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
# Load and cache LLM (only on dropdown change)
def update_llm(size, model_file, clip_file):
    """Load the requested model into ``model_cache`` unless it is already there.

    The cache is keyed on the ``(size, model_file, clip_file)`` triple; when
    the triple matches what is cached, nothing happens. Otherwise the weights
    are fetched via ``ensure_weights``, a new ``Llama`` instance with a
    ``SmolVLM2ChatHandler`` is created, and the cache is overwritten.

    Args:
        size: Key into ``MODELS``.
        model_file: Decoder GGUF filename.
        clip_file: CLIP/mmproj GGUF filename.

    Returns:
        None, so the function can be wired to Gradio events without outputs.
    """
    requested = (size, model_file, clip_file)
    loaded = (model_cache['size'], model_cache['model_file'], model_cache['clip_file'])
    if loaded != requested:
        mf, cf = ensure_weights(size, model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
        # os.cpu_count() may return None; fall back to a single thread
        # instead of letting min() fail on an int/None comparison.
        n_threads = min(2, os.cpu_count() or 1)
        llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024,
                    verbose=False, n_threads=n_threads)
        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
    return None  # no UI output
# Build weight filename lists
def get_weight_files(size):
    """Return the GGUF filenames available for the model size *size*.

    Decoder files are named ``<model_prefix>.<variant>.gguf`` and CLIP files
    ``<clip_prefix>-<variant>.gguf``, using the prefixes and variant lists
    stored under ``MODELS[size]``.

    Returns:
        A ``(decoder_files, clip_files)`` pair of lists, each in the same
        order as the corresponding variant list.
    """
    entry = MODELS[size]
    decoder_files = [
        "{}.{}.gguf".format(entry['model_prefix'], quant)
        for quant in entry['model_variants']
    ]
    projector_files = [
        "{}-{}.gguf".format(entry['clip_prefix'], quant)
        for quant in entry['clip_variants']
    ]
    return decoder_files, projector_files
# Caption using cached llm with real-time debug logs
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
    """Caption one webcam frame with the cached model and report debug timings.

    Args:
        frame: Image array from the webcam stream, or None when no frame is
            available. Assumed to be in RGB/RGBA channel order as delivered
            by Gradio -- TODO confirm if another source is ever wired in.
        size: Unused here; accepted because the stream event passes it.
        model_file: Unused here; accepted because the stream event passes it.
        clip_file: CLIP/mmproj file for the chat handler; only used when the
            cache does not record one.
        interval_ms: Milliseconds to sleep before inference (throttle).
        sys_prompt: System prompt text.
        usr_prompt: User prompt text sent together with the image.

    Returns:
        ``(caption, debug_log)``. The caption is an empty string when the
        frame is skipped (no frame, no model loaded, or JPEG encoding failed).
    """
    debug_msgs = []

    def log(message):
        # Every debug line carries the wall-clock time at which it was added.
        debug_msgs.append(f"[{time.strftime('%H:%M:%S')}] {message}")

    def finish(caption=""):
        return caption, "\n".join(debug_msgs)

    if frame is None:
        log("No frame received; skipping")
        return finish()

    llm = model_cache['llm']
    if llm is None:
        log("No model loaded; skipping frame")
        return finish()

    log(f"Received frame shape: {frame.shape}")

    t_resize = time.time()
    # cv2.resize allocates its own output, so the input needs no copy.
    img = cv2.resize(frame, (384, 384))
    log(f"Resized to 384x384 in {(time.time() - t_resize) * 1000:.1f} ms")

    log(f"Sleeping for {interval_ms} ms")
    time.sleep(interval_ms / 1000)

    t_enc = time.time()
    # cv2.imencode interprets colour images as BGR; convert from RGB(A) so
    # red and blue are not swapped in the JPEG the model sees.
    if img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    elif img.ndim == 3 and img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
    success, jpeg = cv2.imencode('.jpg', img)
    if not success:
        log("JPEG encode failed; skipping frame")
        return finish()
    log(f"JPEG encode: success={success}, bytes={len(jpeg)} in {(time.time() - t_enc) * 1000:.1f} ms")

    uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": uri},
            {"type": "text", "text": usr_prompt}
        ]}
    ]
    log(f"Sending prompt of length {len(usr_prompt)} to LLM")

    # re-init handler for image
    # Prefer the CLIP file the cached model was loaded with, so the handler
    # always matches the loaded decoder even while the dropdowns are changing.
    # NOTE(review): the handler is rebuilt for every frame as in the original;
    # the reason is not visible here -- confirm before removing.
    handler_clip = model_cache['clip_file'] or clip_file
    llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=handler_clip, verbose=False)
    log("Reinitialized chat handler")
    log(f"CPU count = {os.cpu_count()}")

    t_start = time.time()
    resp = llm.create_chat_completion(
        messages=messages,
        max_tokens=128,
        temperature=0.1,
        stop=["<end_of_utterance>"]
    )
    log(f"LLM response in {(time.time() - t_start) * 1000:.1f} ms")

    # Tolerate a missing/empty choices list and a None content field.
    choices = resp.get('choices') or [{}]
    message = choices[0].get('message') or {}
    content = (message.get('content') or '').strip()
    log(f"Caption length: {len(content)} chars")

    gc.collect()
    log("Garbage collected")
    return finish(content)
# Gradio UI
def main():
    """Build the Gradio interface, preload the default model and launch it.

    The page holds three dropdowns (model size, decoder weights, CLIP
    weights), an interval slider, two prompt boxes, the webcam feed and two
    read-only text boxes for the caption and the debug log. Changing any
    dropdown reloads the model through ``update_llm``; every streamed webcam
    frame is passed to ``caption_frame``.
    """
    # NOTE(review): the source's indentation was lost; the nesting below
    # (three dropdowns inside the Row, everything else directly under Blocks)
    # is the most plausible reconstruction -- confirm against the original.
    logging.basicConfig(level=logging.INFO)
    default_size = '2.2B'
    decoder_choices, clip_choices = get_weight_files(default_size)

    with gr.Blocks() as demo:
        gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
        with gr.Row():
            size_dd = gr.Dropdown(list(MODELS), value=default_size, label='Model Size')
            model_dd = gr.Dropdown(decoder_choices, value=decoder_choices[0], label='Decoder Weights')
            clip_dd = gr.Dropdown(clip_choices, value=clip_choices[0], label='CLIP Weights')

        # When size changes: update dropdowns AND preload llm with the new first weights
        def on_size_change(sz):
            decoders, projectors = get_weight_files(sz)
            first_decoder, first_projector = decoders[0], projectors[0]
            # preload with first weights
            update_llm(sz, first_decoder, first_projector)
            # update dropdown choices and default values
            return (
                gr.update(choices=decoders, value=first_decoder),
                gr.update(choices=projectors, value=first_projector),
            )

        selection = [size_dd, model_dd, clip_dd]
        size_dd.change(fn=on_size_change, inputs=[size_dd], outputs=[model_dd, clip_dd])
        model_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=selection, outputs=[])
        clip_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=selection, outputs=[])

        # Load the default model up front so the first frame finds it cached.
        update_llm(default_size, decoder_choices[0], clip_choices[0])

        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
        cap = gr.Textbox(interactive=False, label='Caption')
        log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')

        cam.stream(
            fn=caption_frame,
            inputs=[cam, *selection, interval, sys_p, usr_p],
            outputs=[cap, log_box],
            time_limit=600,
        )

    demo.launch()


if __name__ == '__main__':
    main()