Spaces:
Runtime error
Runtime error
add verbose mode switch
Browse files
app.py
CHANGED
|
@@ -45,6 +45,7 @@ model_cache = {
|
|
| 45 |
'size': None,
|
| 46 |
'model_file': None,
|
| 47 |
'clip_file': None,
|
|
|
|
| 48 |
'llm': None
|
| 49 |
}
|
| 50 |
|
|
@@ -86,19 +87,22 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
|
|
| 86 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
| 87 |
)
|
| 88 |
|
| 89 |
-
# Load and cache LLM (only on dropdown change)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
|
| 93 |
mf, cf = ensure_weights(size, model_file, clip_file)
|
| 94 |
-
handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=
|
| 95 |
-
llm = Llama(
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Build weight filename lists
|
| 101 |
-
|
| 102 |
def get_weight_files(size):
|
| 103 |
cfg = MODELS[size]
|
| 104 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
|
@@ -106,14 +110,13 @@ def get_weight_files(size):
|
|
| 106 |
return model_files, clip_files
|
| 107 |
|
| 108 |
# Caption using cached llm with real-time debug logs
|
| 109 |
-
|
| 110 |
-
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt,
|
| 111 |
-
reset_clip: bool):
|
| 112 |
debug_msgs = []
|
| 113 |
timestamp = time.strftime('%H:%M:%S')
|
|
|
|
|
|
|
| 114 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
| 115 |
|
| 116 |
-
# show which weight files we’re using this run
|
| 117 |
timestamp = time.strftime('%H:%M:%S')
|
| 118 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
| 119 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
|
@@ -145,9 +148,8 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
| 145 |
|
| 146 |
timestamp = time.strftime('%H:%M:%S')
|
| 147 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
| 148 |
-
# re-init handler for image
|
| 149 |
if reset_clip:
|
| 150 |
-
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=
|
| 151 |
timestamp = time.strftime('%H:%M:%S')
|
| 152 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
| 153 |
|
|
@@ -176,10 +178,10 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
| 176 |
return content, "\n".join(debug_msgs)
|
| 177 |
|
| 178 |
# Gradio UI
|
| 179 |
-
|
| 180 |
def main():
|
| 181 |
logging.basicConfig(level=logging.INFO)
|
| 182 |
default = '2.2B'
|
|
|
|
| 183 |
mf, cf = get_weight_files(default)
|
| 184 |
|
| 185 |
with gr.Blocks() as demo:
|
|
@@ -188,38 +190,46 @@ def main():
|
|
| 188 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
| 189 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
| 190 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
def on_size_change(sz):
|
| 194 |
mlist, clist = get_weight_files(sz)
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
gr.update(choices=clist, value=clist[0])
|
| 199 |
-
)
|
| 200 |
-
# preload with first weights
|
| 201 |
-
update_llm(sz, mlist[0], clist[0])
|
| 202 |
-
return update_ui
|
| 203 |
size_dd.change(
|
| 204 |
fn=on_size_change,
|
| 205 |
-
inputs=[size_dd],
|
| 206 |
outputs=[model_dd, clip_dd]
|
| 207 |
)
|
| 208 |
-
model_dd.change(
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
interval
|
| 213 |
-
sys_p
|
| 214 |
-
usr_p
|
| 215 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
| 216 |
-
cam
|
| 217 |
-
cap
|
| 218 |
-
log_box
|
| 219 |
|
| 220 |
cam.stream(
|
| 221 |
fn=caption_frame,
|
| 222 |
-
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip],
|
| 223 |
outputs=[cap, log_box],
|
| 224 |
time_limit=600,
|
| 225 |
)
|
|
|
|
| 45 |
'size': None,
|
| 46 |
'model_file': None,
|
| 47 |
'clip_file': None,
|
| 48 |
+
'verbose': None,
|
| 49 |
'llm': None
|
| 50 |
}
|
| 51 |
|
|
|
|
| 87 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
| 88 |
)
|
| 89 |
|
# Load and cache LLM (only on dropdown or verbose change)
def update_llm(size, model_file, clip_file, verbose_mode):
    """Ensure the cached Llama instance matches the requested configuration.

    Rebuilds the chat handler and reloads weights only when the
    (size, model_file, clip_file, verbose_mode) tuple differs from what is
    already in ``model_cache``, so repeated UI events with unchanged
    settings are cheap no-ops.

    Args:
        size: Model size key (e.g. '2.2B') used to resolve weight files.
        model_file: Decoder GGUF weight filename.
        clip_file: CLIP GGUF weight filename.
        verbose_mode: Verbosity flag forwarded to both the chat handler
            and the Llama constructor.

    Returns:
        None. The loaded model is stored in the module-level ``model_cache``.
    """
    requested = (size, model_file, clip_file, verbose_mode)
    cached = (model_cache['size'], model_cache['model_file'],
              model_cache['clip_file'], model_cache['verbose'])
    if cached != requested:
        mf, cf = ensure_weights(size, model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
        llm = Llama(
            model_path=mf,
            chat_handler=handler,
            n_ctx=8192,
            verbose=verbose_mode,
            # os.cpu_count() may return None on some platforms;
            # max(2, None) would raise TypeError, so fall back to 2.
            n_threads=max(2, os.cpu_count() or 2),
        )
        model_cache.update({
            'size': size,
            'model_file': mf,
            'clip_file': cf,
            'verbose': verbose_mode,
            'llm': llm,
        })
    return None
| 104 |
|
| 105 |
# Build weight filename lists
|
|
|
|
| 106 |
def get_weight_files(size):
|
| 107 |
cfg = MODELS[size]
|
| 108 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
|
|
|
| 110 |
return model_files, clip_files
|
| 111 |
|
| 112 |
# Caption using cached llm with real-time debug logs
|
| 113 |
+
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt, reset_clip, verbose_mode):
|
|
|
|
|
|
|
| 114 |
debug_msgs = []
|
| 115 |
timestamp = time.strftime('%H:%M:%S')
|
| 116 |
+
debug_msgs.append(f"[{timestamp}] Verbose mode: {verbose_mode}")
|
| 117 |
+
timestamp = time.strftime('%H:%M:%S')
|
| 118 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
| 119 |
|
|
|
|
| 120 |
timestamp = time.strftime('%H:%M:%S')
|
| 121 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
| 122 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
|
|
|
| 148 |
|
| 149 |
timestamp = time.strftime('%H:%M:%S')
|
| 150 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
|
|
|
| 151 |
if reset_clip:
|
| 152 |
+
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=verbose_mode)
|
| 153 |
timestamp = time.strftime('%H:%M:%S')
|
| 154 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
| 155 |
|
|
|
|
| 178 |
return content, "\n".join(debug_msgs)
|
| 179 |
|
| 180 |
# Gradio UI
|
|
|
|
| 181 |
def main():
|
| 182 |
logging.basicConfig(level=logging.INFO)
|
| 183 |
default = '2.2B'
|
| 184 |
+
default_verbose = False
|
| 185 |
mf, cf = get_weight_files(default)
|
| 186 |
|
| 187 |
with gr.Blocks() as demo:
|
|
|
|
| 190 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
| 191 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
| 192 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
| 193 |
+
verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
|
| 194 |
|
| 195 |
+
def on_size_change(sz, verbose):
|
|
|
|
| 196 |
mlist, clist = get_weight_files(sz)
|
| 197 |
+
update_llm(sz, mlist[0], clist[0], verbose)
|
| 198 |
+
return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
|
| 199 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
size_dd.change(
|
| 201 |
fn=on_size_change,
|
| 202 |
+
inputs=[size_dd, verbose_cb],
|
| 203 |
outputs=[model_dd, clip_dd]
|
| 204 |
)
|
| 205 |
+
model_dd.change(
|
| 206 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
| 207 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
| 208 |
+
outputs=[]
|
| 209 |
+
)
|
| 210 |
+
clip_dd.change(
|
| 211 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
| 212 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
| 213 |
+
outputs=[]
|
| 214 |
+
)
|
| 215 |
+
verbose_cb.change(
|
| 216 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
| 217 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
| 218 |
+
outputs=[]
|
| 219 |
+
)
|
| 220 |
+
update_llm(default, mf[0], cf[0], default_verbose)
|
| 221 |
|
| 222 |
+
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
|
| 223 |
+
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
|
| 224 |
+
usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
|
| 225 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
| 226 |
+
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
|
| 227 |
+
cap = gr.Textbox(interactive=False, label='Caption')
|
| 228 |
+
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
|
| 229 |
|
| 230 |
cam.stream(
|
| 231 |
fn=caption_frame,
|
| 232 |
+
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
|
| 233 |
outputs=[cap, log_box],
|
| 234 |
time_limit=600,
|
| 235 |
)
|