Spaces:

GrassData
/

cliptagger-12b

Running on A100

App Files Files Community

andrejrad committed on Aug 18

Commit

dcdd99b

verified ·

1 Parent(s): 1cffa06

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -117

app.py CHANGED Viewed

@@ -1,22 +1,18 @@
-import os, json, re, traceback
 from typing import Any, Dict, Tuple
 import gradio as gr
 from PIL import Image
 import torch
 from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig
-# --------------------------
-# Env / params
-# --------------------------
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
-HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
 TEMP = 0.1
-MAX_NEW_TOKENS = 2000
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-# --------------------------
-# Prompts (yours)
-# --------------------------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -53,55 +49,61 @@ Rules:
 - Output **only the JSON**, no extra text or explanation.
 """
-# --------------------------
-# Utilities
-# --------------------------
def _json_extract(text: str):
    """Parse *text* as JSON, falling back to the first top-level ``{...}`` object.

    The whole string is tried first. If that fails, the text is scanned for
    opening braces that are not nested inside an earlier brace pair, and the
    first one from which a complete JSON object can be decoded is returned.

    Returns the decoded value, or ``None`` when nothing parseable is found.
    """
    # Fast path: the text is already pure JSON.
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # Python's ``re`` has no recursive-pattern support (``(?R)``), so balanced
    # braces cannot be matched with a regex. ``raw_decode`` parses one JSON
    # value starting at a given offset and tolerates trailing text, and unlike
    # plain brace counting it understands braces inside string literals.
    decoder = json.JSONDecoder()
    depth = 0
    for pos, ch in enumerate(text):
        if ch == "{":
            if depth == 0:
                try:
                    obj, _end = decoder.raw_decode(text, pos)
                    return obj
                except (ValueError, RecursionError):
                    # Not valid JSON from here; skip this brace group.
                    pass
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
    return None
def _build_messages(image: Image.Image):
    """Assemble the chat payload: a system turn, then a user turn (image + text)."""
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_content = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_content}
    return [system_turn, user_turn]
def _downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
    """Convert *pil* to RGB and cap its longest side at *max_side* pixels.

    Args:
        pil: The uploaded image, or ``None``.
        max_side: Maximum allowed length of the longer edge.

    Returns:
        ``None`` if *pil* is ``None``; otherwise the RGB-converted image,
        resized with bicubic resampling (aspect ratio kept) when its longest
        side exceeds *max_side*.
    """
    if pil is None:
        return pil
    w, h = pil.size
    longest = max(w, h)
    rgb = pil.convert("RGB")
    if longest <= max_side:
        return rgb
    scale = max_side / longest
    # Clamp to 1 px: for extreme aspect ratios int() would truncate the short
    # side to 0, and resize() rejects a zero-sized target.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return rgb.resize((new_w, new_h), Image.BICUBIC)
-# --------------------------
-# Load model (dedicated GPU)
-# --------------------------
 processor = tokenizer = model = None
 LOAD_ERROR = None
 try:
     cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
     if "clip" in cfg.__class__.__name__.lower():
-        raise RuntimeError(
-            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config; need a causal VLM checkpoint."
-        )
-    # Try quantized path (compressed-tensors) per your config
     try:
         processor = AutoProcessor.from_pretrained(
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
@@ -111,107 +113,105 @@ try:
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True
         )
-    try:
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            token=HF_TOKEN,
-            device_map="auto",
-            torch_dtype=DTYPE,
-            trust_remote_code=True,
-        )
-    except Exception as e:
-        # Fallback: disable quantization if the backend isn't available
-        if "compressed_tensors" in str(e):
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_ID,
-                token=HF_TOKEN,
-                device_map="auto",
-                torch_dtype=DTYPE,
-                trust_remote_code=True,
-                quantization_config=None,
-            )
-        else:
-            raise
     tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
         MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
-# --------------------------
-# Inference
-# --------------------------
def run(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
    """Annotate one image and return ``(output_text, parsed_json, ok)``.

    Args:
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple: the text to display (pretty-printed JSON on success, an
        error or debug message otherwise), the parsed dict or ``None``, and a
        flag that is ``True`` only when a JSON object was extracted.
    """
    if image is None:
        return "Please upload an image.", None, False
    if model is None or processor is None:
        msg = (
            "❌ Model failed to load.\n\n"
            f"{LOAD_ERROR or 'Unknown error.'}\n"
            "Check MODEL_ID/HF_TOKEN and that the repo includes model + processor files."
        )
        return msg, None, False

    image = _downscale_if_huge(image)
    messages = _build_messages(image)

    # Build chat prompt
    if hasattr(processor, "apply_chat_template"):
        prompt = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )
    else:
        # Very rare fallback path: flatten the messages into "ROLE: text" lines.
        parts = []
        for message in messages:
            role = message["role"].upper()
            for chunk in message["content"]:
                if chunk["type"] == "text":
                    parts.append(f"{role}: {chunk['text']}\n")
                elif chunk["type"] == "image":
                    parts.append(f"{role}: [IMAGE]\n")
        prompt = "".join(parts)

    # Tokenize with vision
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    # generate() returns prompt tokens followed by new tokens; remember where
    # the prompt ends so only the model's answer is decoded. This replaces
    # stripping the prompt by substring match on the decoded text.
    prompt_len = inputs["input_ids"].shape[-1]
    # Reuse the tokenizer loaded at startup instead of re-instantiating one
    # from the hub on every decode.
    decoder = processor if hasattr(processor, "decode") else tokenizer

    eos = getattr(model.config, "eos_token_id", None)
    common = {} if eos is None else {"eos_token_id": eos}
    default_kwargs = {"temperature": TEMP, "max_new_tokens": MAX_NEW_TOKENS, **common}

    strategies = [
        # NOTE(review): `response_format` is not a documented generate()
        # argument; this attempt is expected to fail fast and fall through.
        # Confirm whether it can be removed.
        ("json_object", {**default_kwargs, "response_format": {"type": "json_object"}}),
        ("no_response_format", dict(default_kwargs)),
        # Deterministic means greedy decoding; temperature=0.0 is not a valid
        # sampling temperature, so sampling is switched off instead.
        (
            "short_deterministic",
            {"do_sample": False, "max_new_tokens": min(512, MAX_NEW_TOKENS), **common},
        ),
    ]

    tried = []
    for tag, gen_kwargs in strategies:
        try:
            with torch.inference_mode():
                out = model.generate(**inputs, **gen_kwargs)
            text = decoder.decode(out[0][prompt_len:], skip_special_tokens=True)
            parsed = _json_extract(text)
        except Exception as e:  # best-effort: record and try the next strategy
            tried.append((tag, f"err={e}"))
            continue
        if isinstance(parsed, dict):
            return json.dumps(parsed, indent=2), parsed, True
        tried.append((tag, "parsed-failed"))

    # If all strategies failed, return debug info
    return "Generation failed.\nTried: " + "\n".join([f"{t[0]} -> {t[1]}" for t in tried]), None, False
-# --------------------------
-# UI
-# --------------------------
-with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM)") as demo:
     gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · A100)\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
@@ -224,12 +224,8 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe
         with gr.Column(scale=1):
             out_text = gr.Code(label="Output (JSON or error)")
             out_json = gr.JSON(label="Parsed JSON")
-            ok = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
-    def on_click(img):
-        return run(img)
-    btn.click(on_click, inputs=[image], outputs=[out_text, out_json, ok])
 demo.queue(max_size=32).launch()

+import os, json, traceback
 from typing import Any, Dict, Tuple
 import gradio as gr
 from PIL import Image
 import torch
 from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig
+# -------- Env / params --------
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
+HF_TOKEN = os.environ.get("HF_TOKEN")
 TEMP = 0.1
+MAX_NEW_TOKENS = 768  # faster demo; raise later if needed
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+# -------- Prompts (yours) --------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
 - Output **only the JSON**, no extra text or explanation.
 """
+# -------- Utils --------
def extract_top_level_json(s: str):
    """Parse JSON from *s*, tolerating extra text around a ``{...}`` object.

    The whole string is tried first. If that fails, the text is scanned for
    opening braces that are not nested inside an earlier brace pair, and the
    first one from which a complete JSON object can be decoded is returned.

    Returns the decoded value, or ``None`` when nothing parseable is found.
    """
    # Fast path: the text is already pure JSON.
    try:
        return json.loads(s)
    except (ValueError, RecursionError):
        pass

    # Plain brace counting miscounts when a string value contains "{" or "}".
    # raw_decode() parses one JSON value starting at an offset, ignores any
    # trailing text, and handles string literals correctly.
    decoder = json.JSONDecoder()
    depth = 0
    for pos, ch in enumerate(s):
        if ch == "{":
            if depth == 0:
                try:
                    obj, _end = decoder.raw_decode(s, pos)
                    return obj
                except (ValueError, RecursionError):
                    # Not valid JSON from here; skip this brace group and
                    # keep scanning for the next candidate.
                    pass
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
    return None
def build_messages(image: Image.Image):
    """Assemble the chat payload: a system turn, then a user turn (image + text)."""
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_content = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_content}
    return [system_turn, user_turn]
def downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
    """Convert *pil* to RGB and cap its longest side at *max_side* pixels.

    Args:
        pil: The uploaded image, or ``None``.
        max_side: Maximum allowed length of the longer edge.

    Returns:
        ``None`` if *pil* is ``None``; otherwise the RGB-converted image,
        resized with bicubic resampling (aspect ratio kept) when its longest
        side exceeds *max_side*.
    """
    if pil is None:
        return pil
    w, h = pil.size
    longest = max(w, h)
    rgb = pil.convert("RGB")
    if longest <= max_side:
        return rgb
    scale = max_side / longest
    # Clamp to 1 px: for extreme aspect ratios int() would truncate the short
    # side to 0, and resize() rejects a zero-sized target.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return rgb.resize((new_w, new_h), Image.BICUBIC)
+# -------- Load model (A100) --------
 processor = tokenizer = model = None
 LOAD_ERROR = None
 try:
     cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
     if "clip" in cfg.__class__.__name__.lower():
+        raise RuntimeError(f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder repo; need a causal VLM.")
+    print("[boot] loading processor…", flush=True)
     try:
         processor = AutoProcessor.from_pretrained(
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True
         )
+    print("[boot] loading model…", flush=True)
+    # Force full-precision path on A100 first; add quantized path later if desired
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        token=HF_TOKEN,
+        device_map="auto",
+        torch_dtype=DTYPE,
+        trust_remote_code=True,
+        # quantization_config=None,  # keep commented if you want to honor repo quant; uncomment to force dequant
+    )
     tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
         MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
+    print("[boot] ready.", flush=True)
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
+# -------- Inference --------
def generate(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
    """Annotate one image and return ``(output_text, parsed_json, ok)``.

    Up to three decoding strategies are tried in order (greedy, low-temperature
    sampling, shorter greedy); the first one whose output yields a JSON object
    wins.

    Args:
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple: the text to display (pretty-printed JSON on success, an
        error or debug message otherwise), the parsed dict or ``None``, and a
        flag that is ``True`` only when a JSON object was extracted.
    """
    if image is None:
        return "Please upload an image.", None, False
    if model is None or processor is None:
        return f"❌ Load error:\n{LOAD_ERROR}", None, False

    image = downscale_if_huge(image)

    # Build prompt
    if hasattr(processor, "apply_chat_template"):
        prompt = processor.apply_chat_template(
            build_messages(image), add_generation_prompt=True, tokenize=False
        )
    else:
        # fallback join (rare)
        prompt = USER_PROMPT

    # Tokenize with vision
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    # generate() returns prompt tokens followed by new tokens. Decoding the
    # full sequence would hand the echoed prompt (which talks about JSON and
    # may contain braces) to the JSON extractor, so remember where it ends.
    prompt_len = inputs["input_ids"].shape[-1]
    decoder = processor if hasattr(processor, "decode") else tokenizer

    # Common gen kwargs
    eos = getattr(model.config, "eos_token_id", None)
    common = {} if eos is None else {"eos_token_id": eos}

    strategies = (
        # (1) Greedy, no sampling (most stable, no temperature arg)
        ("greedy", {"do_sample": False, "max_new_tokens": MAX_NEW_TOKENS}),
        # (2) Sampling with temperature=0.1
        (
            "sample_t0.1",
            {"do_sample": True, "temperature": TEMP, "max_new_tokens": MAX_NEW_TOKENS},
        ),
        # (3) Shorter greedy
        ("greedy_short", {"do_sample": False, "max_new_tokens": min(512, MAX_NEW_TOKENS)}),
    )

    tried = []
    for tag, gen_kwargs in strategies:
        try:
            with torch.inference_mode():
                out = model.generate(**inputs, **gen_kwargs, **common)
            text = decoder.decode(out[0][prompt_len:], skip_special_tokens=True)
            parsed = extract_top_level_json(text)
        except Exception as e:  # best-effort: record and try the next strategy
            tried.append((tag, f"err={e}"))
            continue
        if isinstance(parsed, dict):
            return json.dumps(parsed, indent=2), parsed, True
        tried.append((tag, "parsed-failed"))

    # Debug info if all fail
    return "Generation failed.\nTried: " + "\n".join([f"{t[0]} -> {t[1]}" for t in tried]), None, False
+# -------- UI --------
+with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM · A100)") as demo:
     gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · A100)\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
         with gr.Column(scale=1):
             out_text = gr.Code(label="Output (JSON or error)")
             out_json = gr.JSON(label="Parsed JSON")
+            ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
+    btn.click(generate, inputs=[image], outputs=[out_text, out_json, ok_flag])
 demo.queue(max_size=32).launch()