Spaces:

GrassData
/

cliptagger-12b

Running on A100

App Files Community

andrejrad committed on Aug 18

Commit

8d3d460

verified ·

1 Parent(s): dcdd99b

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -80

app.py CHANGED Viewed

@@ -9,10 +9,10 @@ from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, Aut
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 TEMP = 0.1
-MAX_NEW_TOKENS = 768  # faster demo; raise later if needed
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-# -------- Prompts (yours) --------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -27,37 +27,33 @@ Your job is to extract detailed **factual elements directly visible** in the ima
 Return JSON in this structure:
 {
-    "description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
-    "objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
-    "actions": ["action1 with participants and context", "action2 with participants and context", ...],
-    "environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
-    "content_type": "The type of content it is, e.g. 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
-    "specific_style": "Specific genre, aesthetic, or platform style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
-    "production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
-    "summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
-    "logos": ["logo1 with visual description", "logo2 with visual description", ...]
 }
 Rules:
 - Be specific and literal. Focus on what is explicitly visible.
 - Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
 - No artistic or cinematic analysis.
-- Always include the language of any text in the image if present as an object, e.g. "English text", "Japanese text", "Russian text", etc.
 - Maximum 10 objects and 5 actions.
-- Return an empty array for 'logos' if none are present.
-- Always output strictly valid JSON with proper escaping.
-- Output **only the JSON**, no extra text or explanation.
 """
 # -------- Utils --------
 def extract_top_level_json(s: str):
-    """Parse JSON; if extra text around it, extract the first balanced {...} block."""
-    # Fast path
     try:
         return json.loads(s)
     except Exception:
         pass
-    # Brace-stack extraction
     start = None
     depth = 0
     for i, ch in enumerate(s):
@@ -73,11 +69,10 @@ def extract_top_level_json(s: str):
                     try:
                         return json.loads(chunk)
                     except Exception:
-                        # continue scanning for the next candidate
                         start = None
     return None
-def build_messages(image: Image.Image):
     return [
         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
         {"role": "user",   "content": [{"type": "image", "image": image},
@@ -85,8 +80,7 @@ def build_messages(image: Image.Image):
     ]
 def downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
-    if pil is None:
-        return pil
     w, h = pil.size
     m = max(w, h)
     if m <= max_side:
@@ -94,41 +88,26 @@ def downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
     s = max_side / m
     return pil.convert("RGB").resize((int(w*s), int(h*s)), Image.BICUBIC)
-# -------- Load model (A100) --------
 processor = tokenizer = model = None
 LOAD_ERROR = None
 try:
     cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
     if "clip" in cfg.__class__.__name__.lower():
         raise RuntimeError(f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder repo; need a causal VLM.")
-    print("[boot] loading processor…", flush=True)
-    try:
-        processor = AutoProcessor.from_pretrained(
-            MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-        )
-    except TypeError:
-        processor = AutoProcessor.from_pretrained(
-            MODEL_ID, token=HF_TOKEN, trust_remote_code=True
-        )
-    print("[boot] loading model…", flush=True)
-    # Force full-precision path on A100 first; add quantized path later if desired
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN,
         device_map="auto",
         torch_dtype=DTYPE,
         trust_remote_code=True,
-        # quantization_config=None,  # keep commented if you want to honor repo quant; uncomment to force dequant
     )
     tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
         MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
-    print("[boot] ready.", flush=True)
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
@@ -141,82 +120,54 @@ def generate(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
     image = downscale_if_huge(image)
-    # Build prompt
     if hasattr(processor, "apply_chat_template"):
         prompt = processor.apply_chat_template(build_messages(image), add_generation_prompt=True, tokenize=False)
     else:
-        # fallback join (rare)
         prompt = USER_PROMPT
-    # Tokenize with vision
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
-    # Common gen kwargs
-    eos = getattr(model.config, "eos_token_id", None)
     tried = []
-    # (1) Greedy, no sampling (most stable, no temperature arg)
     try:
         g = dict(do_sample=False, max_new_tokens=MAX_NEW_TOKENS)
         if eos is not None:
             g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
-        text = (processor.decode(out[0], skip_special_tokens=True)
-                if hasattr(processor, "decode")
-                else tokenizer.decode(out[0], skip_special_tokens=True))
         parsed = extract_top_level_json(text)
         if isinstance(parsed, dict):
             return json.dumps(parsed, indent=2), parsed, True
-        tried.append(("greedy", "parsed-failed"))
     except Exception as e:
         tried.append(("greedy", f"err={e}"))
-    # (2) Sampling with temperature=0.1
     try:
         g = dict(do_sample=True, temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
         if eos is not None:
             g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
-        text = (processor.decode(out[0], skip_special_tokens=True)
-                if hasattr(processor, "decode")
-                else tokenizer.decode(out[0], skip_special_tokens=True))
         parsed = extract_top_level_json(text)
         if isinstance(parsed, dict):
             return json.dumps(parsed, indent=2), parsed, True
-        tried.append(("sample_t0.1", "parsed-failed"))
     except Exception as e:
-        tried.append(("sample_t0.1", f"err={e}"))
-    # (3) Shorter greedy
-    try:
-        g = dict(do_sample=False, max_new_tokens=min(512, MAX_NEW_TOKENS))
-        if eos is not None:
-            g["eos_token_id"] = eos
-        with torch.inference_mode():
-            out = model.generate(**inputs, **g)
-        text = (processor.decode(out[0], skip_special_tokens=True)
-                if hasattr(processor, "decode")
-                else tokenizer.decode(out[0], skip_special_tokens=True))
-        parsed = extract_top_level_json(text)
-        if isinstance(parsed, dict):
-            return json.dumps(parsed, indent=2), parsed, True
-        tried.append(("greedy_short", "parsed-failed"))
-    except Exception as e:
-        tried.append(("greedy_short", f"err={e}"))
-    # Debug info if all fail
-    return "Generation failed.\nTried: " + "\n".join([f"{t[0]} -> {t[1]}" for t in tried]), None, False
 # -------- UI --------
-with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM · A100)") as demo:
-    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · A100)\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
             gr.Markdown(f"```\n{LOAD_ERROR}\n```")
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")

 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 TEMP = 0.1
+MAX_NEW_TOKENS = 768  # safe for demo; raise if needed
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+# -------- Prompts --------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
 Return JSON in this structure:
 {
+    "description": "...",
+    "objects": ["..."],
+    "actions": ["..."],
+    "environment": "...",
+    "content_type": "...",
+    "specific_style": "...",
+    "production_quality": "...",
+    "summary": "...",
+    "logos": ["..."]
 }
 Rules:
 - Be specific and literal. Focus on what is explicitly visible.
 - Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
 - No artistic or cinematic analysis.
+- Always include the language of any text in the image if present as an object.
 - Maximum 10 objects and 5 actions.
+- Return [] for 'logos' if none are present.
+- Strictly valid JSON only.
 """
 # -------- Utils --------
 def extract_top_level_json(s: str):
     try:
         return json.loads(s)
     except Exception:
         pass
     start = None
     depth = 0
     for i, ch in enumerate(s):
                     try:
                         return json.loads(chunk)
                     except Exception:
                         start = None
     return None
+def build_messages(image):
     return [
         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
         {"role": "user",   "content": [{"type": "image", "image": image},
     ]
 def downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
+    if pil is None: return pil
     w, h = pil.size
     m = max(w, h)
     if m <= max_side:
     s = max_side / m
     return pil.convert("RGB").resize((int(w*s), int(h*s)), Image.BICUBIC)
+# -------- Load model --------
 processor = tokenizer = model = None
 LOAD_ERROR = None
 try:
     cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
     if "clip" in cfg.__class__.__name__.lower():
         raise RuntimeError(f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder repo; need a causal VLM.")
+    processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN,
         device_map="auto",
         torch_dtype=DTYPE,
         trust_remote_code=True,
+        # quantization_config=None,  # uncomment if you want to force full precision
     )
     tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
         MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
     image = downscale_if_huge(image)
     if hasattr(processor, "apply_chat_template"):
         prompt = processor.apply_chat_template(build_messages(image), add_generation_prompt=True, tokenize=False)
     else:
         prompt = USER_PROMPT
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
     tried = []
+    # (1) Greedy
     try:
         g = dict(do_sample=False, max_new_tokens=MAX_NEW_TOKENS)
+        eos = getattr(model.config, "eos_token_id", None)
         if eos is not None:
             g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
+        text = processor.decode(out[0], skip_special_tokens=True)
         parsed = extract_top_level_json(text)
         if isinstance(parsed, dict):
             return json.dumps(parsed, indent=2), parsed, True
+        tried.append(("greedy", "parse-failed"))
     except Exception as e:
         tried.append(("greedy", f"err={e}"))
+    # (2) Sampling
     try:
         g = dict(do_sample=True, temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
+        eos = getattr(model.config, "eos_token_id", None)
         if eos is not None:
             g["eos_token_id"] = eos
         with torch.inference_mode():
             out = model.generate(**inputs, **g)
+        text = processor.decode(out[0], skip_special_tokens=True)
         parsed = extract_top_level_json(text)
         if isinstance(parsed, dict):
             return json.dumps(parsed, indent=2), parsed, True
+        tried.append(("sample", "parse-failed"))
     except Exception as e:
+        tried.append(("sample", f"err={e}"))
+    return "Generation failed.\n" + str(tried), None, False
 # -------- UI --------
+with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="ClipTagger (VLM)") as demo:
+    gr.Markdown("# ClipTagger\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
             gr.Markdown(f"```\n{LOAD_ERROR}\n```")
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")