vishalvaka committed on
Commit
aaf7fac
·
1 Parent(s): 4913f36

made some fixes for CRF models

Browse files
Files changed (2) hide show
  1. .gitignore +2 -0
  2. app.py +104 -44
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .venv/*
2
+ .venv
app.py CHANGED
@@ -1,59 +1,120 @@
1
- import gradio as gr, torch
 
 
 
 
 
 
 
2
  from transformers import AutoTokenizer, AutoModelForTokenClassification
3
  from peft import PeftModel
 
 
 
 
 
4
 
5
- # --------------------------------------------------------
6
  BASE = "dmis-lab/biobert-base-cased-v1.2"
7
- REPO = "vishalvaka/biobert-finetuned-ner"
8
-
9
- VARIANTS = {
10
- "Full fine-tune" : ("full", "full"),
11
- "LoRA-r32" : ("lora-r32", "lora"),
12
- "LoRA-r32-fast" : ("lora-r32-fast", "lora"),
13
- "LoRA-r16-CRF" : ("lora-r16-crf", "lora"),
14
- "LoRA-r16-CRF-long" : ("lora-r16-crf-long", "lora"),
15
  }
16
- # (folder_name, loader_type)
17
 
18
- @gr.memoize()
19
- def load_model(folder, mode):
 
 
 
 
 
20
  """
21
- folder = sub-directory inside the model repo
22
- mode = "full" | "lora"
 
 
23
  """
24
  if mode == "full":
25
  model = AutoModelForTokenClassification.from_pretrained(
26
  REPO, subfolder=folder
27
  )
28
  tok = AutoTokenizer.from_pretrained(REPO, subfolder=folder)
29
- else: # LoRA – reattach to BASE
30
- base = AutoModelForTokenClassification.from_pretrained(BASE)
31
- model = PeftModel.from_pretrained(base, REPO, subfolder=folder)
32
- tok = AutoTokenizer.from_pretrained(BASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return tok, model.eval()
34
 
35
- def ner(text, variant):
36
- folder, mode = VARIANTS[variant]
37
- tok, model = load_model(folder, mode)
 
 
38
 
39
- tokens = tok(text, return_tensors="pt")
40
- with torch.no_grad():
41
- logits = model(**tokens).logits.squeeze(0)
42
- ids = logits.argmax(dim=-1).tolist()[1:-1] # drop CLS/SEP
43
- words = tok.convert_ids_to_tokens(tokens["input_ids"][0])[1:-1]
44
-
45
- # colours: B-Chemical / B-Disease / I-* already encoded in labels
46
- label_map = model.config.id2label
47
- html = ""
48
- for w, i in zip(words, ids):
49
- lab = label_map[i]
50
- if lab == "O":
51
- html += " " + w
52
  else:
53
- entity = lab.split("-")[-1] # Chemical | Disease
54
- html += f' <span class="{entity}">{w}</span>'
55
- return html.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
 
57
  CSS = """
58
  span.Chemical {background:#ffddff; padding:2px 4px; border-radius:4px}
59
  span.Disease {background:#ffdddd; padding:2px 4px; border-radius:4px}
@@ -67,14 +128,13 @@ demo = gr.Interface(
67
  ],
68
  outputs=gr.HTML(label="Tagged output"),
69
  examples=[
70
- ["Airway inflammation was reduced by salbutamol in cystic fibrosis patients."],
71
  ],
72
  css=CSS,
73
- title="Biomedical NER – full vs. LoRA",
74
- description=(
75
- "Compare a full fine-tune with several parameter-efficient LoRA/CRF adapters.\n"
76
- "Entities are highlighted inline. Pick a variant and paste any PubMed-style text."
77
- ),
78
  )
79
 
80
  if __name__ == "__main__":
 
1
+ # app.py ── Biomedical NER demo (full vs. LoRA/CRF)
2
+ # --------------------------------------------------
3
+ from __future__ import annotations
4
+ import html, logging, warnings
5
+ from functools import lru_cache
6
+
7
+ import gradio as gr
8
+ import torch
9
  from transformers import AutoTokenizer, AutoModelForTokenClassification
10
  from peft import PeftModel
11
+ from huggingface_hub import hf_hub_download # ← missing import added
12
+
13
+ # ─────────── silence library warnings ───────────
14
+ warnings.filterwarnings("ignore", category=UserWarning, module="peft")
15
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
16
 
17
+ # ─────────── constants ───────────
18
  BASE = "dmis-lab/biobert-base-cased-v1.2"
19
+ REPO = "vishalvaka/biobert-finetuned-ner" # one repo; variants in sub-folders
20
+
21
+ VARIANTS: dict[str, tuple[str, str]] = {
22
+ "Full fine-tune" : ("full", "full"),
23
+ "LoRA-r32" : ("lora-r32", "lora"),
24
+ "LoRA-r32-fast" : ("lora-r32-fast", "lora"),
25
+ "LoRA-r16-CRF" : ("lora-r16-crf", "lora"),
26
+ "LoRA-r16-CRF-long" : ("lora-r16-crf-long", "lora"),
27
  }
 
28
 
29
+ LABELS = ["O", "B-Chemical", "I-Chemical", "B-Disease", "I-Disease"]
30
+ id2label = {i: lab for i, lab in enumerate(LABELS)}
31
+ label2id = {lab: i for i, lab in id2label.items()}
32
+
33
+ # ─────────── model loader (cached) ───────────
34
+ @lru_cache(maxsize=None)
35
+ def load_model(folder: str, mode: str):
36
  """
37
+ Returns (tokenizer, model) cached per variant.
38
+
39
+ • mode == "full" → load full checkpoint from sub-folder
40
+ • mode == "lora" → load BASE + LoRA adapter from sub-folder
41
  """
42
  if mode == "full":
43
  model = AutoModelForTokenClassification.from_pretrained(
44
  REPO, subfolder=folder
45
  )
46
  tok = AutoTokenizer.from_pretrained(REPO, subfolder=folder)
47
+ # ensure human-readable label maps
48
+ model.config.id2label, model.config.label2id = id2label, label2id
49
+ return tok, model.eval()
50
+
51
+ # ---------- LoRA (with or without CRF) ----------
52
+ base = AutoModelForTokenClassification.from_pretrained(
53
+ BASE, num_labels=len(LABELS), id2label=id2label, label2id=label2id
54
+ )
55
+ model = PeftModel.from_pretrained(base, REPO, subfolder=folder)
56
+ tok = AutoTokenizer.from_pretrained(BASE)
57
+
58
+ # attach CRF/classifier weights **iff the file exists**
59
+ try:
60
+ if "crf" in folder: # only these have it
61
+ head_path = hf_hub_download(REPO,
62
+ "non_encoder_head.pth",
63
+ subfolder=folder,
64
+ repo_type="model")
65
+ extra = torch.load(head_path, map_location="cpu")
66
+ model.load_state_dict(extra, strict=False)
67
+ except Exception as e:
68
+ warnings.warn(f"[{folder}] couldn’t load CRF head: {e}")
69
+
70
  return tok, model.eval()
71
 
72
+ # ─────────── helper to build HTML output ───────────
73
+ def build_html(tokens: list[str], labels: list[str]) -> str:
74
+ """Merge WordPieces and contiguous I-tokens."""
75
+ segments: list[tuple[str | None, str]] = [] # (entity_tag, text)
76
+ cur_tag, buf = None, ""
77
 
78
+ for tok, lab in zip(tokens, labels):
79
+ tag = None if lab == "O" else lab.split("-")[-1] # Chemical / Disease
80
+ text = tok[2:] if tok.startswith("##") else tok # drop ##
81
+ continuation = tok.startswith("##") or lab.startswith("I-")
82
+
83
+ if tag == cur_tag and continuation:
84
+ buf += text
 
 
 
 
 
 
85
  else:
86
+ if buf:
87
+ segments.append((cur_tag, buf))
88
+ buf, cur_tag = text, tag
89
+ if buf:
90
+ segments.append((cur_tag, buf))
91
+
92
+ html_out, first = "", True
93
+ for tag, chunk in segments:
94
+ spacer = "" if first else " "
95
+ first = False
96
+ chunk = html.escape(chunk)
97
+ html_out += spacer + (
98
+ chunk if tag is None else f'<span class="{tag}">{chunk}</span>'
99
+ )
100
+ return html_out
101
+
102
+ # ─────────── Gradio inference fn ───────────
103
+ def ner(text: str, variant: str):
104
+ folder, mode = VARIANTS[variant]
105
+ tok, model = load_model(folder, mode)
106
+
107
+ enc = tok(text, return_tensors="pt")
108
+ with torch.no_grad():
109
+ logits = model(**enc).logits.squeeze(0)
110
+
111
+ ids = logits.argmax(dim=-1).tolist()[1:-1] # drop CLS/SEP
112
+ tokens = tok.convert_ids_to_tokens(enc["input_ids"][0])[1:-1]
113
+ labels = [model.config.id2label[i] for i in ids]
114
+
115
+ return build_html(tokens, labels)
116
 
117
+ # ─────────── UI definition ───────────
118
  CSS = """
119
  span.Chemical {background:#ffddff; padding:2px 4px; border-radius:4px}
120
  span.Disease {background:#ffdddd; padding:2px 4px; border-radius:4px}
 
128
  ],
129
  outputs=gr.HTML(label="Tagged output"),
130
  examples=[
131
+ ["Intravenous administration of infliximab significantly reduced C-reactive protein levels and improved remission rates in Crohn’s disease patients."],
132
  ],
133
  css=CSS,
134
+ theme=gr.themes.Soft(), # quiet built-in theme, no 404
135
+ title="Biomedical NER — Full vs. LoRA / CRF",
136
+ description="Toggle a variant and watch **Chemical** / **Disease** entities light up. "
137
+ "Full checkpoints for CRF models, compact adapters for LoRA runs.",
 
138
  )
139
 
140
  if __name__ == "__main__":