Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Community

Suzana committed on May 28

Commit

f3b49b2

verified ·

1 Parent(s): 1736c22

Create app.py

Browse files

Files changed (1) hide show

app.py +123 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import gradio as gr
+import pandas as pd
+from pathlib import Path
+# Global token storage
+token_df = pd.DataFrame()
+# Generate generic sample sentences
def make_sample_data(n=100):
    """Return a DataFrame of ``n`` templated demo sentences.

    Each sentence has the shape ``"<person> <verb> <org> in <place>."`` and is
    stored in a single ``text`` column. The word lists are walked round-robin
    by row index, so the output is fully deterministic.
    """
    subjects = ("Alice", "Bob", "Charlie", "Diane", "Eve")
    companies = ("Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc")
    places = ("Paris", "New York", "London", "Tokyo", "Sydney")
    actions = ("visited", "joined", "founded", "traveled to", "met with")

    def sentence(idx):
        # Every list is indexed modulo its own length, so the row index alone
        # determines the sentence.
        who = subjects[idx % len(subjects)]
        action = actions[idx % len(actions)]
        company = companies[idx % len(companies)]
        place = places[idx % len(places)]
        return f"{who} {action} {company} in {place}."

    return pd.DataFrame([{"text": sentence(idx)} for idx in range(n)])
def load_data(file):
    """Load sentences, split them into tokens and publish the token table.

    Parameters
    ----------
    file
        The value of the upload widget. A falsy value selects the built-in
        sample sentences. Otherwise the CSV path is taken from ``file.name``
        when that attribute exists, or from ``file`` itself (a plain path).

    Returns
    -------
    tuple
        ``(table_update, status_message, actions_update)``, in the order the
        click handler's outputs are wired. On a CSV without a ``text`` column
        both updates hide their component and the global ``token_df`` is left
        untouched.

    Side effects
    ------------
    Rebinds the module-level ``token_df`` to the freshly tokenised table.
    """
    global token_df
    # Load user CSV or fall back to the sample data.
    if file:
        # NOTE(review): the upload may arrive as an object exposing ``.name``
        # or as a bare path string depending on the Gradio version/config;
        # accept both.
        df = pd.read_csv(getattr(file, "name", file))
    else:
        df = make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Tokenize into (sentence_id, token, label).
    records = []
    for sid, txt in enumerate(df["text"]):
        # Empty CSV cells are read as NaN (a float) and would crash on
        # ``.split()``. Skip them, but keep ``sid`` aligned with the row index.
        if pd.isna(txt):
            continue
        # ``str()`` lets numeric cells be tokenised instead of raising.
        for tok in str(txt).split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})
    # Explicit columns keep the schema intact even when no token was produced.
    token_df = pd.DataFrame(records, columns=["sentence_id", "token", "label"])
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True)
    )
def save_edits(table):
    """Store the edited annotation table in the global ``token_df``.

    ``table`` is whatever the annotation grid passes in; it is rebuilt as a
    DataFrame restricted to the three expected columns. Returns the status
    message to display.
    """
    global token_df
    expected_columns = ["sentence_id", "token", "label"]
    token_df = pd.DataFrame(table, columns=expected_columns)
    confirmation = "💾 Edits saved."
    return confirmation
def download_tokens():
    """Write the global ``token_df`` to ``raw_tokens.csv`` and return its path.

    The file is written relative to the current working directory, without
    the index column.
    """
    destination = Path("raw_tokens.csv")
    token_df.to_csv(destination, index=False)
    return destination
def _clean_label(raw):
    """Return a stripped label string; missing, blank or ``o``/``O`` cells become ``"O"``."""
    if pd.isna(raw):
        return "O"
    text = str(raw).strip()
    if not text or text.upper() == "O":
        return "O"
    return text


def download_iob():
    """Write the global ``token_df`` plus an ``iob`` column to ``ner_iob.csv``.

    For each row the label is normalised (see ``_clean_label``) and turned
    into an IOB tag: ``O`` stays ``O``; any other label gets ``I-`` when the
    previous token of the same ``sentence_id`` carried the same label, and
    ``B-`` otherwise. The original ``label`` column is written unchanged.

    Returns
    -------
    pathlib.Path
        Path of the written CSV, relative to the working directory.
    """
    iob_tags = []
    # Last non-"O" label seen per sentence; None after an "O" token.
    previous = {}
    # A never-loaded table is an empty frame without columns; skip the column
    # lookups in that case so an empty CSV is still produced.
    if not token_df.empty:
        for sid, raw in zip(token_df["sentence_id"], token_df["label"]):
            # Cleared cells arrive as NaN/None/"" and used to crash or emit a
            # bare "B-" tag; stray whitespace used to create bogus entity types.
            label = _clean_label(raw)
            if label == "O":
                iob_tags.append("O")
                previous[sid] = None
                continue
            prefix = "I-" if previous.get(sid) == label else "B-"
            iob_tags.append(prefix + label)
            previous[sid] = label
    out = token_df.copy()
    out["iob"] = iob_tags
    destination = Path("ner_iob.csv")
    out.to_csv(destination, index=False)
    return destination
# UI layout and event wiring for the annotation app.
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")
    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")
    status = gr.Textbox(label="Status", interactive=False)
    # ``gr.Dataframe`` has no ``editable`` keyword; editing is enabled with
    # ``interactive=True``.
    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,
        visible=False,
        label="📝 Annotate Tokens",
    )
    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("💾 Save Edits")
        # ``gr.DownloadButton`` takes no ``fn``/``file_name`` arguments: it
        # serves whatever file its ``value`` points at. The values are filled
        # in by the event chains below.
        dl_tokens = gr.DownloadButton(label="⬇️ Download Tokens CSV")
        dl_iob = gr.DownloadButton(label="⬇️ Download IOB CSV")
    # Bind events. After every load or save, regenerate both CSV files and
    # point the download buttons at them so a click always serves current data.
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions],
    ).then(
        download_tokens,
        outputs=dl_tokens,
    ).then(
        download_iob,
        outputs=dl_iob,
    )
    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status,
    ).then(
        download_tokens,
        outputs=dl_tokens,
    ).then(
        download_iob,
        outputs=dl_iob,
    )
    gr.Markdown("""
    **Step 2:**
    - Click into the **label** column and type one of:
      `PER`, `ORG`, `LOC`, or leave as `O`.
    - **Save Edits**, then download your token CSV or IOB‐tagged CSV.
    """)
app.launch()