Update app.py
app.py
CHANGED
@@ -17,7 +17,7 @@ from transformers import (
     pipeline as hf_pipeline
 )
 
-# ── 1) Model setup
+# ── 1) Model setup ──────────────────────────────────────────────────────────
 
 MODEL = "facebook/hf-seamless-m4t-medium"
 device = "cuda" if torch.cuda.is_available() else "cpu"
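The diff never shows how `processor` and `m4t_model` are constructed; for reference, a minimal sketch of the setup this hunk implies, assuming the text-to-text SeamlessM4T classes from transformers (the class names are an assumption, not taken from the commit):

import torch
from transformers import AutoProcessor, SeamlessM4TForTextToText

MODEL = "facebook/hf-seamless-m4t-medium"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumed setup: the commit only shows MODEL and device; the exact
# processor/model classes are not visible in this diff.
processor = AutoProcessor.from_pretrained(MODEL)
m4t_model = SeamlessM4TForTextToText.from_pretrained(MODEL).to(device)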
@@ -31,7 +31,6 @@ if device == "cuda":
 m4t_model.eval()
 
 def translate_m4t(text: str, src_iso3: str, tgt_iso3: str, auto_detect=False) -> str:
-    """Single-string translation (used for initial auto-detect → English)."""
     src = None if auto_detect else src_iso3
     inputs = processor(text=text, src_lang=src, return_tensors="pt").to(device)
     tokens = m4t_model.generate(**inputs, tgt_lang=tgt_iso3)
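For orientation, SeamlessM4T expects ISO-639-3 language codes, so a call looks like this (inputs are illustrative, not from the commit):

# Hypothetical usage; "fra" and "eng" are ISO-639-3 codes.
english = translate_m4t("Bonjour tout le monde", "fra", "eng")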
@@ -40,7 +39,6 @@ def translate_m4t(text: str, src_iso3: str, tgt_iso3: str, auto_detect=False) ->
 def translate_m4t_batch(
     texts: List[str], src_iso3: str, tgt_iso3: str, auto_detect=False
 ) -> List[str]:
-    """Batch-mode translation: one generate() for many inputs."""
     src = None if auto_detect else src_iso3
     inputs = processor(
         text=texts, src_lang=src, return_tensors="pt", padding=True
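Padding the batch lets a single generate() call serve many strings at once; the intended call pattern is roughly (inputs illustrative):

# One batched call instead of a loop of single translations.
summaries_fr = translate_m4t_batch(
    ["A city in France.", "A famous museum."], "eng", "fra"
)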
@@ -53,9 +51,15 @@ def translate_m4t_batch(
     )
     return processor.batch_decode(tokens, skip_special_tokens=True)
 
-# ── 2) NER pipeline ─────────────────────────────────────────────────────────
 
-
+# ── 2) NER pipeline (updated for deprecation) ───────────────────────────────
+
+ner = hf_pipeline(
+    "ner",
+    model="dslim/bert-base-NER-uncased",
+    aggregation_strategy="simple"
+)
+
 
 # ── 3) CACHING helpers ──────────────────────────────────────────────────────
 
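The new block swaps the deprecated grouped_entities=True flag for aggregation_strategy="simple", which merges word-piece tokens back into whole entities. With that strategy the pipeline yields entity_group/word/score dicts, e.g. (output illustrative; the model is uncased, so words come back lowercased):

ner("Angela visited the Louvre in Paris")
# → [{"entity_group": "PER", "word": "angela", "score": 0.99, ...},
#    {"entity_group": "LOC", "word": "louvre", "score": 0.98, ...},
#    {"entity_group": "LOC", "word": "paris",  "score": 0.99, ...}]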
@@ -98,6 +102,7 @@ def wiki_summary_cache(name: str) -> str:
     except:
         return "No summary available."
 
+
 # ── 4) Per-entity worker ────────────────────────────────────────────────────
 
 def process_entity(ent) -> dict:
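Only the except branch of wiki_summary_cache is visible in this hunk; the pattern it implies is a cached summary lookup, roughly like this sketch (the wikipedia package and the sentence count are assumptions, not shown in the commit):

from functools import lru_cache
import wikipedia  # assumed dependency; not visible in this diff

@lru_cache(maxsize=256)
def wiki_summary_cache(name: str) -> str:
    try:
        return wikipedia.summary(name, sentences=2)
    except:
        return "No summary available."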
@@ -135,11 +140,11 @@ def process_entity(ent) -> dict:
 
 def get_context(
     text: str,
     source_lang: str,
     output_lang: str,
     auto_detect: bool
 ):
-    # a) Ensure
+    # a) Ensure English for NER
     if auto_detect or source_lang != "eng":
         en = translate_m4t(text, source_lang, "eng", auto_detect=auto_detect)
     else:
@@ -156,23 +161,22 @@ def get_context(
             seen.add(w)
             unique_ents.append(ent)
 
-    # c)
+    # c) Parallel I/O
     entities = []
     with ThreadPoolExecutor(max_workers=8) as exe:
         futures = [exe.submit(process_entity, ent) for ent in unique_ents]
         for fut in futures:
             entities.append(fut.result())
 
-    # d) Batch-translate
+    # d) Batch-translate non-English fields
     if output_lang != "eng":
         to_translate = []
         translations_info = []
 
         for i, e in enumerate(entities):
             if e["type"] == "wiki":
                 translations_info.append(("summary", i))
                 to_translate.append(e["summary"])
-
             elif e["type"] == "location":
                 for j, r in enumerate(e["restaurants"]):
                     translations_info.append(("restaurant", i, j))
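Collecting fut.result() in submission order keeps entities aligned with unique_ents; concurrent.futures.as_completed would yield results sooner but shuffle that order. A compact equivalent with the same ordering guarantee:

# Order-preserving collection, equivalent to the loop above:
entities = [f.result() for f in futures]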
@@ -181,10 +185,8 @@ def get_context(
                     translations_info.append(("attraction", i, j))
                     to_translate.append(a["name"])
 
-        # single batched call
         translated = translate_m4t_batch(to_translate, "eng", output_lang)
 
-        # redistribute
         for txt, info in zip(translated, translations_info):
             kind = info[0]
             if kind == "summary":
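translations_info records where each flattened string came from, so the single batched call can be scattered back afterwards. The scatter step itself is off-screen; a sketch of what the surrounding context implies (field names beyond those visible in the diff are assumptions):

# Scatter translated strings back to their source fields.
for txt, info in zip(translated, translations_info):
    if info[0] == "summary":
        entities[info[1]]["summary"] = txt
    elif info[0] == "restaurant":
        i, j = info[1], info[2]
        entities[i]["restaurants"][j]["name"] = txt
    elif info[0] == "attraction":
        i, j = info[1], info[2]
        entities[i]["attractions"][j]["name"] = txt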
@@ -200,7 +202,7 @@ def get_context(
     return {"entities": entities}
 
 
-# ── 6) Gradio interface
+# ── 6) Gradio interface ─────────────────────────────────────────────────────
 
 iface = gr.Interface(
     fn=get_context,
@@ -213,7 +215,7 @@ iface = gr.Interface(
|
|
| 213 |
outputs="json",
|
| 214 |
title="iVoice Context-Aware",
|
| 215 |
description="Returns only the detected entities and their related info."
|
| 216 |
-
).queue(
|
| 217 |
|
| 218 |
if __name__ == "__main__":
|
| 219 |
iface.launch(
|
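Gradio 4.x dropped several queue() kwargs; if the removed arguments included concurrency_count (a guess, since the diff truncates the old call), the closest modern equivalent would be:

# Assumption: Gradio 4.x API; concurrency_count was removed and
# default_concurrency_limit is its nearest replacement.
iface.queue(max_size=16, default_concurrency_limit=2)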