Update app.py

app.py CHANGED
@@ -22,21 +22,10 @@ def read_file(file):
         st.error("Unsupported file type")
         return None
 
-# Rest of your code remains the same
-example_list = [
-    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
-    """Mustafa Kemal Atatürk, Türk asker, devlet adamı ve Türkiye Cumhuriyeti'nin kurucusudur.
-# ... (rest of the example text)
-"""
-]
-
 st.title("Demo for Turkish NER Models")
 
 model_list = [
-    'akdeniz27/bert-base-turkish-cased-ner',
-    'akdeniz27/convbert-base-turkish-cased-ner',
     'girayyagmur/bert-base-turkish-ner-cased',
-    'FacebookAI/xlm-roberta-large',
     'savasy/bert-base-turkish-ner-cased',
     'xlm-roberta-large-finetuned-conll03-english',
     'asahi417/tner-xlm-roberta-base-ontonotes5'
@@ -46,46 +35,41 @@ st.sidebar.header("Select NER Model")
 model_checkpoint = st.sidebar.radio("", model_list)
 
 st.sidebar.write("For details of models: 'https://huggingface.co/akdeniz27/")
-st.sidebar.write("")
+st.sidebar.write("Only PDF, DOCX, and TXT files are supported.")
 
-
-
-
-
-else:
-    aggregation = "first"
+# Determine aggregation strategy
+aggregation = "simple" if model_checkpoint in ["akdeniz27/xlm-roberta-base-turkish-ner",
+                                               "xlm-roberta-large-finetuned-conll03-english",
+                                               "asahi417/tner-xlm-roberta-base-ontonotes5"] else "first"
 
 st.subheader("Select Text Input Method")
 input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text', 'Upload File'))
 
+example_list = [
+    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
+    """Mustafa Kemal Atatürk, Türk asker, devlet adamı ve Türkiye Cumhuriyeti'nin kurucusudur."""
+]
+
 if input_method == 'Select from Examples':
-    selected_text = st.selectbox('Select Text from List', example_list, index=0
-    input_text = st.text_area("Selected Text", selected_text, height=128
+    selected_text = st.selectbox('Select Text from List', example_list, index=0)
+    input_text = st.text_area("Selected Text", selected_text, height=128)
 elif input_method == "Write or Paste New Text":
-    input_text = st.text_area('Write or Paste Text Below', value="", height=128
+    input_text = st.text_area('Write or Paste Text Below', value="", height=128)
 else:
     uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
     if uploaded_file is not None:
         input_text = read_file(uploaded_file)
         if input_text:
-            st.text_area("Extracted Text", input_text, height=128
+            st.text_area("Extracted Text", input_text, height=128)
         else:
             input_text = ""
 
-# Rest of your functions (setModel, get_html, entity_comb) remain the same
-
 @st.cache_resource
 def setModel(model_checkpoint, aggregation):
     model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
     return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)
 
-@st.cache_resource
-def get_html(html: str):
-    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
-    html = html.replace("\n", " ")
-    return WRAPPER.format(html)
-
 @st.cache_resource
 def entity_comb(output):
     output_comb = []
@@ -93,46 +77,31 @@ def entity_comb(output):
         if ind == 0:
             output_comb.append(entity)
         elif output[ind]["start"] == output[ind-1]["end"] and output[ind]["entity_group"] == output[ind-1]["entity_group"]:
-            output_comb[-1]["word"]
+            output_comb[-1]["word"] += output[ind]["word"]
             output_comb[-1]["end"] = output[ind]["end"]
         else:
             output_comb.append(entity)
     return output_comb
 
-Run_Button = st.button("Run"
+Run_Button = st.button("Run")
 
-if Run_Button and input_text
-    # Your existing processing code remains the same
+if Run_Button and input_text:
     ner_pipeline = setModel(model_checkpoint, aggregation)
     output = ner_pipeline(input_text)
 
     output_comb = entity_comb(output)
 
     df = pd.DataFrame.from_dict(output_comb)
-    cols_to_keep = ['word','entity_group','score','start','end']
+    cols_to_keep = ['word', 'entity_group', 'score', 'start', 'end']
    df_final = df[cols_to_keep]
 
     st.subheader("Recognized Entities")
     st.dataframe(df_final)
 
-
-    spacy_display = {}
-    spacy_display["ents"] = []
-    spacy_display["text"] = input_text
-    spacy_display["title"] = None
-
+    # Spacy display logic
+    spacy_display = {"ents": [], "text": input_text, "title": None}
     for entity in output_comb:
         spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
-
-    tner_entity_list = ["person", "group", "facility", "organization", "geopolitical area", "location", "product", "event", "work of art", "law", "language", "date", "time", "percent", "money", "quantity", "ordinal number", "cardinal number"]
-    spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"]
-
-    for ent in spacy_display["ents"]:
-        if model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5":
-            ent["label"] = spacy_entity_list[tner_entity_list.index(ent["label"])]
-        else:
-            if ent["label"] == "PER": ent["label"] = "PERSON"
 
-    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True
-
-    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)
+    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
+    st.write(html, unsafe_allow_html=True)
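
Note: the hunks reference st, pd, spacy, and the transformers classes, but the import block at the top of app.py falls outside the diff. A plausible reconstruction, assuming the standard names rather than the file's actual header:

# Reconstructed import header; not shown in the diff, so treat as an assumption.
import streamlit as st
import pandas as pd
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline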
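
For reference, "simple" and "first" are both valid values of the transformers token-classification pipeline's aggregation_strategy parameter ("none", "simple", "first", "average", and "max" exist): "first" resolves conflicting sub-word predictions by taking each word's first-token label, while "simple" only groups adjacent tokens sharing the same predicted entity, which is presumably why the XLM-R based checkpoints are routed to it here. A more explicit, hypothetical rewrite of the one-liner added above:

# Hypothetical restructuring of the aggregation choice; the set name is illustrative.
SIMPLE_MODELS = {
    "akdeniz27/xlm-roberta-base-turkish-ner",
    "xlm-roberta-large-finetuned-conll03-english",
    "asahi417/tner-xlm-roberta-base-ontonotes5",
}
aggregation = "simple" if model_checkpoint in SIMPLE_MODELS else "first"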
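
The first hunk opens inside read_file, whose body sits outside the diff; only the unsupported-type branch is visible. A minimal sketch of what such a helper could look like, assuming PyPDF2 and python-docx as the parsers (both are guesses, not confirmed by the commit) and the imports reconstructed above:

# Hypothetical read_file; the PDF/DOCX library choices are assumptions.
import PyPDF2
import docx

def read_file(file):
    if file.name.endswith(".txt"):
        # Streamlit uploads arrive as binary buffers, so decode explicitly.
        return file.read().decode("utf-8")
    elif file.name.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    elif file.name.endswith(".docx"):
        return "\n".join(p.text for p in docx.Document(file).paragraphs)
    else:
        st.error("Unsupported file type")
        return None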
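
The += line is the substantive fix in entity_comb: the old output_comb[-1]["word"] was a bare expression, so a merged entity's span was extended but its text kept only the first fragment. A quick sanity check with made-up pipeline output:

# Mock aggregated-pipeline output: two adjacent PER pieces plus a separate LOC.
mock = [
    {"word": "Ata", "entity_group": "PER", "score": 0.99, "start": 0, "end": 3},
    {"word": "türk", "entity_group": "PER", "score": 0.98, "start": 3, "end": 7},
    {"word": "Samsun", "entity_group": "LOC", "score": 0.97, "start": 20, "end": 26},
]
merged = entity_comb(mock)
# Expect two entities: PER "Atatürk" (start 0, end 7) and LOC "Samsun" (20, 26).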