turkish-named-entity-recognition-tests

Runtime error

App Files Files Community

umarigan committed on Oct 17, 2024

Commit

9de7f58

verified ·

1 Parent(s): 418cafa

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -4

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenize
 import PyPDF2
 import docx
 import io
 def chunk_text(text, chunk_size=128):
     words = text.split()
@@ -92,9 +93,10 @@ def entity_comb(output):
             output_comb.append(entity)
     return output_comb
-def create_mask_dict(entities):
     mask_dict = {}
     entity_counters = {}
     for entity in entities:
         if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
             if entity['word'] not in mask_dict:
@@ -103,6 +105,11 @@ def create_mask_dict(entities):
                 else:
                     entity_counters[entity['entity_group']] += 1
                 mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
     return mask_dict
 def replace_words_in_text(input_text, entities):
@@ -111,6 +118,34 @@ def replace_words_in_text(input_text, entities):
         input_text = input_text.replace(word, replacement)
     return input_text
 Run_Button = st.button("Run")
 if Run_Button and input_text:
@@ -136,13 +171,16 @@ if Run_Button and input_text:
     # Combine entities
     output_comb = entity_comb(all_outputs)
     # Create masked text and masking dictionary
-    masked_text = replace_words_in_text(input_text, output_comb)#create_masked_text(input_text, output_comb)
-    mask_dict = create_mask_dict(output_comb)
     # Display the masked text and masking dictionary
     st.subheader("Masked Text Preview")
     st.text(masked_text)
     st.subheader("Masking Dictionary")
-    st.json(mask_dict)

 import PyPDF2
 import docx
 import io
+import re
 def chunk_text(text, chunk_size=128):
     words = text.split()
             output_comb.append(entity)
     return output_comb
+def create_mask_dict(entities, additional_masks=None):
     mask_dict = {}
     entity_counters = {}
     for entity in entities:
         if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
             if entity['word'] not in mask_dict:
                 else:
                     entity_counters[entity['entity_group']] += 1
                 mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
+    if additional_masks:
+        for word, replacement in additional_masks.items():
+            mask_dict[word] = replacement
     return mask_dict
 def replace_words_in_text(input_text, entities):
         input_text = input_text.replace(word, replacement)
     return input_text
# Function to mask email, phone, and address patterns
def mask_patterns(text):
    """Mask e-mail addresses, Turkish phone numbers and address-like spans.

    The patterns are applied one after another (e-mail, then phone, then
    address), each on the text produced by the previous step, so a span
    that was already masked cannot be matched again by a later, broader
    pattern.

    Args:
        text: The input string to scan.

    Returns:
        A ``(masked_text, masks)`` tuple. ``masked_text`` is ``text`` with
        every match replaced by ``<EMAIL>``, ``<PHONE>`` or ``<ADDRESS>``.
        ``masks`` maps each original matched string to the placeholder that
        replaced it.
    """
    masks = {}
    patterns = (
        # Email pattern
        (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "<EMAIL>"),
        # Phone pattern (Turkish)
        (
            r"\+90\d{10}|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b",
            "<PHONE>",
        ),
        # Address pattern (basic example, can be enhanced).
        # The trailing group must be non-capturing: with a capturing group
        # re.findall returned only the last captured word (or an empty
        # string), and replacing "" corrupted the whole text.
        # NOTE: this pattern is intentionally broad -- any 1-5 digit number
        # followed by words is treated as an address.
        (r"\d{1,5}\s\w+(?:\s\w+)*", "<ADDRESS>"),
    )

    for pattern, placeholder in patterns:

        def _record(match, placeholder=placeholder):
            # Remember the exact span that is being replaced.
            masks[match.group(0)] = placeholder
            return placeholder

        text = re.sub(pattern, _record, text)

    return text, masks
 Run_Button = st.button("Run")
 if Run_Button and input_text:
     # Combine entities
     output_comb = entity_comb(all_outputs)
+    # Mask emails, phone numbers, and addresses
+    masked_text, additional_masks = mask_patterns(input_text)
     # Create masked text and masking dictionary
+    masked_text = replace_words_in_text(masked_text, output_comb)
+    mask_dict = create_mask_dict(output_comb, additional_masks)
     # Display the masked text and masking dictionary
     st.subheader("Masked Text Preview")
     st.text(masked_text)
     st.subheader("Masking Dictionary")
+    st.json(mask_dict)