Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenize
|
|
| 4 |
import PyPDF2
|
| 5 |
import docx
|
| 6 |
import io
|
|
|
|
| 7 |
|
| 8 |
def chunk_text(text, chunk_size=128):
|
| 9 |
words = text.split()
|
|
@@ -92,9 +93,10 @@ def entity_comb(output):
|
|
| 92 |
output_comb.append(entity)
|
| 93 |
return output_comb
|
| 94 |
|
| 95 |
-
def create_mask_dict(entities):
|
| 96 |
mask_dict = {}
|
| 97 |
entity_counters = {}
|
|
|
|
| 98 |
for entity in entities:
|
| 99 |
if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
|
| 100 |
if entity['word'] not in mask_dict:
|
|
@@ -103,6 +105,11 @@ def create_mask_dict(entities):
|
|
| 103 |
else:
|
| 104 |
entity_counters[entity['entity_group']] += 1
|
| 105 |
mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
return mask_dict
|
| 107 |
|
| 108 |
def replace_words_in_text(input_text, entities):
|
|
@@ -111,6 +118,34 @@ def replace_words_in_text(input_text, entities):
|
|
| 111 |
input_text = input_text.replace(word, replacement)
|
| 112 |
return input_text
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
Run_Button = st.button("Run")
|
| 115 |
|
| 116 |
if Run_Button and input_text:
|
|
@@ -136,13 +171,16 @@ if Run_Button and input_text:
|
|
| 136 |
# Combine entities
|
| 137 |
output_comb = entity_comb(all_outputs)
|
| 138 |
|
|
|
|
|
|
|
|
|
|
| 139 |
# Create masked text and masking dictionary
|
| 140 |
-
masked_text = replace_words_in_text(
|
| 141 |
-
mask_dict = create_mask_dict(output_comb)
|
| 142 |
|
| 143 |
# Display the masked text and masking dictionary
|
| 144 |
st.subheader("Masked Text Preview")
|
| 145 |
st.text(masked_text)
|
| 146 |
|
| 147 |
st.subheader("Masking Dictionary")
|
| 148 |
-
st.json(mask_dict)
|
|
|
|
| 4 |
import PyPDF2
|
| 5 |
import docx
|
| 6 |
import io
|
| 7 |
+
import re
|
| 8 |
|
| 9 |
def chunk_text(text, chunk_size=128):
|
| 10 |
words = text.split()
|
|
|
|
| 93 |
output_comb.append(entity)
|
| 94 |
return output_comb
|
| 95 |
|
| 96 |
+
def create_mask_dict(entities, additional_masks=None):
|
| 97 |
mask_dict = {}
|
| 98 |
entity_counters = {}
|
| 99 |
+
|
| 100 |
for entity in entities:
|
| 101 |
if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
|
| 102 |
if entity['word'] not in mask_dict:
|
|
|
|
| 105 |
else:
|
| 106 |
entity_counters[entity['entity_group']] += 1
|
| 107 |
mask_dict[entity['word']] = f"{entity['entity_group']}_{entity_counters[entity['entity_group']]}"
|
| 108 |
+
|
| 109 |
+
if additional_masks:
|
| 110 |
+
for word, replacement in additional_masks.items():
|
| 111 |
+
mask_dict[word] = replacement
|
| 112 |
+
|
| 113 |
return mask_dict
|
| 114 |
|
| 115 |
def replace_words_in_text(input_text, entities):
|
|
|
|
| 118 |
input_text = input_text.replace(word, replacement)
|
| 119 |
return input_text
|
| 120 |
|
| 121 |
+
# Function to mask email, phone, and address patterns
|
| 122 |
+
def mask_patterns(text):
    """Mask e-mail addresses, phone numbers and street-address-like spans.

    Each pattern is applied in turn to the already-masked text, so a span
    claimed by an earlier pattern (e.g. a phone number) cannot be matched
    again by a later, broader one (the address heuristic).

    Args:
        text: The input string to mask.

    Returns:
        A ``(masked_text, masks)`` tuple. ``masked_text`` is ``text`` with
        every match replaced by its placeholder; ``masks`` maps each exact
        matched substring to the placeholder that replaced it
        (``"<EMAIL>"``, ``"<PHONE>"`` or ``"<ADDRESS>"``).
    """
    # Order matters: most specific first. E-mails may contain digit runs
    # that look like phones, and phone numbers also satisfy the very broad
    # address heuristic.
    patterns = (
        # Email pattern
        (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "<EMAIL>"),
        # Phone pattern (Turkish)
        (r"\+90\d{10}|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}\b", "<PHONE>"),
        # Address pattern (basic heuristic: 1-5 digits followed by words).
        # The group is non-capturing on purpose: with a capturing group
        # re.findall returned only the group's last repetition (or "" when
        # it did not participate), never the whole address.
        (r"\d{1,5}\s\w+(?:\s\w+)*", "<ADDRESS>"),
    )

    masks = {}
    for pattern, placeholder in patterns:

        def _record(match, placeholder=placeholder):
            # Remember the full matched text, then substitute the placeholder.
            masks[match.group(0)] = placeholder
            return placeholder

        # re.sub replaces exactly the matched spans, so the masks dict only
        # ever contains strings that were really replaced in the text.
        text = re.sub(pattern, _record, text)

    return text, masks
|
| 148 |
+
|
| 149 |
Run_Button = st.button("Run")
|
| 150 |
|
| 151 |
if Run_Button and input_text:
|
|
|
|
| 171 |
# Combine entities
|
| 172 |
output_comb = entity_comb(all_outputs)
|
| 173 |
|
| 174 |
+
# Mask emails, phone numbers, and addresses
|
| 175 |
+
masked_text, additional_masks = mask_patterns(input_text)
|
| 176 |
+
|
| 177 |
# Create masked text and masking dictionary
|
| 178 |
+
masked_text = replace_words_in_text(masked_text, output_comb)
|
| 179 |
+
mask_dict = create_mask_dict(output_comb, additional_masks)
|
| 180 |
|
| 181 |
# Display the masked text and masking dictionary
|
| 182 |
st.subheader("Masked Text Preview")
|
| 183 |
st.text(masked_text)
|
| 184 |
|
| 185 |
st.subheader("Masking Dictionary")
|
| 186 |
+
st.json(mask_dict)
|