Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,7 @@ model = joblib.load("model.joblib")
|
|
| 19 |
class EmailInput(BaseModel):
|
| 20 |
input_email_body: str
|
| 21 |
|
| 22 |
-
#
|
| 23 |
def mask_and_store_all_pii(text):
|
| 24 |
text = str(text)
|
| 25 |
pii_map = {}
|
|
@@ -27,29 +27,35 @@ def mask_and_store_all_pii(text):
|
|
| 27 |
|
| 28 |
patterns = {
|
| 29 |
"email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
|
| 30 |
-
"phone_number": r"\+?\d[\d\s\-]{7,14}\d",
|
| 31 |
"dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
|
| 32 |
-
"aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(
|
| 33 |
"credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
|
| 34 |
-
"cvv_no": r"(?i)\b(?:
|
| 35 |
"expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
|
| 36 |
-
# Avoid naive full name pattern to prevent false positives like 'Dear Sir'
|
| 37 |
}
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
for label, pattern in patterns.items():
|
| 40 |
for match in re.finditer(pattern, text):
|
| 41 |
original = match.group()
|
| 42 |
start, end = match.start(), match.end()
|
| 43 |
-
|
| 44 |
-
if original not in text:
|
| 45 |
continue
|
|
|
|
| 46 |
pii_map[placeholder] = original
|
| 47 |
entity_list.append({
|
| 48 |
"position": [start, end],
|
| 49 |
"classification": label,
|
| 50 |
"entity": original
|
| 51 |
})
|
| 52 |
-
text = text
|
|
|
|
| 53 |
|
| 54 |
return text, pii_map, entity_list
|
| 55 |
|
|
|
|
| 19 |
class EmailInput(BaseModel):
|
| 20 |
input_email_body: str
|
| 21 |
|
| 22 |
+
# PII Masking Function
|
| 23 |
def mask_and_store_all_pii(text):
    """Mask PII found in *text* and record what was masked.

    Every pattern is matched against the original, unmodified text, so all
    reported offsets refer to the input string.  When two matches overlap,
    the one found first wins (patterns are tried in the order they appear in
    ``patterns``; matches of one pattern are visited left to right).

    Args:
        text: The message body.  It is coerced with ``str()``.

    Returns:
        A tuple ``(masked_text, pii_map, entity_list)`` where

        * ``masked_text`` is *text* with each accepted match replaced by a
          placeholder of the form ``[<label>_<NNN>]``;
        * ``pii_map`` maps each placeholder to the original substring;
        * ``entity_list`` is a list of dicts with keys ``"position"``
          (``[start, end]`` offsets into the original text),
          ``"classification"`` (the pattern label) and ``"entity"`` (the
          original substring), in the order the matches were accepted.
    """
    text = str(text)
    pii_map = {}
    entity_list = []

    # NOTE: dict order is the priority order when matches overlap.
    # NOTE(review): the "CVV" prefix in ``cvv_no`` is optional, so any
    # standalone 3-4 digit number matches that pattern.
    patterns = {
        "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
        "phone_number": r"(?<!\d)(\+?\d[\d\s\-]{7,14}\d)(?!\d)",
        "dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
        "aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?![\d])",
        "credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
        "cvv_no": r"(?i)\b(?:CVV[:\s]*)?(\d{3,4})\b",
        "expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
    }

    # Accepted replacements as (start, end, placeholder), always expressed in
    # original-text coordinates so overlap checks compare like with like.
    masked_spans = []

    def is_overlapping(start, end):
        # Half-open interval intersection; also catches a candidate that
        # fully contains (or is fully contained in) an accepted span.
        return any(start < e and s < end for s, e, _ in masked_spans)

    for label, pattern in patterns.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if is_overlapping(start, end):
                continue
            original = match.group()
            placeholder = f"[{label}_{len(pii_map):03d}]"
            pii_map[placeholder] = original
            entity_list.append({
                "position": [start, end],
                "classification": label,
                "entity": original,
            })
            masked_spans.append((start, end, placeholder))

    # Build the masked text in one left-to-right pass.  Substituting inside
    # the matching loop would shift the text under offsets that re.finditer
    # computed against the unmodified string.
    pieces = []
    cursor = 0
    for start, end, placeholder in sorted(masked_spans):
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), pii_map, entity_list
|
| 61 |
|