Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,7 @@ model = joblib.load("model.joblib")
|
|
| 19 |
class EmailInput(BaseModel):
|
| 20 |
input_email_body: str
|
| 21 |
|
| 22 |
-
# PII Masking Function
|
| 23 |
def mask_and_store_all_pii(text):
|
| 24 |
text = str(text)
|
| 25 |
pii_map = {}
|
|
@@ -27,20 +27,22 @@ def mask_and_store_all_pii(text):
|
|
| 27 |
|
| 28 |
patterns = {
|
| 29 |
"email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
|
| 30 |
-
"phone_number": r"\
|
| 31 |
-
"dob": r"\b\d{2}[
|
| 32 |
-
"aadhar_num": r"\b\d{4}[-
|
| 33 |
-
"credit_debit_no": r"\b(?:\d[ -]*?){13,
|
| 34 |
-
"cvv_no": r"\b\d{3}\b",
|
| 35 |
-
"expiry_no": r"\b(0[1-9]|1[0-2])
|
| 36 |
-
|
| 37 |
}
|
| 38 |
|
| 39 |
for label, pattern in patterns.items():
|
| 40 |
for match in re.finditer(pattern, text):
|
| 41 |
original = match.group()
|
| 42 |
start, end = match.start(), match.end()
|
| 43 |
-
placeholder = f"[{label}_{len(pii_map)}]"
|
|
|
|
|
|
|
| 44 |
pii_map[placeholder] = original
|
| 45 |
entity_list.append({
|
| 46 |
"position": [start, end],
|
|
@@ -81,4 +83,3 @@ def classify_email(data: EmailInput):
|
|
| 81 |
@app.get("/")
|
| 82 |
def root():
|
| 83 |
return {"message": "Email Classification API is running."}
|
| 84 |
-
|
|
|
|
| 19 |
class EmailInput(BaseModel):
|
| 20 |
input_email_body: str
|
| 21 |
|
| 22 |
+
# Updated PII Masking Function (fixes Aadhaar vs Card and name misclassifications)
|
| 23 |
def mask_and_store_all_pii(text):
|
| 24 |
text = str(text)
|
| 25 |
pii_map = {}
|
|
|
|
| 27 |
|
| 28 |
patterns = {
|
| 29 |
"email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
|
| 30 |
+
"phone_number": r"\+?\d[\d\s\-]{7,14}\d",
|
| 31 |
+
"dob": r"\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b",
|
| 32 |
+
"aadhar_num": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b(?!\d)",
|
| 33 |
+
"credit_debit_no": r"\b(?:\d[ -]*?){13,19}\b",
|
| 34 |
+
"cvv_no": r"(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b",
|
| 35 |
+
"expiry_no": r"\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b",
|
| 36 |
+
# Avoid naive full name pattern to prevent false positives like 'Dear Sir'
|
| 37 |
}
|
| 38 |
|
| 39 |
for label, pattern in patterns.items():
|
| 40 |
for match in re.finditer(pattern, text):
|
| 41 |
original = match.group()
|
| 42 |
start, end = match.start(), match.end()
|
| 43 |
+
placeholder = f"[{label}_{len(pii_map):03d}]"
|
| 44 |
+
if original not in text:
|
| 45 |
+
continue
|
| 46 |
pii_map[placeholder] = original
|
| 47 |
entity_list.append({
|
| 48 |
"position": [start, end],
|
|
|
|
| 83 |
@app.get("/")
|
| 84 |
def root():
|
| 85 |
return {"message": "Email Classification API is running."}
|
|
|