File size: 5,045 Bytes
6b21b32
bf70aa2
 
d0995a7
bf70aa2
d0995a7
 
9bd35d7
bf70aa2
11184ec
cea309f
 
 
6b21b32
11184ec
 
cea309f
 
d0995a7
bf70aa2
d0995a7
bf70aa2
9bd35d7
 
 
d0995a7
 
 
 
 
 
 
 
 
9bd35d7
 
 
 
 
 
d0995a7
9bd35d7
 
 
 
 
 
 
 
d0995a7
bf70aa2
11184ec
9bd35d7
 
d0995a7
 
9bd35d7
11184ec
 
d0995a7
9bd35d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0995a7
9bd35d7
 
 
 
 
 
 
 
 
 
 
 
 
 
d0995a7
9bd35d7
 
 
 
 
 
 
 
 
 
bf70aa2
d0995a7
11184ec
 
9bd35d7
 
bf70aa2
d0995a7
bf70aa2
 
11184ec
 
d0995a7
 
bf70aa2
11184ec
 
 
 
bf70aa2
6b21b32
d0995a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b21b32
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from transformers import pipeline

# Initialize the FastAPI application object. The route decorators further
# down (`@app.post`, `@app.get`) register their endpoints on this instance.
app = FastAPI(
    title="Email Classification API",
    version="1.0.0",
    description="Classifies support emails into categories and masks personal information.",
    docs_url="/docs",  # URL path of the interactive API docs page
    redoc_url="/redoc"  # URL path of the ReDoc docs page
)

# Load the persisted classifier and its vectorizer at import time. The paths
# are relative, so they resolve against the process working directory.
# joblib files are pickle-based: only load artifacts from a trusted source.
model = joblib.load("model.joblib")
vectorizer = joblib.load("vectorizer.joblib")

# Initialize the Hugging Face NER pipeline used by the masking function.
# NOTE(review): `grouped_entities` appears to be deprecated in newer
# transformers releases in favour of `aggregation_strategy` — confirm against
# the pinned transformers version before changing it.
ner = pipeline('ner', model='Davlan/xlm-roberta-base-ner-hrl', grouped_entities=True)

# Input schemas
class EmailInput(BaseModel):
    """Request body for ``POST /classify``."""

    input_email_body: str  # raw email text to be masked and classified

class TrainingExample(BaseModel):
    """Request body for ``POST /train``: one labelled training example."""

    email_body: str  # raw email text; it is PII-masked before training
    label: str  # category label stored alongside the email in the dataset

# Map NER `entity_group` labels to the placeholder category used in mask
# tokens (e.g. 'PER' -> "[full_name_000]"). Entities whose label is not a key
# here are left unmasked by the NER pass.
# NOTE(review): confirm which of these labels the configured model actually
# emits; a label it never produces is simply a dead entry.
NER_TO_TOKEN = {
    'PER': 'full_name',
    'EMAIL': 'email',
    'DATE': 'dob'
}

# Regex patterns for PII. They are applied one after another by the masking
# function, so a broad pattern can claim text a narrower one would have matched.

# Local part of word chars / dots / hyphens, '@', domain, and a final
# dot-separated part of at least two word characters.
EMAIL_REGEX = r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b'
# Twelve digits in three groups of four, each group optionally separated by a
# single whitespace character.
AADHAAR_REGEX = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
# 13 to 19 digits, each optionally followed by spaces or hyphens.
CARD_REGEX = r'\b(?:\d[ -]*?){13,19}\b'
# Optional case-insensitive "cvv" prefix followed by 3-4 digits.
# NOTE(review): because the prefix is optional, ANY standalone 3-4 digit number
# (a year, an order number) matches — confirm this over-masking is intended.
CVV_REGEX = r'(?i)\b(?:cvv[:\s\-]*)?(\d{3,4})\b'
# Month 01-12, then '/' or '-', then a 2-4 digit year.
EXPIRY_REGEX = r'\b(0[1-9]|1[0-2])[\/\-]\d{2,4}\b'
# Optional '+', a digit, 7-14 digits / whitespace / hyphens, and a final digit.
PHONE_REGEX = r'\+?\d[\d\s\-]{7,14}\d'
# Three numeric groups (1-2 digits, 1-2 digits, 2-4 digits) separated by
# '/', '-', '.' or whitespace.
DOB_REGEX = r'\b\d{1,2}[\/\-\.\s]\d{1,2}[\/\-\.\s]\d{2,4}\b'

# Masking function
def _free_segments(length, claimed):
    """Yield the ``(start, end)`` gaps of ``[0, length)`` not covered by *claimed*.

    *claimed* is an iterable of non-overlapping ``(start, end)`` spans.
    """
    cursor = 0
    for start, end in sorted(claimed):
        if cursor < start:
            yield cursor, start
        cursor = max(cursor, end)
    if cursor < length:
        yield cursor, length


def _locate_ner_entity(text, ent, claimed):
    """Return the unclaimed ``(start, end)`` span of NER entity *ent*, or ``None``."""
    start, end = ent.get('start'), ent.get('end')
    if start is not None and end is not None:
        start, end = int(start), int(end)
        if 0 <= start < end <= len(text):
            # Drop whitespace the tokenizer may have attached to the span.
            while start < end and text[start].isspace():
                start += 1
            while start < end and text[end - 1].isspace():
                end -= 1
            overlaps = any(
                start < c_end and c_start < end for c_start, c_end in claimed
            )
            return None if start == end or overlaps else (start, end)

    # No usable offsets: fall back to the first occurrence of the entity text
    # that lies entirely inside a not-yet-masked part of the input.
    word = str(ent.get('word', '')).replace('##', '')
    if not word:
        return None
    for seg_start, seg_end in _free_segments(len(text), claimed):
        found = text.find(word, seg_start, seg_end)
        if found != -1:
            return found, found + len(word)
    return None


def mask_and_store_all_pii(text):
    """Replace PII in *text* with numbered placeholder tokens.

    Entities are detected first by the NER pipeline (labels listed in
    ``NER_TO_TOKEN``) and then by the module's regex patterns. Every detection
    is recorded as a span of the ORIGINAL text; later detectors only see the
    parts no earlier detector has claimed, so spans never overlap. The masked
    text is assembled in a single pass at the end, which guarantees that the
    span that was matched is the span that gets replaced.

    Args:
        text: The email body. It is coerced with ``str()``.

    Returns:
        A tuple ``(masked_text, mapping, entity_list)`` where

        * ``masked_text`` is *text* with each entity replaced by a token such
          as ``[email_000]``;
        * ``mapping`` maps each token to the original substring;
        * ``entity_list`` holds one dict per entity, in detection order, with
          ``"position"`` (``[start, end]`` offsets into the original text, so
          ``text[start:end] == entity``), ``"classification"`` and
          ``"entity"``.
    """
    text = str(text)
    mapping = {}
    entity_list = []
    if not text.strip():
        # Nothing to mask; skip the model call entirely.
        return text, mapping, entity_list

    counter = {}       # next index per token category
    claimed = []       # (start, end) spans of the original text already masked
    replacements = []  # (start, end, token) used to assemble the output

    def claim(start, end, token_name):
        index = counter.get(token_name, 0)
        counter[token_name] = index + 1
        token = f"[{token_name}_{index:03d}]"
        original = text[start:end]
        claimed.append((start, end))
        replacements.append((start, end, token))
        mapping[token] = original
        entity_list.append({
            "position": [start, end],
            "classification": token_name,
            "entity": original
        })

    # NER-based masking
    for ent in ner(text):
        token_name = NER_TO_TOKEN.get(ent['entity_group'])
        if token_name is None:
            continue
        span = _locate_ner_entity(text, ent, claimed)
        if span is not None:
            claim(span[0], span[1], token_name)

    # Regex-based masking. The email pattern runs first: it needs an '@', so it
    # cannot steal other categories, whereas the phone/CVV patterns would
    # otherwise carve digit runs out of addresses like "user.1234@example.com".
    regex_map = [
        (EMAIL_REGEX, 'email'),
        (CARD_REGEX, 'credit_debit_no'),
        (AADHAAR_REGEX, 'aadhar_num'),
        (PHONE_REGEX, 'phone_number'),
        (CVV_REGEX, 'cvv_no'),
        (EXPIRY_REGEX, 'expiry_no'),
        (DOB_REGEX, 'dob')
    ]
    for regex, token_name in regex_map:
        # Snapshot the gaps: claim() grows `claimed` while we iterate.
        for seg_start, seg_end in list(_free_segments(len(text), claimed)):
            for match in re.finditer(regex, text[seg_start:seg_end]):
                if match.start() == match.end():
                    continue
                claim(seg_start + match.start(), seg_start + match.end(), token_name)

    # Assemble the masked text left to right from the recorded spans.
    pieces = []
    cursor = 0
    for start, end, token in sorted(replacements):
        pieces.append(text[cursor:start])
        pieces.append(token)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces), mapping, entity_list

# Restore PII (optional use)
def restore_pii(masked_text, pii_map):
    """Substitute every placeholder in *masked_text* with its original value.

    Placeholders are processed in the iteration order of *pii_map*; each one
    is replaced everywhere it occurs in the text produced so far.
    """
    restored = masked_text
    for token in pii_map:
        restored = restored.replace(token, pii_map[token])
    return restored

# Prediction endpoint
@app.post("/classify")
def classify_email(data: EmailInput):
    """Mask PII in the submitted email and predict its category.

    The email body is masked, the masked text is vectorized and passed to the
    loaded classifier. The response echoes the raw input and includes the
    masked entities, the masked text and the predicted category.
    """
    email_body = data.input_email_body
    masked_email, _pii_map, masked_entities = mask_and_store_all_pii(email_body)
    predictions = model.predict(vectorizer.transform([masked_email]))
    response = {
        "input_email_body": email_body,
        "list_of_masked_entities": masked_entities,
        "masked_email": masked_email,
        "category_of_the_email": predictions[0]
    }
    return response

# Retraining endpoint
@app.post("/train")
def train_model(new_example: TrainingExample):
    """Append one labelled example to the dataset and retrain the classifier.

    The example is appended to ``training_data.csv`` (the header is written
    only when the file does not exist yet). The whole dataset is then
    re-read, PII-masked, vectorized with a fresh ``TfidfVectorizer`` and used
    to fit a fresh ``LinearSVC``. On success the new artifacts are saved to
    disk AND swapped into the module-level ``model`` / ``vectorizer`` so that
    ``/classify`` uses them immediately instead of only after a restart.

    Returns:
        ``{"message": ...}`` on success, or ``{"error": ...}`` when the
        example could not be stored or the model could not be retrained
        (for instance while the dataset still contains a single label). The
        previously loaded model stays in use when retraining fails.
    """
    import os  # local import: `os` is not among this module's top-level imports

    global model, vectorizer

    dataset_path = "training_data.csv"
    df = pd.DataFrame([{"email_body": new_example.email_body, "label": new_example.label}])
    try:
        df.to_csv(dataset_path, mode='a', header=not os.path.exists(dataset_path), index=False)
    except Exception as e:
        return {"error": f"Failed to append to dataset: {str(e)}"}

    try:
        # Load dataset
        full_df = pd.read_csv(dataset_path)
        full_df['masked_text'] = full_df['email_body'].apply(lambda x: mask_and_store_all_pii(x)[0])

        # Vectorize and train
        new_vectorizer = TfidfVectorizer()
        X = new_vectorizer.fit_transform(full_df['masked_text'])
        y = full_df['label']
        new_model = LinearSVC()
        new_model.fit(X, y)
    except (ValueError, KeyError, OSError) as e:
        # e.g. a single-class dataset, an empty vocabulary, or a CSV that
        # lacks the expected columns. Report it like the append failure above
        # rather than surfacing an unhandled server error.
        return {"error": f"Failed to retrain model: {str(e)}"}

    # Save updated model and vectorizer
    joblib.dump(new_model, "model.joblib")
    joblib.dump(new_vectorizer, "vectorizer.joblib")

    # Swap the freshly trained pair in together so /classify never mixes a
    # new vectorizer with an old model.
    model, vectorizer = new_model, new_vectorizer

    return {"message": "Model retrained successfully with new example."}

# Health check
@app.get("/")
def root():
    """Health check: report that the service is up."""
    status = {"message": "Email Classification API is running."}
    return status