import string
import numpy as np
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# ----------------- Load Model -----------------
# Load the sentence-embedding model on CPU, then apply dynamic int8
# quantization to its Linear layers to cut memory use and speed up CPU inference.
MODEL = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
MODEL = torch.quantization.quantize_dynamic(MODEL, {torch.nn.Linear}, dtype=torch.qint8)


class JobRecommendationSystem:
    def __init__(self, jobs_csv):
        """Initialize the system and load enriched job data from CSV file."""
        self.jobs_df = pd.read_csv(jobs_csv)

        # Ensure apply_link exists
        if "apply_link" not in self.jobs_df.columns:
            self.jobs_df["apply_link"] = None

        # --- Safe column concat ---
        # Missing columns contribute nothing; NaN cells become empty strings
        # rather than the literal text "nan".
        def safe_col(col):
            if col not in self.jobs_df.columns:
                return ""
            return self.jobs_df[col].fillna("").astype(str) + " "

        # Build job_text
        self.jobs_df["job_text"] = (
            safe_col("workplace") +
            safe_col("position") +
            safe_col("job_role_and_duties") +
            safe_col("requisite_skill") +
            safe_col("benefits") +
            safe_col("industry_id") +
            safe_col("formatted_work_type") +
            safe_col("work_type") +
            safe_col("formatted_experience_level") +
            safe_col("country") +
            safe_col("state") +
            safe_col("city")
        )

        self.jobs_texts = self.jobs_df["job_text"].tolist()
        self.job_info = self.jobs_df.copy()

        # --- Load or compute embeddings ---
        # Embeddings are cached on disk as float16 to halve storage; FAISS
        # operates on float32, so vectors are up-cast before indexing/search.
        try:
            self.job_embeddings = np.load("job_embeddings.npy").astype(np.float16)
            print("✅ Loaded precomputed embeddings from job_embeddings.npy")
        except FileNotFoundError:
            print("⚠️ job_embeddings.npy not found. Generating embeddings now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                convert_to_numpy=True,
                batch_size=32,
                show_progress_bar=True
            ).astype(np.float16)
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

        # --- Build FAISS index (global, on all jobs) ---
        # IndexFlatIP ranks by inner product; L2-normalizing the float32
        # copies first makes the scores equivalent to cosine similarity.
        self.dim = self.job_embeddings.shape[1]
        index_vectors = np.ascontiguousarray(self.job_embeddings, dtype=np.float32)
        faiss.normalize_L2(index_vectors)
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(index_vectors)

        # --- Precompute TF-IDF once ---
        self.vectorizer = TfidfVectorizer()
        self.job_tfidf = self.vectorizer.fit_transform(self.jobs_texts)

    # ----------------- Helpers -----------------
    def clean_text(self, text):
        """Lowercase, strip punctuation, clean text."""
        return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

    def filter_top_jobs(self, resume_text, top_n=500):
        """Use TF-IDF to preselect most relevant jobs (fast)."""
        resume_vector = self.vectorizer.transform([resume_text])
        similarity_scores = (self.job_tfidf @ resume_vector.T).toarray().flatten()
        top_indices = np.argsort(similarity_scores)[-top_n:]
        return (
            [self.jobs_texts[i] for i in top_indices],
            self.job_info.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text, top_n=20):
        """Recommend jobs using FAISS similarity search + deduplication."""
        resume_text = self.clean_text(resume_text)
        filtered_jobs_texts, filtered_jobs_df, filtered_embeddings = (
            self.filter_top_jobs(resume_text)
        )

        # Encode the resume in float32 and L2-normalize to match the index
        resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True).astype(np.float32)
        faiss.normalize_L2(resume_embedding)

        # Build a temporary FAISS index over the TF-IDF-filtered subset
        index = faiss.IndexFlatIP(self.dim)
        subset_vectors = np.ascontiguousarray(filtered_embeddings, dtype=np.float32)
        faiss.normalize_L2(subset_vectors)
        index.add(subset_vectors)

        # Search for more than top_n to leave room for duplicates, capped at
        # the index size (FAISS pads out-of-range results with -1 otherwise)
        k = min(top_n * 2, index.ntotal)
        distances, indices = index.search(resume_embedding, k)
        results = filtered_jobs_df.iloc[indices[0]]

        # Deduplicate (by job_id when the column exists, else by whole rows)
        # and keep the top_n
        dedupe_key = ["job_id"] if "job_id" in results.columns else None
        results = results.drop_duplicates(subset=dedupe_key).head(top_n)
        recommended_jobs = results.to_dict(orient="records")

        return {"recommended_jobs": recommended_jobs}