shreyan67 committed
Commit 5bc8f7d · verified · 1 Parent(s): 3026002

Upload precompute_embeddings.py

Files changed (1)
  1. precompute_embeddings.py +108 -108
precompute_embeddings.py CHANGED
import string
import numpy as np
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# ----------------- Load Model -----------------
MODEL = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
# Dynamic int8 quantization of the Linear layers shrinks the model and speeds up CPU inference.
MODEL = torch.quantization.quantize_dynamic(MODEL, {torch.nn.Linear}, dtype=torch.qint8)


class JobRecommendationSystem:
    def __init__(self, jobs_csv):
        """Initialize the system and load enriched job data from a CSV file."""
        self.jobs_df = pd.read_csv(jobs_csv)

        # Ensure apply_link exists
        if "apply_link" not in self.jobs_df.columns:
            self.jobs_df["apply_link"] = None

        # --- Safe column concat: a missing column contributes an empty string ---
        def safe_col(col):
            return self.jobs_df[col].astype(str) + " " if col in self.jobs_df.columns else ""

        # Build job_text from every descriptive field that is present
        self.jobs_df["job_text"] = (
            safe_col("workplace") +
            safe_col("position") +
            safe_col("job_role_and_duties") +
            safe_col("requisite_skill") +
            safe_col("benefits") +
            safe_col("industry_id") +
            safe_col("formatted_work_type") +
            safe_col("work_type") +
            safe_col("formatted_experience_level") +
            safe_col("country") +
            safe_col("state") +
            safe_col("city")
        )

        self.jobs_texts = self.jobs_df["job_text"].tolist()
        self.job_info = self.jobs_df.copy()

        # --- Load or compute embeddings (stored as float16 to halve memory and disk use) ---
        try:
            self.job_embeddings = np.load("job_embeddings.npy").astype(np.float16)
            print("✅ Loaded precomputed embeddings from job_embeddings.npy")
        except FileNotFoundError:
            print("⚠️ job_embeddings.npy not found. Generating embeddings now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                convert_to_numpy=True,
                batch_size=32,
                show_progress_bar=True
            ).astype(np.float16)
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

        # --- Build FAISS index (global, on all jobs) ---
        # FAISS only accepts float32 vectors, so cast up from the float16 store here.
        self.dim = self.job_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.job_embeddings.astype(np.float32))

        # --- Precompute TF-IDF once ---
        self.vectorizer = TfidfVectorizer()
        self.job_tfidf = self.vectorizer.fit_transform(self.jobs_texts)

    # ----------------- Helpers -----------------
    def clean_text(self, text):
        """Lowercase, strip punctuation, and trim surrounding whitespace."""
        return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

    def filter_top_jobs(self, resume_text, top_n=500):
        """Use TF-IDF to preselect the most relevant jobs (fast)."""
        resume_vector = self.vectorizer.transform([resume_text])
        similarity_scores = (self.job_tfidf @ resume_vector.T).toarray().flatten()
        top_indices = np.argsort(similarity_scores)[-top_n:]
        return (
            [self.jobs_texts[i] for i in top_indices],
            self.job_info.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text, top_n=20):
        """Recommend jobs using FAISS similarity search + deduplication."""
        resume_text = self.clean_text(resume_text)
        filtered_jobs_texts, filtered_jobs_df, filtered_embeddings = (
            self.filter_top_jobs(resume_text)
        )

        # Encode resume as float32, which FAISS requires for search
        resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True).astype(np.float32)

        # Build a temporary FAISS index over the TF-IDF-filtered subset only
        index = faiss.IndexFlatIP(self.dim)
        index.add(filtered_embeddings.astype(np.float32))

        # Search more than top_n so deduplication still leaves enough results
        distances, indices = index.search(resume_embedding, top_n * 2)
        results = filtered_jobs_df.iloc[indices[0]]

        # Deduplicate by job_id and return the top_n
        results = results.drop_duplicates(subset=["job_id"]).head(top_n)
        recommended_jobs = results.to_dict(orient="records")

        return {"recommended_jobs": recommended_jobs}
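For reference, a minimal usage sketch (not part of this commit): the CSV path, the resume string, and the printed fields below are hypothetical placeholders, and the CSV is assumed to contain at least a job_id column plus the text columns concatenated above.

    # Hypothetical usage sketch -- "jobs.csv", the resume text, and the
    # printed fields are illustrative placeholders, not part of the file.
    if __name__ == "__main__":
        system = JobRecommendationSystem("jobs.csv")  # assumed: job_id + text columns
        resume = "Python developer with experience in NLP, FAISS, and recommender systems"
        result = system.recommend_jobs(resume, top_n=5)
        for job in result["recommended_jobs"]:
            # Each record is one row of the jobs CSV; .get() guards optional columns.
            print(job.get("position"), "|", job.get("apply_link"))

On the first run this also embeds every job and writes job_embeddings.npy next to the script; later runs load that file instead of re-encoding.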