Upload precompute_embeddings.py
precompute_embeddings.py
import string
import numpy as np
import faiss
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# ----------------- Load Model -----------------
MODEL = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")
# Dynamic int8 quantization of the Linear layers shrinks the model and speeds up CPU inference
MODEL = torch.quantization.quantize_dynamic(MODEL, {torch.nn.Linear}, dtype=torch.qint8)


class JobRecommendationSystem:
    def __init__(self, jobs_csv):
        """Initialize the system and load enriched job data from a CSV file."""
        self.jobs_df = pd.read_csv(jobs_csv)

        # Ensure apply_link exists
        if "apply_link" not in self.jobs_df.columns:
            self.jobs_df["apply_link"] = None

        # --- Safe column concat: a missing column contributes an empty string ---
        def safe_col(col):
            return self.jobs_df[col].astype(str) + " " if col in self.jobs_df.columns else ""

        # Build job_text by concatenating the descriptive columns
        self.jobs_df["job_text"] = (
            safe_col("workplace") +
            safe_col("position") +
            safe_col("job_role_and_duties") +
            safe_col("requisite_skill") +
            safe_col("benefits") +
            safe_col("industry_id") +
            safe_col("formatted_work_type") +
            safe_col("work_type") +
            safe_col("formatted_experience_level") +
            safe_col("country") +
            safe_col("state") +
            safe_col("city")
        )

        self.jobs_texts = self.jobs_df["job_text"].tolist()
        self.job_info = self.jobs_df.copy()

        # --- Load or compute embeddings (stored as float16 to halve memory/disk use) ---
        try:
            self.job_embeddings = np.load("job_embeddings.npy").astype(np.float16)
            print("✅ Loaded precomputed embeddings from job_embeddings.npy")
        except FileNotFoundError:
            print("⚠️ job_embeddings.npy not found. Generating embeddings now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                convert_to_numpy=True,
                batch_size=32,
                show_progress_bar=True
            ).astype(np.float16)
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

        # --- Build FAISS index (global, on all jobs) ---
        # FAISS expects float32 input, so cast up from the float16 storage dtype.
        # IndexFlatIP ranks by raw inner product (not cosine, since vectors are unnormalized).
        self.dim = self.job_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.job_embeddings.astype(np.float32))

        # --- Precompute TF-IDF once ---
        self.vectorizer = TfidfVectorizer()
        self.job_tfidf = self.vectorizer.fit_transform(self.jobs_texts)

    # ----------------- Helpers -----------------
    def clean_text(self, text):
        """Lowercase, strip punctuation, and trim whitespace."""
        return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

    def filter_top_jobs(self, resume_text, top_n=500):
        """Use TF-IDF to preselect the most relevant jobs (fast coarse filter)."""
        resume_vector = self.vectorizer.transform([resume_text])
        similarity_scores = (self.job_tfidf @ resume_vector.T).toarray().flatten()
        top_indices = np.argsort(similarity_scores)[-top_n:]
        return (
            [self.jobs_texts[i] for i in top_indices],
            self.job_info.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text, top_n=20):
        """Recommend jobs using FAISS similarity search + deduplication."""
        resume_text = self.clean_text(resume_text)
        filtered_jobs_texts, filtered_jobs_df, filtered_embeddings = (
            self.filter_top_jobs(resume_text)
        )

        # Encode resume; keep float32 because FAISS search requires it
        resume_embedding = MODEL.encode([resume_text], convert_to_numpy=True).astype(np.float32)

        # Build temporary FAISS index on the filtered subset only
        index = faiss.IndexFlatIP(self.dim)
        index.add(filtered_embeddings.astype(np.float32))

        # Search more than top_n to handle duplicates
        distances, indices = index.search(resume_embedding, top_n * 2)
        results = filtered_jobs_df.iloc[indices[0]]

        # Deduplicate by job_id and return top_n
        results = results.drop_duplicates(subset=["job_id"]).head(top_n)
        recommended_jobs = results.to_dict(orient="records")

        return {"recommended_jobs": recommended_jobs}
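For reference, a minimal usage sketch. The filenames ("jobs.csv", "resume.txt") and the printed columns are hypothetical; the class itself only assumes the CSV has a job_id column (used for deduplication) plus whichever descriptive columns are present.

# Hypothetical filenames, for illustration only.
recommender = JobRecommendationSystem("jobs.csv")   # assumed CSV of job postings

with open("resume.txt") as f:                       # assumed plain-text resume
    resume_text = f.read()

result = recommender.recommend_jobs(resume_text, top_n=10)
for job in result["recommended_jobs"]:
    print(job.get("position"), "->", job.get("apply_link"))

Note that IndexFlatIP ranks by raw inner product. If cosine similarity is intended, the embeddings could be L2-normalized before indexing and searching, e.g. with faiss.normalize_L2 on the float32 arrays or by passing normalize_embeddings=True to MODEL.encode.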