Spaces:
Sleeping
Sleeping
File size: 5,136 Bytes
88dce1c 7f21468 88dce1c 7f21468 88dce1c 7f21468 88dce1c 37b08b8 7f21468 857ba47 88dce1c 37b08b8 88dce1c 857ba47 88dce1c 7f21468 88dce1c 7f21468 857ba47 7f21468 857ba47 37b08b8 d63569b 857ba47 d63569b 857ba47 cd3526a d63569b 7f21468 857ba47 88dce1c 37b08b8 88dce1c 857ba47 7f21468 857ba47 d63569b 88dce1c 857ba47 88dce1c 37b08b8 7f21468 857ba47 88dce1c 857ba47 88dce1c 7f21468 857ba47 7f21468 88dce1c 7f21468 857ba47 7f21468 857ba47 88dce1c 857ba47 88dce1c 857ba47 88dce1c 857ba47 88dce1c 857ba47 88dce1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
# ---------------- Load Sentence-BERT Model ----------------
def load_sentence_model():
    """Load the paraphrase-MiniLM-L6-v2 Sentence-BERT model for CPU use.

    Tries a local checkout first to avoid a network round-trip; falls back
    to downloading the published model from the Hugging Face Hub when the
    local directory is missing or unreadable.

    Returns:
        A dynamically int8-quantized copy of the model (all ``torch.nn.Linear``
        layers quantized), which is smaller and faster for CPU inference.
    """
    try:
        model = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")  # local
    except Exception:
        # Deliberate best-effort fallback: any failure to load the local
        # copy (missing dir, corrupt files) falls through to the Hub.
        model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2", device="cpu")  # fallback
    return torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)


MODEL = load_sentence_model()
# ---------------- Job Recommendation System ----------------
class JobRecommendationSystem:
    """Two-stage job recommender: TF-IDF prefilter + Sentence-BERT re-ranking.

    Loads a job-postings CSV (from local disk or the Hugging Face Hub),
    builds a TF-IDF index for cheap coarse candidate filtering, and uses
    dense sentence embeddings for the final cosine-similarity ranking.
    """

    def __init__(self, jobs_csv: str = "JobsFE.csv"):
        """Build the index from *jobs_csv* (downloaded from the Hub if absent).

        Side effects: prints progress messages; may download the dataset and
        embeddings from the Hub; may write ``job_embeddings.npy`` to cwd.
        """
        # ---- Load CSV from local disk or the HF Hub dataset repo ----
        if os.path.exists(jobs_csv):
            print(f"✅ Loading dataset locally from {jobs_csv}")
            self.jobs_df = pd.read_csv(jobs_csv)
        else:
            print("📥 Fetching dataset from Hugging Face Hub...")
            dataset_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",  # your HF dataset repo
                filename="JobsFE.csv",
                repo_type="dataset",
            )
            self.jobs_df = pd.read_csv(dataset_path)

        self.jobs_df = self.jobs_df.fillna("")

        # Concatenate the descriptive columns into one searchable text field
        # that feeds both the TF-IDF index and the sentence embeddings.
        text_cols = [
            "position", "job_role_and_duties", "requisite_skill", "benefits",
            "formatted_experience_level", "formatted_work_type", "work_type",
            "city", "state", "country",
        ]
        self.jobs_df["job_text"] = self.jobs_df[text_cols].astype(str).agg(" ".join, axis=1)
        self.jobs_df = self.jobs_df.drop_duplicates(subset=["job_text"]).reset_index(drop=True)
        self.jobs_texts = self.jobs_df["job_text"].tolist()

        print("⚡ Precomputing TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(max_features=10000)
        self.job_tfidf_matrix = self.vectorizer.fit_transform(self.jobs_texts)

        # ---- Dense embeddings: reuse precomputed ones when they match ----
        self.job_embeddings = None
        try:
            emb_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="job_embeddings.npy",
                repo_type="dataset",
            )
            cached = np.load(emb_path)
            # FIX: guard against a stale cache. Row i of the embeddings must
            # correspond to row i of the deduplicated job list — a size
            # mismatch would silently pair jobs with the wrong vectors.
            if len(cached) == len(self.jobs_texts):
                print("✅ Loaded precomputed embeddings from Hugging Face Hub")
                self.job_embeddings = cached
            else:
                print("⚠️ Cached embeddings do not match dataset size. Regenerating...")
        except Exception:
            print("⚠️ No precomputed embeddings found. Generating now...")

        if self.job_embeddings is None:
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                batch_size=64,
                show_progress_bar=True,
                convert_to_numpy=True,
            )
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

    def filter_top_jobs(self, resume_text: str, top_k: int = 500):
        """Coarse TF-IDF prefilter.

        Returns a ``(dataframe, embeddings)`` pair for the up-to-*top_k*
        jobs whose TF-IDF vectors are most similar to *resume_text*,
        ordered most-similar first.
        """
        resume_vector = self.vectorizer.transform([resume_text])
        # linear_kernel on L2-normalized TF-IDF rows equals cosine
        # similarity but skips the redundant re-normalization pass.
        cosine_similarities = linear_kernel(resume_vector, self.job_tfidf_matrix).flatten()
        top_indices = cosine_similarities.argsort()[-top_k:][::-1]
        return (
            self.jobs_df.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text: str, top_n: int = 20):
        """Return the *top_n* best-matching jobs as a list of dicts.

        Pipeline: TF-IDF prefilter (cheap) -> Sentence-BERT cosine
        re-ranking (accurate) -> formatted result records, each carrying
        its final ``similarity`` score.
        """
        filtered_jobs_df, filtered_embeddings = self.filter_top_jobs(resume_text)
        resume_embedding = MODEL.encode(resume_text, convert_to_numpy=True).reshape(1, -1)
        similarities = cosine_similarity(resume_embedding, filtered_embeddings)[0]
        top_indices = similarities.argsort()[-top_n:][::-1]

        recommendations = []
        for idx in top_indices:
            job = filtered_jobs_df.iloc[idx]
            recommendations.append({
                "job_id": job.get("job_id", ""),
                "position": job.get("position", "N/A"),
                "workplace": job.get("workplace", "N/A"),
                "formatted_work_type": job.get("formatted_work_type", "N/A"),
                "remote_allowed": job.get("remote_allowed", "N/A"),
                "salary_range": f"{job.get('min_salary','')} - {job.get('max_salary','')} {job.get('currency','')} ({job.get('pay_period','')})",
                "experience_level": job.get("formatted_experience_level", "N/A"),
                "job_role_and_duties": job.get("job_role_and_duties", "N/A"),
                "skills": job.get("requisite_skill", "N/A"),
                "benefits": job.get("benefits", "N/A"),
                "location": f"{job.get('city','')}, {job.get('state','')}, {job.get('country','')}",
                "company_size": job.get("company_size", "N/A"),
                "employee_count": job.get("employee_count", "N/A"),
                "company_website": job.get("company_website", "N/A"),
                "apply_link": job.get("apply_link", job.get("job_posting_url", "")),
                "similarity": float(similarities[idx]),
            })
        return recommendations
|