import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download


# ---------------- Load Sentence-BERT Model ----------------
def load_sentence_model():
    """Load the MiniLM sentence encoder on CPU and return it dynamically quantized.

    The local directory ``./paraphrase-MiniLM-L6-v2`` is tried first; if that
    fails for any reason the model is loaded by its
    ``sentence-transformers/paraphrase-MiniLM-L6-v2`` identifier instead.

    Returns:
        The model after ``torch.quantization.quantize_dynamic`` has been applied
        to its ``torch.nn.Linear`` layers with ``dtype=torch.qint8``.
    """
    try:
        model = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")  # local
    except Exception:
        # Deliberate best-effort: any failure with the local copy triggers the fallback.
        model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2", device="cpu")  # fallback
    return torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)


MODEL = load_sentence_model()


def _top_k_indices(scores, k):
    """Return the indices of the ``k`` highest ``scores``, best first.

    ``scores.argsort()[-k:]`` misbehaves for ``k <= 0`` (``[-0:]`` selects every
    element), so non-positive ``k`` is handled explicitly and yields an empty
    index array. For ``k >= 1`` the result is exactly
    ``scores.argsort()[-k:][::-1]``; ``k`` larger than ``len(scores)`` returns
    all indices.
    """
    if k <= 0:
        return np.array([], dtype=np.intp)
    return scores.argsort()[-k:][::-1]


# ---------------- Job Recommendation System ----------------
class JobRecommendationSystem:
    """Two-stage job recommender: TF-IDF pre-filter followed by embedding re-ranking.

    Attributes set by ``__init__``:
        jobs_df: de-duplicated job postings with an added ``job_text`` column.
        jobs_texts: ``job_text`` values as a list, row-aligned with ``jobs_df``.
        vectorizer / job_tfidf_matrix: fitted TF-IDF model and the job matrix.
        job_embeddings: one sentence embedding per row of ``jobs_df``.
    """

    def __init__(self, jobs_csv: str = "JobsFE.csv"):
        """Load the jobs dataset and precompute TF-IDF vectors and embeddings.

        Args:
            jobs_csv: path to the jobs CSV. If it does not exist locally,
                ``JobsFE.csv`` is downloaded from the Hugging Face dataset repo.
        """
        # Load CSV from local or HF Hub
        if os.path.exists(jobs_csv):
            print(f"✅ Loading dataset locally from {jobs_csv}")
            self.jobs_df = pd.read_csv(jobs_csv)
        else:
            print("📥 Fetching dataset from Hugging Face Hub...")
            dataset_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",  # your HF dataset repo
                filename="JobsFE.csv",
                repo_type="dataset",
            )
            self.jobs_df = pd.read_csv(dataset_path)

        self.jobs_df = self.jobs_df.fillna("")

        # Columns concatenated into the single text field used for matching.
        text_cols = [
            "position",
            "job_role_and_duties",
            "requisite_skill",
            "benefits",
            "formatted_experience_level",
            "formatted_work_type",
            "work_type",
            "city",
            "state",
            "country",
        ]
        self.jobs_df["job_text"] = self.jobs_df[text_cols].astype(str).agg(" ".join, axis=1)
        self.jobs_df = self.jobs_df.drop_duplicates(subset=["job_text"]).reset_index(drop=True)
        self.jobs_texts = self.jobs_df["job_text"].tolist()

        print("⚡ Precomputing TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(max_features=10000)
        self.job_tfidf_matrix = self.vectorizer.fit_transform(self.jobs_texts)

        self.job_embeddings = self._load_job_embeddings()

    def _load_job_embeddings(self):
        """Return embeddings row-aligned with ``self.jobs_texts``.

        Tries the precomputed ``job_embeddings.npy`` from the Hugging Face
        dataset repo first. The download is only accepted when it is a 2-D
        array with one row per job; otherwise positional indexing in
        ``filter_top_jobs`` would pair jobs with the wrong vectors. On any
        download/load failure or shape mismatch the embeddings are generated
        with ``MODEL`` and saved to ``job_embeddings.npy`` in the working
        directory.
        """
        expected_rows = len(self.jobs_texts)

        # Load embeddings from HF if available
        try:
            emb_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="job_embeddings.npy",
                repo_type="dataset",
            )
            embeddings = np.load(emb_path)
        except Exception:
            # Best-effort: missing file, no network, unreadable array, etc.
            print("⚠️ No precomputed embeddings found. Generating now...")
        else:
            if embeddings.ndim == 2 and embeddings.shape[0] == expected_rows:
                print("✅ Loaded precomputed embeddings from Hugging Face Hub")
                return embeddings
            print(
                f"⚠️ Precomputed embeddings have shape {embeddings.shape} but the "
                f"dataset has {expected_rows} jobs. Generating now..."
            )

        embeddings = MODEL.encode(
            self.jobs_texts,
            batch_size=64,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        np.save("job_embeddings.npy", embeddings)
        print("✅ Saved embeddings to job_embeddings.npy")
        return embeddings

    def filter_top_jobs(self, resume_text: str, top_k: int = 500):
        """Pre-filter jobs by TF-IDF similarity to the resume.

        Args:
            resume_text: raw resume text.
            top_k: maximum number of candidates to keep; ``top_k <= 0`` keeps none.

        Returns:
            A ``(jobs, embeddings)`` tuple: the selected rows of ``jobs_df``
            (index reset, best TF-IDF match first) and the matching rows of
            ``job_embeddings`` in the same order.
        """
        resume_vector = self.vectorizer.transform([resume_text])
        # TfidfVectorizer L2-normalises rows by default, so this dot product
        # is the cosine similarity.
        cosine_similarities = linear_kernel(resume_vector, self.job_tfidf_matrix).flatten()
        top_indices = _top_k_indices(cosine_similarities, top_k)
        return (
            self.jobs_df.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )

    def recommend_jobs(self, resume_text: str, top_n: int = 20):
        """Recommend the jobs whose embeddings are closest to the resume.

        Candidates come from ``filter_top_jobs`` and are re-ranked by cosine
        similarity between the resume embedding and each job embedding.

        Args:
            resume_text: raw resume text.
            top_n: maximum number of recommendations; ``top_n <= 0`` returns ``[]``.

        Returns:
            A list of dicts (best match first) describing each job, including a
            float ``similarity`` score.
        """
        if top_n <= 0:
            return []

        filtered_jobs_df, filtered_embeddings = self.filter_top_jobs(resume_text)
        if len(filtered_jobs_df) == 0:
            # cosine_similarity rejects an empty candidate matrix.
            return []

        resume_embedding = MODEL.encode(resume_text, convert_to_numpy=True).reshape(1, -1)
        similarities = cosine_similarity(resume_embedding, filtered_embeddings)[0]
        top_indices = _top_k_indices(similarities, top_n)

        recommendations = []
        for idx in top_indices:
            job = filtered_jobs_df.iloc[idx]
            recommendations.append({
                "job_id": job.get("job_id", ""),
                "position": job.get("position", "N/A"),
                "workplace": job.get("workplace", "N/A"),
                "formatted_work_type": job.get("formatted_work_type", "N/A"),
                "remote_allowed": job.get("remote_allowed", "N/A"),
                "salary_range": f"{job.get('min_salary','')} - {job.get('max_salary','')} {job.get('currency','')} ({job.get('pay_period','')})",
                "experience_level": job.get("formatted_experience_level", "N/A"),
                "job_role_and_duties": job.get("job_role_and_duties", "N/A"),
                "skills": job.get("requisite_skill", "N/A"),
                "benefits": job.get("benefits", "N/A"),
                "location": f"{job.get('city','')}, {job.get('state','')}, {job.get('country','')}",
                "company_size": job.get("company_size", "N/A"),
                "employee_count": job.get("employee_count", "N/A"),
                "company_website": job.get("company_website", "N/A"),
                # Fall back to the posting URL when there is no dedicated apply link.
                "apply_link": job.get("apply_link", job.get("job_posting_url", "")),
                "similarity": float(similarities[idx]),
            })
        return recommendations