import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download

# ---------------- Load Sentence-BERT Model ----------------
def load_sentence_model():
    """Load paraphrase-MiniLM-L6-v2 from a local checkout, falling back to the Hub."""
    try:
        model = SentenceTransformer("./paraphrase-MiniLM-L6-v2", device="cpu")  # local checkout
    except Exception:
        model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2", device="cpu")  # Hub fallback
    # Dynamic int8 quantization of the Linear layers shrinks the model and speeds up CPU inference
    return torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

MODEL = load_sentence_model()
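
# Sanity check (illustrative, not part of the original file): paraphrase-MiniLM-L6-v2
# produces 384-dimensional sentence embeddings, and dynamic quantization leaves the
# output shape unchanged:
#   assert MODEL.encode("software engineer").shape == (384,)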

# ---------------- Job Recommendation System ----------------
class JobRecommendationSystem:
    def __init__(self, jobs_csv: str = "JobsFE.csv"):
        """Load the jobs dataset, build the TF-IDF index, and load or compute embeddings."""
        # Load CSV from local disk or from the HF Hub
        if os.path.exists(jobs_csv):
            print(f"✅ Loading dataset locally from {jobs_csv}")
            self.jobs_df = pd.read_csv(jobs_csv)
        else:
            print("📥 Fetching dataset from Hugging Face Hub...")
            dataset_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",   # your HF dataset repo
                filename="JobsFE.csv",
                repo_type="dataset"
            )
            self.jobs_df = pd.read_csv(dataset_path)

        self.jobs_df = self.jobs_df.fillna("")

        text_cols = [
            "position", "job_role_and_duties", "requisite_skill", "benefits",
            "formatted_experience_level", "formatted_work_type", "work_type",
            "city", "state", "country",
        ]
        self.jobs_df["job_text"] = self.jobs_df[text_cols].astype(str).agg(" ".join, axis=1)

        self.jobs_df = self.jobs_df.drop_duplicates(subset=["job_text"]).reset_index(drop=True)
        self.jobs_texts = self.jobs_df["job_text"].tolist()

        print("⚡ Precomputing TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(max_features=10000)
        self.job_tfidf_matrix = self.vectorizer.fit_transform(self.jobs_texts)

        # Load precomputed embeddings from HF if available
        try:
            emb_path = hf_hub_download(
                repo_id="shreyan67/Job-Catalyst_AI",
                filename="job_embeddings.npy",
                repo_type="dataset"
            )
            self.job_embeddings = np.load(emb_path)
            # Guard against embeddings computed from an older or undeduplicated dataset:
            # row counts must match, or the indexing in filter_top_jobs would misalign.
            if self.job_embeddings.shape[0] != len(self.jobs_texts):
                raise ValueError("Embedding count does not match job count; regenerating")
            print("✅ Loaded precomputed embeddings from Hugging Face Hub")
        except Exception:
            print("⚠️ No usable precomputed embeddings found. Generating now...")
            self.job_embeddings = MODEL.encode(
                self.jobs_texts,
                batch_size=64,
                show_progress_bar=True,
                convert_to_numpy=True,
            )
            np.save("job_embeddings.npy", self.job_embeddings)
            print("✅ Saved embeddings to job_embeddings.npy")

    def filter_top_jobs(self, resume_text: str, top_k: int = 500):
        """Cheap recall stage: keep the top_k jobs by TF-IDF similarity to the resume."""
        resume_vector = self.vectorizer.transform([resume_text])
        # TF-IDF rows are L2-normalized by default, so linear_kernel (a plain dot
        # product) already equals cosine similarity, but skips the extra normalization
        cosine_similarities = linear_kernel(resume_vector, self.job_tfidf_matrix).flatten()
        top_indices = cosine_similarities.argsort()[-top_k:][::-1]
        return (
            self.jobs_df.iloc[top_indices].reset_index(drop=True),
            self.job_embeddings[top_indices],
        )
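
    # Note (optimization sketch, not in the original): argsort sorts every row; for a
    # large catalog, np.argpartition(cosine_similarities, -top_k)[-top_k:] selects the
    # shortlist in O(n) before sorting only those top_k scores.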

    def recommend_jobs(self, resume_text: str, top_n: int = 20):
        """Rerank the TF-IDF shortlist with Sentence-BERT embeddings and return the top_n."""
        filtered_jobs_df, filtered_embeddings = self.filter_top_jobs(resume_text)
        resume_embedding = MODEL.encode(resume_text, convert_to_numpy=True).reshape(1, -1)
        similarities = cosine_similarity(resume_embedding, filtered_embeddings)[0]
        top_indices = similarities.argsort()[-top_n:][::-1]

        recommendations = []
        for idx in top_indices:
            job = filtered_jobs_df.iloc[idx]
            recommendations.append({
                "job_id": job.get("job_id", ""),
                "position": job.get("position", "N/A"),
                "workplace": job.get("workplace", "N/A"),
                "formatted_work_type": job.get("formatted_work_type", "N/A"),
                "remote_allowed": job.get("remote_allowed", "N/A"),
                "salary_range": f"{job.get('min_salary','')} - {job.get('max_salary','')} {job.get('currency','')} ({job.get('pay_period','')})",
                "experience_level": job.get("formatted_experience_level", "N/A"),
                "job_role_and_duties": job.get("job_role_and_duties", "N/A"),
                "skills": job.get("requisite_skill", "N/A"),
                "benefits": job.get("benefits", "N/A"),
                "location": f"{job.get('city','')}, {job.get('state','')}, {job.get('country','')}",
                "company_size": job.get("company_size", "N/A"),
                "employee_count": job.get("employee_count", "N/A"),
                "company_website": job.get("company_website", "N/A"),
                "apply_link": job.get("apply_link", job.get("job_posting_url", "")),
                "similarity": float(similarities[idx]),
            })
        return recommendations
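
# Minimal usage sketch (illustrative, not part of the original module). The sample
# resume text below is an assumption; in practice you would pass text extracted
# from a real resume.
if __name__ == "__main__":
    recommender = JobRecommendationSystem()
    sample_resume = (
        "Machine learning engineer with 3 years of experience in Python, "
        "PyTorch, NLP, and deploying recommendation systems on AWS."
    )
    for rec in recommender.recommend_jobs(sample_resume, top_n=5):
        print(f"{rec['similarity']:.3f}  {rec['position']}  ({rec['location']})")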