"""CareerPal: a Flask chatbot that recommends courses, compares them,
finds colleges by PIN code and analyses resumes with Gemini.

NOTE(review): the copy of this file that was reviewed had its statements
collapsed onto a few lines and the markup inside its response strings
stripped out.  Every line break that was visible inside a string literal is
preserved below as ``\\n`` so the user-facing text is unchanged; the original
tags (buttons, headings, containers) should be restored from version control.
"""

# --- Set Cache Folders for Hugging Face Environment ---
# THIS IS THE CRUCIAL FIX for the PermissionError. It must be at the top,
# i.e. before sentence_transformers is imported.
import os

os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
os.environ["HF_HOME"] = "/tmp/huggingface"

import json
import re
import logging
import math
from functools import lru_cache
from html import escape

import docx
import fitz  # PyMuPDF
from dotenv import load_dotenv
import google.generativeai as genai
from flask import Flask, render_template, request, jsonify
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz, process
from urllib.parse import quote_plus

# --- Load environment variables and configure Gemini API ---
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

app = Flask(__name__)

# --- Setup Loggers ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# --- Load Model & Data ---
model = SentenceTransformer("sentence-transformers/paraphrase-albert-small-v2")


def load_json(filename):
    """Load a JSON data file, returning an empty container on failure.

    ``pincodes.json`` falls back to ``{}`` (it is used as a mapping); every
    other file falls back to ``[]``.  Failures are logged, never raised, so
    the app can still start with missing data.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers JSON and decode errors
        logging.error(f"Error loading {filename}: {e}")
        return [] if filename != "pincodes.json" else {}


def _course_tags(course):
    """Return the union of all tag categories of a course as a set."""
    tags_obj = course.get("tags", {})
    if not isinstance(tags_obj, dict):
        return set()
    return set().union(*tags_obj.values())


def _course_skills(course, default):
    """Return the ``tags.skills`` list of a course, or ``default``."""
    tags_obj = course.get("tags", {})
    if not isinstance(tags_obj, dict):
        return default
    return tags_obj.get("skills", default)


COURSE_DATA = load_json("careers.json")
COLLEGE_DATA = load_json("colleges.json")
PINCODE_DATA = load_json("pincodes.json")

# Vocabulary used to snap user words onto known course tags.
ALL_TAGS = set()
for course in COURSE_DATA:
    ALL_TAGS.update(_course_tags(course))
ALL_TAGS.update(["theory", "research", "practical"])

# --- Constants & Dictionaries ---
# Insertion order is the order in which the questions are asked.
QUESTIONS = {
    "stream": "What was your academic stream after 10th?",
    "subject_strengths": "Which subjects do you feel strongest in?",
    "subject_weaknesses": "Which subjects do you find most difficult?",
    "learning_style": "Do you learn better through practical work or theory/research?",
    "work_environment": "What kind of work environment do you prefer? \n(e.g., an office, a lab, outdoors, a workshop)",
    "team_preference": "Do you prefer working alone or collaboratively?",
    "interest_activities": "Outside academics, what hobbies do you enjoy?",
    "general_interests": "What topics or fields are you generally curious about?",
    "primary_driver": "What motivates your future most? (e.g., money, creativity, helping people, innovation, stability)",
}

STOP_WORDS = {"a", "an", "and", "the", "in", "on", "for", "with", "i", "my", "is", "are", "like", "to", "of"}

TRAIT_KEYWORDS = {
    "analytical": ["math", "physics", "science", "data", "logic", "puzzles", "engineering", "theory", "research"],
    "creative": ["art", "design", "music", "writing", "media", "film", "painting"],
    "social": ["helping", "teaching", "volunteering", "communication", "people", "society", "healthcare", "environment"],
    "structured": ["commerce", "law", "management", "finance", "corporate", "office"],
    "hands_on": ["practical", "projects", "sports", "repair", "construction", "biology"],
    "collaborative": ["team", "teamwork", "collaboration", "people", "social"],
    "independent": ["alone", "independent", "self-directed", "focus", "quiet"],
    "field_work": ["outdoors", "on-the-move", "travel", "construction", "farming"],
    "lab_work": ["lab", "research", "science", "biotech", "forensic"],
}

# Response fragments that are reused by several conversation states.
END_CHAT_OPTION = "\n🚪 End Chat\n"
START_OVER_OPTION = "\n🔄 Start Over\n"
FEEDBACK_PROMPT = "Sure. Would you like to leave some feedback about your experience?\n👍 Yes\n👎 No\n"
NO_MATCH_MESSAGE = "🤔 I couldn’t find a strong match. Would you like to try again?"
NOT_UNDERSTOOD_MESSAGE = "Sorry, I didn't understand. You can type `start` or select a feature from the panel."

# Messages that jump straight to a tool regardless of the current state.
FEATURE_COMMANDS = ("personalized guidance", "compare courses", "college location finder", "resume analyser")

# Acknowledgement templates keyed by the question that was just answered.
ANSWER_ACKS = {
    "subject_weaknesses": "👌 Got it — I’ll stay away from careers heavy in {cleaned}. ",
    "subject_strengths": "🔥 Nice! Being strong in {cleaned} is a great asset. ",
    "interest_activities": "😎 Cool! Enjoying {cleaned} gives me clues about your personality. ",
    "general_interests": "👍 That's insightful! An interest in {cleaned} helps narrow down the options. ",
}


# --- Helper Functions ---
def preprocess_text(text):
    """Lower-case ``text``, drop ``.``/``&``/``,`` and the words in/and, squeeze spaces."""
    text = text.lower()
    text = re.sub(r'[.&,]', '', text)
    text = re.sub(r'\b(in|and)\b', '', text)
    return re.sub(r'\s+', ' ', text).strip()


def normalize_word(word):
    """Map ``word`` onto the closest known tag (ratio >= 85), else return it unchanged."""
    if not ALL_TAGS:
        return word
    # score_cutoff makes extractOne return None below the threshold.
    match = process.extractOne(word, ALL_TAGS, scorer=fuzz.ratio, score_cutoff=85)
    return match[0] if match else word


def parse_input(text):
    """Split free text on commas/whitespace into a set of normalized keywords."""
    words = re.split(r"[,\s]+", text.lower())
    return {normalize_word(word) for word in words if word and word not in STOP_WORDS}


def build_user_profile(answers):
    """Return ``(profile, persona)`` for the questionnaire answers.

    ``profile`` maps each answered question key to its keyword set;
    ``persona`` is the set of TRAIT_KEYWORDS traits touched by any keyword.
    """
    profile = {key: parse_input(value) for key, value in answers.items() if value}
    all_keywords = set().union(*profile.values()) if profile else set()
    persona = {trait for trait, kws in TRAIT_KEYWORDS.items() if not all_keywords.isdisjoint(kws)}
    return profile, persona


def calculate_distance(pin1, pin2, pincode_data):
    """Great-circle (haversine) distance in km between two PIN codes.

    Returns ``float('inf')`` when either PIN code is missing from
    ``pincode_data`` so unknown locations sort last.
    """
    if pin1 not in pincode_data or pin2 not in pincode_data:
        return float('inf')
    lat1, lon1 = pincode_data[pin1]['lat'], pincode_data[pin1]['lon']
    lat2, lon2 = pincode_data[pin2]['lat'], pincode_data[pin2]['lon']
    R = 6371  # mean Earth radius in km
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = (math.sin(dLat / 2) ** 2
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dLon / 2) ** 2)
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt(1 - a) raise a math domain error for near-antipodal points.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c


def find_nearby_colleges(course_name, user_pincode):
    """Return ``(response_html, colleges)`` for colleges offering ``course_name``.

    ``colleges`` is a list of ``(college, distance_km)`` tuples sorted by
    distance from ``user_pincode``; it is empty when the PIN code is unknown
    or no college offers the course.
    """
    if user_pincode not in PINCODE_DATA:
        return "\nSorry, I don't have location data for that PIN code.\n", []

    wanted = course_name.lower()
    colleges_with_course = [
        college for college in COLLEGE_DATA
        if any(wanted == offered.lower() for offered in college.get('courses_offered', []))
    ]
    if not colleges_with_course:
        return f"\nI couldn't find any colleges in my database offering {course_name}.\n", []

    # .get() so a college record without a pincode sorts last (distance inf)
    # instead of raising KeyError.
    nearby_colleges = sorted(
        ((c, calculate_distance(user_pincode, c.get('pincode'), PINCODE_DATA)) for c in colleges_with_course),
        key=lambda pair: pair[1],
    )
    # NOTE(review): only the heading survives in the reviewed copy; the
    # per-college markup (and the college field names it used) is not visible
    # here and must be restored from version control.
    response_html = "\n"
    response_html += f"\n\n🎯 Top Matches for {course_name} near {user_pincode}\n\n"
    return response_html, nearby_colleges


def format_course_details(course):
    """Render the detail view (title, description, careers, requirements, subjects) of a course."""
    details = "\n"
    details += f"\n\n🎓 {course.get('course', 'N/A')}\n\n{course.get('description', '')}\n\n"
    careers = course.get('possible_careers', [])
    if careers:
        # The heading was previously emitted without the items themselves.
        details += "💼 Potential Career Paths:" + " " + ", ".join(careers)
    education = course.get('required_education', '')
    if education:
        details += f"\n\nEntry Requirements: {education}\n\n"
    related = course.get('related_courses', [])
    if related:
        details += "📚 Key Subjects You'll Study:" + " " + ", ".join(related)
    details += "\n"
    return details


def format_comparison(courses):
    """Render a side-by-side HTML comparison table for ``courses``.

    One column per course, one row each for careers, requirements, key
    subjects and core skills.  Returns an apology message when ``courses``
    is empty.
    """
    if not courses:
        return "\n\nI couldn't find any valid courses to compare. Please check the names and try again.\n\n"

    table_style = "width:100%;border-collapse:collapse;text-align:left;"
    th_style = "border-bottom:2px solid #dee2e6;padding:12px;font-size:1rem;"
    td_style = "border-bottom:1px solid #dee2e6;padding:12px;vertical-align:top;"

    def row(label, cells):
        body = "".join(f"<td style='{td_style}'>{cell}</td>" for cell in cells)
        return f"<tr><td style='{td_style}'>{label}</td>{body}</tr>"

    # NOTE(review): the closing sentence says titles are clickable; the click
    # markup is not recoverable from the reviewed copy, so titles are plain
    # header cells here (typing the course name still opens the detail view).
    header = "".join(f"<th style='{th_style}'>{course.get('course')}</th>" for course in courses)
    parts = [
        f"<table style='{table_style}'>",
        f"<tr><th style='{th_style}'>Feature</th>{header}</tr>",
        row("💼 Careers", (', '.join(c.get('possible_careers', ['N/A'])) for c in courses)),
        row("✅ Requirements", (c.get('required_education', 'N/A') for c in courses)),
        row("📚 Key Subjects", (', '.join(c.get('related_courses', ['N/A'])) for c in courses)),
        row("🛠️ Core Skills",
            (', '.join(s.capitalize() for s in _course_skills(c, ['N/A'])) for c in courses)),
        "</table>\n",
        "\n\nYou can click on a course title in the table above for a detailed view.\n\n",
    ]
    return "".join(parts)


@lru_cache(maxsize=2048)
def _encode_course_text(text):
    """Embed a course description once; course data is static after start-up."""
    return model.encode(text, convert_to_tensor=True)


def get_recommendations(answers, courses):
    """Score ``courses`` against the questionnaire ``answers``.

    Returns ``(response_html, top_courses)`` with at most three courses.
    The score is cosine similarity between the user's keywords and the
    course text, plus tag-overlap bonuses, minus penalties for declared
    weaknesses and ``anti_tags``; only scores above 20 are kept.  Returns
    ``("", [])`` when the answers contain no usable keywords.
    """
    profile, persona = build_user_profile(answers)
    if not any(profile.values()):
        return "", []

    # Sorted so the embedded text does not depend on set iteration order.
    user_profile_text = " ".join(sorted(set().union(*profile.values())))
    if not user_profile_text.strip():
        return "", []
    user_emb = model.encode(user_profile_text, convert_to_tensor=True)

    interests = profile.get("general_interests", set())
    strengths = profile.get("subject_strengths", set())
    stream = profile.get("stream", set())
    weaknesses = profile.get("subject_weaknesses", set())

    scored_courses = []
    for course in courses:
        course_tags = _course_tags(course)
        rich_course_text = (
            f"{course.get('course', '')} {course.get('description', '')} "
            f"{' '.join(course.get('possible_careers', []))} {' '.join(sorted(course_tags))}"
        )
        course_emb = _encode_course_text(rich_course_text)
        semantic_score = float(util.cos_sim(user_emb, course_emb)[0][0])
        heuristic_score = sum([
            0.3 if interests.intersection(course_tags) else 0,
            0.15 if strengths.intersection(course_tags) else 0,
            0.1 if stream.intersection(course_tags) else 0,
            0.15 if persona.intersection(course_tags) else 0,
        ])
        penalty_score = sum([
            len(weaknesses.intersection(course_tags)) * 0.2,
            0.15 if persona.intersection(course.get("anti_tags", [])) else 0,
        ])
        final_score = (semantic_score + heuristic_score - penalty_score) * 100
        if final_score > 20:
            scored_courses.append((final_score, course))

    scored_courses.sort(key=lambda pair: pair[0], reverse=True)
    top_courses = scored_courses[:3]
    raw_recs = [course for _, course in top_courses]
    if not raw_recs:
        return NO_MATCH_MESSAGE, []

    response_html = "\n\n🚀 Here are my top recommendations for you:\n\n"
    for i, course_data in enumerate(raw_recs):
        course_name = course_data.get('course')
        description = course_data.get('description', '')
        skills = _course_skills(course_data, [])[:3]
        careers = course_data.get('possible_careers', [])[:3]
        response_html += "\n"
        response_html += f"\n\n{i+1}. {course_name}\n{description}\n\n"
        response_html += "\n"
        if skills:
            response_html += f"\n🛠️ Key Skills: {', '.join(s.capitalize() for s in skills)}\n"
        if careers:
            response_html += f"\n💼 Potential Careers: {', '.join(careers)}\n"
        response_html += "\n"
        response_html += "\n"
    response_html += "\n\n⚖️ Compare Courses\n\n"
    return response_html, raw_recs


def next_question(answers):
    """Return ``(key, question)`` for the first unanswered question, or ``(None, None)``."""
    for key, q in QUESTIONS.items():
        if key not in answers:
            return key, q
    return None, None


def extract_text_from_file(file):
    """Extract plain text from an uploaded ``.pdf`` or ``.docx`` file object.

    Returns an empty string for any other extension.
    """
    filename = (file.filename or "").lower()
    if filename.endswith('.pdf'):
        # Context manager closes the document even if a page fails to parse.
        with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    if filename.endswith('.docx'):
        doc = docx.Document(file)
        return "".join(para.text + "\n" for para in doc.paragraphs)
    return ""


def analyze_resume_and_suggest_jobs(resume_text):
    """Ask Gemini to score the resume and return the result as response HTML.

    Never raises: any API or parsing failure is logged and turned into a
    user-facing error message.
    """
    if not GEMINI_API_KEY:
        return "\nError: Gemini API key is not configured.\n"

    prompt = f"""
    You are an expert career coach. Analyze the following resume text.
    Your response must be a single JSON object with four keys:
    1. "person_name": A string containing the full name of the candidate found in the resume. If no name is clear, return an empty string.
    2. "overall_score": An integer score out of 100 for the resume's quality.
    3. "summary": A brief, encouraging 1-2 sentence summary of the resume.
    4. "job_titles": A list of 3-5 specific job titles the candidate is well-suited for based on their skills and experience.
    Do not add any text before or after the JSON object.
    Resume Text to analyze:
    ---
    {resume_text}
    ---
    """
    try:
        model_gen = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model_gen.generate_content(prompt)
        json_match = re.search(r'\{.*\}', response.text, re.DOTALL)
        if not json_match:
            raise ValueError("Invalid JSON response from API")
        feedback = json.loads(json_match.group(0))

        # Everything below comes from the model (and, indirectly, from the
        # uploaded file), so it is escaped before being placed in HTML.
        # `or ''` guards against an explicit JSON null.
        name = escape(str(feedback.get('person_name') or '').strip())
        score = escape(str(feedback.get('overall_score', 'N/A')))
        summary = escape(str(feedback.get('summary') or ''))
        job_titles = feedback.get('job_titles') or []
        if not isinstance(job_titles, list):
            job_titles = [job_titles]

        html_response = "\n"
        if name:
            html_response += f"\n\n📝 Resume Analysis for {name}\n\n"
        else:
            html_response += "\n\n📝 Resume Analysis\n\n"
        html_response += f"\n\nOverall Score: {score}/100\n\n"
        html_response += f"\n\nSummary: {summary}\n\n"
        html_response += "🚀 Potential Job Roles:"
        if job_titles:
            # The prompt asks for job_titles but they were never rendered.
            html_response += " " + ", ".join(escape(str(title)) for title in job_titles)
        html_response += "\n"
        return html_response
    except Exception:
        # Top-level boundary for the external API call: log with traceback
        # (to the terminal via basicConfig) and degrade to a friendly message.
        logging.exception("Gemini API Error")
        error_message = ("Sorry, the analysis failed. This is often an API key issue. "
                         "Please check the terminal where you ran `python app.py` for the specific error message.")
        return f"\n{error_message}\n"


def _match_course(query, courses, scorer, threshold, processor=None):
    """Return the course whose title best matches ``query`` with score > ``threshold``, else None.

    Safe for an empty ``courses`` list, where ``extractOne`` returns None.
    """
    titles = [c.get('course', '') for c in courses]
    match = process.extractOne(query, titles, scorer=scorer, processor=processor) if titles else None
    if match is None:
        return None
    _, score, index = match
    return courses[index] if score > threshold else None


def _begin_questionnaire(convo, intro):
    """Reset the answers, switch to the questionnaire and return the first prompt."""
    convo["state"] = "asking_questions"
    convo["answers"] = {}
    _, question = next_question(convo["answers"])
    return f"{intro}\n\n{question}"


# --- Conversation state handlers ---
# Each handler receives the stripped user message and the mutable
# conversation dict, updates convo["state"] and returns the bot response.
def _handle_initial_action(msg, convo):
    """Dispatch a top-level command (start, a feature name, or end chat)."""
    command = msg.lower()
    if command in ("start", "personalized guidance"):
        return _begin_questionnaire(
            convo, "Great, let's find your perfect career path! I'll ask a few questions to get started.")
    if command == "compare courses":
        convo["state"] = "awaiting_compare_confirmation"
        return "Do you have specific courses in mind to compare?\nYes\nNo\n"
    if command == "college location finder":
        convo["state"] = "awaiting_course_for_college_search"
        return "Happy to help you find colleges! What is the name of the course you're interested in?"
    if command == "resume analyser":
        convo["state"] = "awaiting_resume_upload"
        return "Great! Please upload your resume (PDF or DOCX format) using the upload button below."
    if command == 'end chat':
        convo['state'] = 'awaiting_end_confirmation'
        return FEEDBACK_PROMPT
    return NOT_UNDERSTOOD_MESSAGE


def _handle_asking_questions(msg, convo):
    """Record the answer to the pending question, then ask the next one or recommend."""
    answers = convo["answers"]
    bot_response = ""
    last_key, _ = next_question(answers)
    if last_key:
        answers[last_key] = msg
        cleaned = ", ".join(tag.capitalize() for tag in parse_input(msg)) or msg
        template = ANSWER_ACKS.get(last_key)
        bot_response = template.format(cleaned=cleaned) if template else "✅ Okay, noted. "

    _, next_q = next_question(answers)
    if next_q:
        return bot_response + next_q

    bot_response, recs = get_recommendations(answers, COURSE_DATA)
    if not recs:
        convo["state"] = "awaiting_initial_action"
        # get_recommendations returns "" when no keywords were usable;
        # never send the user an empty reply.
        return bot_response or NO_MATCH_MESSAGE
    convo["last_recommendations"] = recs
    convo["state"] = "awaiting_more_details"
    bot_response += "\nClick a course for more details, compare, or end the session."
    bot_response += END_CHAT_OPTION
    return bot_response


def _handle_compare_confirmation(msg, convo):
    """Either ask for course names to compare or fall back to the questionnaire."""
    if msg.lower() == 'yes':
        convo["state"] = "awaiting_course_names_for_compare"
        return "Please enter up to 3 course names, separated by commas."
    return _begin_questionnaire(convo, "No problem! Let's find some courses for you first.")


def _handle_course_names_for_compare(msg, convo):
    """Fuzzy-match up to three comma-separated course names and compare them."""
    user_courses = [name.strip() for name in msg.lower().split(',')[:3]]
    candidates = (
        _match_course(name, COURSE_DATA, fuzz.token_set_ratio, 85, processor=preprocess_text)
        for name in user_courses
    )
    matched_courses = [course for course in candidates if course is not None]
    convo["last_recommendations"] = matched_courses
    convo["state"] = "awaiting_more_details"
    return format_comparison(matched_courses) + END_CHAT_OPTION


def _handle_course_for_college_search(msg, convo):
    """Resolve the course the user wants colleges for, then ask for a PIN code."""
    course = _match_course(msg.lower(), COURSE_DATA, fuzz.token_set_ratio, 85, processor=preprocess_text)
    if course is None:
        convo["state"] = "awaiting_course_for_college_search"
        return "I couldn't find a clear match for that course. Could you please try rephrasing or be more specific?"
    best_match = course.get('course', '')
    convo["course_for_college_search"] = best_match
    convo["state"] = "awaiting_pincode"
    return f"Okay, searching for colleges offering '{best_match}'. Please provide your 6-digit area PIN code."


def _handle_pincode(msg, convo):
    """Validate the PIN code and run the college search."""
    if not re.fullmatch(r"\d{6}", msg):
        return "That doesn't seem like a valid 6-digit PIN code. Please try again."
    course_name = convo.get("course_for_college_search", "this course")
    bot_response, _ = find_nearby_colleges(course_name, msg)
    convo["state"] = "awaiting_initial_action"
    return bot_response + "\n🔎 Search Again\n🚪 End Chat\n"


def _handle_more_details(msg, convo):
    """Handle a selection from the last recommendations (number, name, compare, end chat)."""
    recs = convo.get("last_recommendations", [])
    command = msg.lower()
    if command == 'end chat':
        convo['state'] = 'awaiting_end_confirmation'
        return FEEDBACK_PROMPT
    if command == 'compare':
        return format_comparison(recs) + END_CHAT_OPTION

    chosen_course = None
    # isdecimal(), unlike isdigit(), only accepts characters int() can parse.
    if msg.isdecimal() and 1 <= int(msg) <= len(recs):
        chosen_course = recs[int(msg) - 1]
    else:
        chosen_course = _match_course(msg, recs, fuzz.ratio, 70)

    if chosen_course is None:
        return "Sorry, I didn't recognize that selection. Please choose an option from your recommendations."
    convo["course_for_college_search"] = chosen_course.get('course')
    convo['state'] = 'awaiting_college_search_confirmation'
    return (format_course_details(chosen_course)
            + "\n\nWould you like to find nearby colleges for this course?\n👍 Yes\n👎 No\n")


def _handle_college_search_confirmation(msg, convo):
    """Move on to the PIN code prompt or return to the recommendations."""
    if 'yes' in msg.lower():
        convo['state'] = 'awaiting_pincode'
        return "Great! Please provide your 6-digit PIN code."
    convo['state'] = 'awaiting_more_details'
    return ("No problem. You can explore other recommendations, compare courses, "
            "or select a new feature from the left panel." + END_CHAT_OPTION)


def _handle_end_confirmation(msg, convo):
    """Ask for feedback text or end the session."""
    if 'yes' in msg.lower():
        convo['state'] = 'awaiting_feedback'
        return "I'd love to hear your thoughts. How was your experience?"
    convo['state'] = 'session_ended'
    return "No problem! It was great helping you." + START_OVER_OPTION


def _handle_feedback(msg, convo):
    """Log the feedback text and end the session."""
    logging.info(f"FEEDBACK: {msg}")
    convo['state'] = 'session_ended'
    return "Thank you for your feedback!" + START_OVER_OPTION


STATE_HANDLERS = {
    "awaiting_initial_action": _handle_initial_action,
    "asking_questions": _handle_asking_questions,
    "awaiting_compare_confirmation": _handle_compare_confirmation,
    "awaiting_course_names_for_compare": _handle_course_names_for_compare,
    "awaiting_course_for_college_search": _handle_course_for_college_search,
    "awaiting_pincode": _handle_pincode,
    "awaiting_more_details": _handle_more_details,
    "awaiting_college_search_confirmation": _handle_college_search_confirmation,
    "awaiting_end_confirmation": _handle_end_confirmation,
    "awaiting_feedback": _handle_feedback,
}


# --- Flask Routes ---
@app.route("/")
def index():
    """Serve the chat page."""
    return render_template("index.html")


@app.route("/upload_resume", methods=["POST"])
def upload_resume():
    """Accept a PDF/DOCX upload in ``resume_file`` and return the analysis as JSON."""
    if 'resume_file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['resume_file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file and file.filename.lower().endswith(('.pdf', '.docx')):
        try:
            resume_text = extract_text_from_file(file)
            if not resume_text.strip():
                return jsonify({"response": "\nThe uploaded file seems to be empty.\n"})
            feedback_html = analyze_resume_and_suggest_jobs(resume_text)
            return jsonify({"response": feedback_html})
        except Exception as e:
            # Parser libraries raise many unrelated types on corrupt files.
            logging.error(f"Resume Upload Error: {e}")
            return jsonify({"response": "\nSorry, an error occurred while processing your file.\n"})
    return jsonify({"error": "Invalid file type. Please upload a PDF or DOCX file."}), 400


@app.route("/chat", methods=["POST"])
def chat():
    """Advance the conversation state machine by one user message.

    Expects JSON ``{"message": str, "conversation": dict}`` and returns
    ``{"response": str, "conversation": dict}``.  The conversation dict is
    round-tripped through the client, so it is treated as untrusted.
    """
    data = request.get_json(silent=True) or {}
    msg = str(data.get("message") or "").strip()
    convo = data.get("conversation") or {}
    if not isinstance(convo, dict):
        convo = {}

    if not convo:
        convo = {"state": "awaiting_initial_action", "answers": {}}
        bot_response = ("Welcome to CareerPal! You can type `start` to begin a personalized guidance session, "
                        "or select a specific tool from the panel on the left.")
        logging.info("--- NEW SESSION INITIALIZED ---")
        return jsonify({"response": bot_response, "conversation": convo})

    if not isinstance(convo.get("answers"), dict):
        convo["answers"] = {}

    current_state = convo.get("state", "awaiting_initial_action")
    logging.info(f"STATE: {current_state} | USER: {msg}")

    # Feature commands always restart from the top-level menu.  States with
    # no handler (session_ended, awaiting_resume_upload, anything unknown)
    # are treated the same way so the user never gets an empty reply.
    if msg.lower() in FEATURE_COMMANDS or current_state not in STATE_HANDLERS:
        current_state = "awaiting_initial_action"

    bot_response = STATE_HANDLERS[current_state](msg, convo)

    logging.info(f"BOT: {re.sub('<[^<]+?>', ' ', bot_response).strip()}")
    return jsonify({"response": bot_response, "conversation": convo})


if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution, so it is opt-in
    # via FLASK_DEBUG=1 instead of being always on.
    app.run(debug=os.getenv("FLASK_DEBUG", "").lower() in ("1", "true"))