markobinario committed
Commit d1e5d7d · verified · 1 Parent(s): ef8a26d

Upload 3 files

Files changed (3)
  1. ai_chatbot.py +160 -0
  2. database_recommender.py +293 -0
  3. requirements.txt +7 -1
ai_chatbot.py ADDED
@@ -0,0 +1,160 @@
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+ from typing import List, Dict, Tuple
+ import re
+
+ class AIChatbot:
+     def __init__(self):
+         # Load the pre-trained model (can use a smaller model for more speed)
+         self.model = SentenceTransformer('all-MiniLM-L6-v2')
+         # Warm up the model to avoid first-request slowness
+         _ = self.model.encode(["Hello, world!"])
+         self.faq_embeddings = None
+         self.faqs = None
+         self.load_faqs()
+
+     def load_faqs(self):
+         """Load static FAQs and compute their normalized embeddings"""
+         # Static FAQ data
+         self.faqs = [
+             {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
+             {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
+             {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
+             {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
+             {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
+             {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
+             {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
+             {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
+             {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
+             {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."}
+         ]
+
+         if self.faqs:
+             # Compute and normalize embeddings for all questions
+             questions = [faq['question'] for faq in self.faqs]
+             embeddings = self.model.encode(questions, normalize_embeddings=True)
+             self.faq_embeddings = np.array(embeddings)
+
+     def save_unanswered_question(self, question):
+         """Log unanswered questions to console (can be extended to save to file)"""
+         print(f"Unanswered question logged: {question}")
+         # In a real implementation, you could save this to a file or send to an admin
+
+     def _tokenize(self, text: str):
+         if not text:
+             return []
+         return [t for t in re.findall(r"[a-z0-9]+", text.lower()) if len(t) > 2]
+
+     def _overlap_ratio(self, q_tokens, faq_tokens):
+         if not q_tokens or not faq_tokens:
+             return 0.0
+         q_set = set(q_tokens)
+         f_set = set(faq_tokens)
+         inter = len(q_set & f_set)
+         denom = max(len(q_set), 1)
+         return inter / denom
+
+     def _wh_class(self, text: str) -> str:
+         if not text:
+             return ''
+         s = text.strip().lower()
+         # Simple heuristic classification by leading wh-word
+         for key in ['who', 'where', 'when', 'what', 'how', 'why', 'which']:
+             if s.startswith(key + ' ') or s.startswith(key + "?"):
+                 return key
+         # Also check presence if not leading
+         for key in ['who', 'where', 'when', 'what', 'how', 'why', 'which']:
+             if f' {key} ' in f' {s} ':
+                 return key
+         return ''
+
+     def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
+         print(f"find_best_match called with: {question}")  # Debug print
+         if not self.faqs or self.faq_embeddings is None:
+             return "I'm sorry, I couldn't find any FAQs in the database.", 0.0
+
+         # Compute and normalize embedding for the input question
+         question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
+         similarities = np.dot(self.faq_embeddings, question_embedding)
+
+         # Compute keyword overlap with each FAQ question
+         q_tokens = self._tokenize(question)
+         overlap_scores = []
+         for faq in self.faqs:
+             overlap_scores.append(self._overlap_ratio(q_tokens, self._tokenize(faq['question'])))
+
+         similarities = np.array(similarities)
+         overlap_scores = np.array(overlap_scores)
+
+         # Combined score to reduce false positives
+         combined = 0.7 * similarities + 0.3 * overlap_scores
+
+         # Apply WH-word intent consistency penalty
+         q_wh = self._wh_class(question)
+         if q_wh:
+             for i, faq in enumerate(self.faqs):
+                 f_wh = self._wh_class(faq['question'])
+                 if f_wh and f_wh != q_wh:
+                     combined[i] *= 0.6  # Penalize mismatched intent significantly
+         best_idx = int(np.argmax(combined))
+         best_semantic = float(similarities[best_idx])
+         best_overlap = float(overlap_scores[best_idx])
+         best_combined = float(combined[best_idx])
+         best_wh = self._wh_class(self.faqs[best_idx]['question'])
+
+         # Acceptance criteria: require good semantic OR strong combined with overlap
+         accept = (
+             best_semantic >= max(0.7, threshold)
+             or (best_combined >= threshold and best_overlap >= 0.3)
+         )
+         # Enforce WH intent match when present
+         if accept and q_wh and best_wh and q_wh != best_wh:
+             accept = False
+
+         if accept:
+             return self.faqs[best_idx]['answer'], best_combined
+         else:
+             # Log as unanswered so admins can curate (ignore errors)
+             try:
+                 self.save_unanswered_question(question)
+             except Exception:
+                 pass
+             fallback = (
+                 "Sorry, I don’t have the knowledge to answer that yet.\n"
+                 "I’ll notify an admin about your question and we’ll add the answer soon.\n"
+                 "Please come back in a while."
+             )
+             return (fallback, best_combined)
+
+     def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
+         """Get suggested questions based on the input question"""
+         if not self.faqs or self.faq_embeddings is None:
+             return []
+
+         # Compute and normalize embedding for the input question
+         question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
+
+         # Calculate cosine similarity
+         similarities = np.dot(self.faq_embeddings, question_embedding)
+
+         # Get top N similar questions
+         top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
+         return [self.faqs[idx]['question'] for idx in top_indices if similarities[idx] > 0.3]
+
+     def add_faq(self, question: str, answer: str) -> bool:
+         """Add a new FAQ to the static list (for demonstration purposes)"""
+         try:
+             new_id = max([faq['id'] for faq in self.faqs]) + 1 if self.faqs else 1
+             new_faq = {"id": new_id, "question": question, "answer": answer}
+             self.faqs.append(new_faq)
+
+             # Recompute embeddings
+             questions = [faq['question'] for faq in self.faqs]
+             embeddings = self.model.encode(questions, normalize_embeddings=True)
+             self.faq_embeddings = np.array(embeddings)
+
+             print(f"FAQ added: {question}")
+             return True
+         except Exception as e:
+             print(f"Error adding FAQ: {e}")
+             return False
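
Note (not part of the commit): a query is matched against the FAQ list with a combined score of 0.7 × cosine similarity + 0.3 × keyword overlap, plus a WH-word intent check. A minimal sketch of exercising the new class from a Python shell might look like the following; the import path ai_chatbot and the sample question are assumptions based on the file name.

    # Illustrative only: exercising the class added in ai_chatbot.py
    from ai_chatbot import AIChatbot

    bot = AIChatbot()  # loads the MiniLM model and precomputes FAQ embeddings

    # find_best_match returns (answer_or_fallback, combined_score)
    answer, score = bot.find_best_match("How do I apply for admission?", threshold=0.7)
    print(score, answer)

    # get_suggested_questions returns up to N FAQ questions above a 0.3 similarity floor
    print(bot.get_suggested_questions("scholarship options", num_suggestions=3))
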
database_recommender.py ADDED
@@ -0,0 +1,293 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import joblib
+ import json
+
+ class CourseRecommender:
+     def __init__(self):
+         self.model = None
+         self.label_encoders = {}
+         self.scaler = StandardScaler()
+         self.courses = self.get_courses()
+         self.training_data = self.get_training_data()
+         self.train_model()
+
+     def get_courses(self):
+         """Get static course data"""
+         return {
+             'BSCS': 'Bachelor of Science in Computer Science',
+             'BSIT': 'Bachelor of Science in Information Technology',
+             'BSBA': 'Bachelor of Science in Business Administration',
+             'BSED': 'Bachelor of Science in Education',
+             'BSN': 'Bachelor of Science in Nursing',
+             'BSArch': 'Bachelor of Science in Architecture',
+             'BSIE': 'Bachelor of Science in Industrial Engineering',
+             'BSHM': 'Bachelor of Science in Hospitality Management',
+             'BSA': 'Bachelor of Science in Accountancy',
+             'BSPsych': 'Bachelor of Science in Psychology',
+             'BSAgri': 'Bachelor of Science in Agriculture'
+         }
+
+     def save_student_data(self, stanine, gwa, strand, course, rating, hobbies=None):
+         """Save student feedback to in-memory storage (for demonstration purposes)"""
+         try:
+             # In a real implementation, you could save this to a file or external storage
+             print(f"Student feedback saved: Stanine={stanine}, GWA={gwa}, Strand={strand}, Course={course}, Rating={rating}, Hobbies={hobbies}")
+             return True
+         except Exception as e:
+             print(f"Error saving student feedback: {e}")
+             return False
+
+     def get_training_data(self):
+         """Get static training data for demonstration purposes"""
+         # Sample training data to demonstrate the recommender system
+         training_data = [
+             # STEM students
+             (8, 95, 'STEM', 'BSCS', 5, 'programming, gaming, technology'),
+             (7, 90, 'STEM', 'BSIT', 4, 'computers, software, coding'),
+             (9, 98, 'STEM', 'BSCS', 5, 'programming, algorithms, math'),
+             (6, 85, 'STEM', 'BSIT', 3, 'technology, computers'),
+             (8, 92, 'STEM', 'BSArch', 4, 'design, drawing, creativity'),
+             (7, 88, 'STEM', 'BSIE', 4, 'engineering, problem solving'),
+
+             # ABM students
+             (8, 90, 'ABM', 'BSBA', 5, 'business, management, leadership'),
+             (7, 85, 'ABM', 'BSA', 4, 'accounting, numbers, finance'),
+             (6, 82, 'ABM', 'BSBA', 3, 'business, marketing'),
+             (9, 95, 'ABM', 'BSA', 5, 'accounting, finance, analysis'),
+
+             # HUMSS students
+             (8, 88, 'HUMSS', 'BSED', 5, 'teaching, helping, education'),
+             (7, 85, 'HUMSS', 'BSPsych', 4, 'psychology, helping, people'),
+             (6, 80, 'HUMSS', 'BSED', 3, 'teaching, children'),
+             (9, 92, 'HUMSS', 'BSPsych', 5, 'psychology, counseling, people'),
+
+             # General interests
+             (7, 87, 'STEM', 'BSN', 4, 'helping, healthcare, caring'),
+             (8, 89, 'ABM', 'BSHM', 4, 'hospitality, service, management'),
+             (6, 83, 'HUMSS', 'BSAgri', 3, 'agriculture, environment, nature'),
+         ]
+
+         return pd.DataFrame(training_data, columns=['stanine', 'gwa', 'strand', 'course', 'rating', 'hobbies'])
+
+     def train_model(self):
+         """Train the recommendation model using the training data"""
+         try:
+             training_data = self.get_training_data()
+
+             if training_data.empty:
+                 print("No training data available - using default recommendations")
+                 return
+
+             # Prepare features (hobbies required)
+             feature_columns = ['stanine', 'gwa', 'strand', 'hobbies']
+
+             # Create feature matrix
+             X = training_data[feature_columns].copy()
+             y = training_data['course']
+
+             # Handle categorical variables
+             categorical_columns = ['strand', 'hobbies']
+
+             # Refit encoders every training to incorporate new categories
+             for col in categorical_columns:
+                 if col in X.columns:
+                     X[col] = X[col].fillna('unknown')
+                     self.label_encoders[col] = LabelEncoder()
+                     X[col] = self.label_encoders[col].fit_transform(X[col])
+
+             # Scale numerical features
+             numerical_columns = ['stanine', 'gwa']
+             if not X[numerical_columns].empty:
+                 X[numerical_columns] = self.scaler.fit_transform(X[numerical_columns])
+
+             # Train KNN model
+             self.model = KNeighborsClassifier(n_neighbors=3, weights='distance')
+             self.model.fit(X, y)
+
+             print("✅ Model trained successfully (hobbies required and encoded)")
+
+         except Exception as e:
+             print(f"Error training model: {e}")
+             self.model = None
+
+     def get_default_recommendations(self, stanine, gwa, strand):
+         """Provide default recommendations based on basic rules when no training data is available"""
+         courses = self.courses
+         recommendations = []
+
+         # Basic rules for recommendations
+         if strand == 'STEM':
+             if stanine >= 8 and gwa >= 90:
+                 priority_courses = ['BSCS', 'BSIT']
+             else:
+                 priority_courses = ['BSIT', 'BSCS']
+         elif strand == 'ABM':
+             priority_courses = ['BSBA']
+         elif strand == 'HUMSS':
+             priority_courses = ['BSED']
+         else:
+             priority_courses = list(courses.keys())
+
+         # Add courses with default probabilities
+         for i, course in enumerate(priority_courses[:2]):  # Only take top 2
+             if course in courses:
+                 recommendations.append({
+                     'code': course,
+                     'name': courses[course],
+                     'probability': 1.0 - (i * 0.2)  # Decreasing probability for each course
+                 })
+
+         return recommendations
+
+     def recommend_courses(self, stanine, gwa, strand, hobbies=None, top_n=5):
+         """Recommend courses based on student profile (hobbies required)"""
+         try:
+             if self.model is None:
+                 return self.get_default_recommendations(stanine, gwa, strand)
+
+             # Prepare input features
+             input_data = pd.DataFrame([{
+                 'stanine': stanine,
+                 'gwa': gwa,
+                 'strand': strand,
+                 'hobbies': (hobbies or '').strip()
+             }])
+             # Validate hobbies
+             if not input_data['hobbies'].iloc[0]:
+                 raise ValueError('hobbies is required for recommendations')
+
+             # Encode categorical variables
+             for col in ['strand', 'hobbies']:
+                 if col in input_data.columns and col in self.label_encoders:
+                     value = input_data[col].iloc[0]
+                     if value not in self.label_encoders[col].classes_:
+                         # Extend encoder classes to include unseen value at inference
+                         self.label_encoders[col].classes_ = np.append(self.label_encoders[col].classes_, value)
+                     input_data[col] = self.label_encoders[col].transform(input_data[col])
+
+             # Scale numerical features
+             numerical_columns = ['stanine', 'gwa']
+             if not input_data[numerical_columns].empty:
+                 input_data[numerical_columns] = self.scaler.transform(input_data[numerical_columns])
+
+             # Get predictions
+             predictions = self.model.predict_proba(input_data)
+             courses = self.model.classes_
+
+             # Get top recommendations
+             top_indices = np.argsort(predictions[0])[-top_n:][::-1]
+             recommendations = []
+
+             course_map = self.courses
+             for idx in top_indices:
+                 code = courses[idx]
+                 confidence = predictions[0][idx]
+                 recommendations.append({
+                     'code': code,
+                     'name': course_map.get(code, code),
+                     'rating': round(confidence * 100, 1)
+                 })
+
+             return recommendations
+
+         except Exception as e:
+             print(f"Error recommending courses: {e}")
+             return self.get_default_recommendations(stanine, gwa, strand)
+
+     def _get_recommendation_reason(self, course, stanine, gwa, strand, hobbies, interests, personality_type, learning_style, career_goals):
+         """Generate personalized reason for recommendation"""
+         reasons = []
+
+         # Academic performance reasons
+         if stanine >= 8:
+             reasons.append("Excellent academic performance")
+         elif stanine >= 6:
+             reasons.append("Good academic foundation")
+
+         if gwa >= 85:
+             reasons.append("High academic achievement")
+         elif gwa >= 80:
+             reasons.append("Strong academic record")
+
+         # Strand alignment
+         if strand == "STEM" and course in ["BSCS", "BSIT", "BSArch", "BSIE", "BSN"]:
+             reasons.append("Perfect match with your STEM background")
+         elif strand == "ABM" and course in ["BSBA", "BSA"]:
+             reasons.append("Excellent alignment with your ABM strand")
+         elif strand == "HUMSS" and course in ["BSED", "BSPsych"]:
+             reasons.append("Great fit with your HUMSS background")
+
+         # Hobbies and interests alignment
+         if hobbies and any(hobby in hobbies.lower() for hobby in ["gaming", "programming", "technology", "computers"]):
+             if course in ["BSCS", "BSIT"]:
+                 reasons.append("Matches your technology interests")
+
+         if hobbies and any(hobby in hobbies.lower() for hobby in ["business", "leadership", "management"]):
+             if course in ["BSBA", "BSA"]:
+                 reasons.append("Aligns with your business interests")
+
+         if hobbies and any(hobby in hobbies.lower() for hobby in ["helping", "teaching", "caring"]):
+             if course in ["BSED", "BSN", "BSPsych"]:
+                 reasons.append("Perfect for your helping nature")
+
+         # Personality type alignment
+         if personality_type == "introvert" and course in ["BSCS", "BSA", "BSArch"]:
+             reasons.append("Suits your introverted personality")
+         elif personality_type == "extrovert" and course in ["BSBA", "BSED", "BSHM"]:
+             reasons.append("Great for your outgoing personality")
+
+         # Learning style alignment
+         if learning_style == "hands-on" and course in ["BSIT", "BSHM", "BSAgri"]:
+             reasons.append("Matches your hands-on learning preference")
+         elif learning_style == "visual" and course in ["BSArch", "BSCS"]:
+             reasons.append("Perfect for your visual learning style")
+
+         # Career goals alignment
+         if career_goals and any(goal in career_goals.lower() for goal in ["developer", "programmer", "software"]):
+             if course in ["BSCS", "BSIT"]:
+                 reasons.append("Direct path to your career goals")
+
+         if career_goals and any(goal in career_goals.lower() for goal in ["business", "entrepreneur", "manager"]):
+             if course in ["BSBA", "BSA"]:
+                 reasons.append("Direct path to your business goals")
+
+         # Default reason if no specific matches
+         if not reasons:
+             reasons.append("Good academic and personal fit")
+
+         return " • ".join(reasons[:3])  # Limit to top 3 reasons
+
+     def save_model(self, model_path='course_recommender_model.joblib'):
+         """Save the trained model"""
+         if self.model is None:
+             raise Exception("No model to save!")
+
+         model_data = {
+             'model': self.model,
+             'scaler': self.scaler,
+             'label_encoders': self.label_encoders
+         }
+         joblib.dump(model_data, model_path)
+
+     def load_model(self, model_path='course_recommender_model.joblib'):
+         """Load a trained model"""
+         model_data = joblib.load(model_path)
+         self.model = model_data['model']
+         self.scaler = model_data['scaler']
+         self.label_encoders = model_data['label_encoders']
+
+ # Example usage
+ if __name__ == "__main__":
+     recommender = CourseRecommender()
+
+     # Example recommendation
+     recommendations = recommender.recommend_courses(
+         stanine=8,
+         gwa=95,
+         strand='STEM',
+         hobbies='programming, gaming, technology'
+     )
+     print("Recommended courses:", json.dumps(recommendations, indent=2))
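
Note (not part of the commit): the file already ends with an example recommendation; as a complementary sketch, save_model and load_model can persist and restore the fitted KNN, scaler, and label encoders between runs. The file name below is just the method default, and the input values are illustrative.

    # Illustrative only: persisting and reloading the recommender
    from database_recommender import CourseRecommender

    rec = CourseRecommender()  # trains on the static sample data in __init__
    rec.save_model('course_recommender_model.joblib')

    fresh = CourseRecommender()  # retrains, then overwrite with the saved artifacts
    fresh.load_model('course_recommender_model.joblib')
    print(fresh.recommend_courses(stanine=7, gwa=88, strand='ABM', hobbies='business, finance'))
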
requirements.txt CHANGED
@@ -1 +1,7 @@
- gradio>=4.42.0
+ gradio
+ numpy
+ pandas
+ scikit-learn
+ joblib
+ sentence-transformers
+ torch
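
Note (not part of the commit): the new entries are unpinned and the previous gradio>=4.42.0 floor is dropped, so `pip install -r requirements.txt` will resolve to whatever releases of numpy, pandas, scikit-learn, joblib, sentence-transformers, and torch are current at install time. Pinning exact versions is the usual way to keep the environment reproducible.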