File size: 10,579 Bytes
a2ba3d2
 
 
a09117f
 
d1e5d7d
 
 
a09117f
 
a2ba3d2
 
 
 
a09117f
 
 
a2ba3d2
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1e5d7d
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
a2ba3d2
 
d1e5d7d
a2ba3d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
 
a2ba3d2
a09117f
 
a2ba3d2
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2ba3d2
a09117f
 
 
 
 
 
 
 
a2ba3d2
a09117f
 
47cf4fa
a09117f
 
 
 
 
 
47cf4fa
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1e5d7d
a09117f
 
 
a2ba3d2
d1e5d7d
a09117f
 
d1e5d7d
a2ba3d2
a09117f
d1e5d7d
a09117f
a2ba3d2
a09117f
d1e5d7d
a09117f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import mysql.connector
from mysql.connector import Error
import re

class AIChatbot:
    """FAQ chatbot that answers questions against database-stored FAQs.

    Matching uses sentence-transformer embeddings (cosine similarity on
    normalized vectors) blended with a keyword-overlap score and a WH-word
    intent heuristic to reduce false positives. Questions that do not clear
    the acceptance criteria fall back to canned small-talk responses.
    """

    # WH-words used for intent classification, checked in this priority order.
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')

    def __init__(self, db_config: Dict[str, str]):
        """Initialize the model, warm it up, and load FAQs from the database.

        Args:
            db_config: keyword arguments forwarded to mysql.connector.connect().
        """
        self.db_config = db_config
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        self.faq_embeddings = None  # (n_faqs, dim) L2-normalized array, or None
        self.faqs = None            # list of {'id', 'question', 'answer'} rows, or None
        self.load_faqs()

    def get_db_connection(self):
        """Open a new MySQL connection; return None (and log) on failure."""
        try:
            connection = mysql.connector.connect(**self.db_config)
            return connection
        except Error as e:
            print(f"Error connecting to database: {e}")
            return None

    def load_faqs(self):
        """Load active FAQs from database and compute their normalized embeddings."""
        connection = self.get_db_connection()
        if not connection:
            return
        try:
            cursor = connection.cursor(dictionary=True)
            cursor.execute("SELECT id, question, answer FROM faqs WHERE is_active = 1 ORDER BY sort_order, id")
            self.faqs = cursor.fetchall()
            cursor.close()

            if self.faqs:
                # Compute and normalize embeddings for all questions
                questions = [faq['question'] for faq in self.faqs]
                embeddings = self.model.encode(questions, normalize_embeddings=True)
                self.faq_embeddings = np.array(embeddings)
            else:
                # FIX: clear stale embeddings when all FAQs were removed/deactivated,
                # otherwise a previous load's matrix would outlive its index list.
                self.faq_embeddings = None
        except Error as e:
            print(f"Error loading FAQs: {e}")
        finally:
            connection.close()

    def save_unanswered_question(self, question):
        """Persist a question the bot could not answer for later review."""
        print(f"Saving unanswered question: {question}")  # Debug print
        connection = self.get_db_connection()
        if not connection:
            return
        try:
            cursor = connection.cursor()
            query = "INSERT INTO unanswered_questions (question) VALUES (%s)"
            cursor.execute(query, (question,))
            connection.commit()
            cursor.close()
        except Error as e:
            print(f"Error saving unanswered question: {e}")
        finally:
            # FIX: original closed the connection inside the try body, leaking
            # it whenever execute/commit raised; close unconditionally instead.
            connection.close()

    def _tokenize(self, text: str):
        """Lowercase alphanumeric tokens longer than two characters."""
        if not text:
            return []
        return [t for t in re.findall(r"[a-z0-9]+", text.lower()) if len(t) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Fraction of the query's unique tokens that also appear in the FAQ."""
        if not q_tokens or not faq_tokens:
            return 0.0
        q_set = set(q_tokens)
        f_set = set(faq_tokens)
        inter = len(q_set & f_set)
        # Denominator is the query vocabulary size (guarded against zero).
        denom = max(len(q_set), 1)
        return inter / denom

    def _wh_class(self, text: str) -> str:
        """Classify a question by its WH-word ('' when none is found).

        A leading WH-word wins; otherwise the first WH-word appearing as a
        whole word anywhere in the text is used.
        """
        if not text:
            return ''
        s = text.strip().lower()
        # simple heuristic classification by leading wh-word
        for key in self._WH_WORDS:
            if s.startswith(key + ' ') or s.startswith(key + "?"):
                return key
        # also check presence if not leading
        for key in self._WH_WORDS:
            if f' {key} ' in f' {s} ':
                return key
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return (answer, confidence) for the best-matching FAQ.

        The score blends cosine similarity (70%) with keyword overlap (30%),
        penalizes WH-intent mismatches, and accepts a match only when the
        semantic similarity alone is strong or the combined score clears the
        threshold with meaningful keyword overlap. Falls back to
        _generate_general_response when nothing qualifies.
        """
        print(f"find_best_match called with: {question}")  # Debug print

        # First try to match with FAQs
        if self.faqs and self.faq_embeddings is not None:
            # Embeddings are L2-normalized, so the dot product is cosine similarity.
            question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
            similarities = np.dot(self.faq_embeddings, question_embedding)

            # Keyword overlap between the query and each stored FAQ question.
            q_tokens = self._tokenize(question)
            overlap_scores = np.array(
                [self._overlap_ratio(q_tokens, self._tokenize(faq['question'])) for faq in self.faqs]
            )

            # Combined score to reduce false positives
            # (np.dot already yields an ndarray; no re-wrap needed).
            combined = 0.7 * similarities + 0.3 * overlap_scores

            # Apply WH-word intent consistency penalty
            q_wh = self._wh_class(question)
            if q_wh:
                for i, faq in enumerate(self.faqs):
                    f_wh = self._wh_class(faq['question'])
                    if f_wh and f_wh != q_wh:
                        combined[i] *= 0.6  # penalize mismatched intent significantly

            best_idx = int(np.argmax(combined))
            best_semantic = float(similarities[best_idx])
            best_overlap = float(overlap_scores[best_idx])
            best_combined = float(combined[best_idx])
            best_wh = self._wh_class(self.faqs[best_idx]['question'])

            # Acceptance criteria: require good semantic OR strong combined with overlap
            accept = (
                best_semantic >= max(0.7, threshold)
                or (best_combined >= threshold and best_overlap >= 0.3)
            )
            # Enforce WH intent match when present
            if accept and q_wh and best_wh and q_wh != best_wh:
                accept = False

            if accept:
                return self.faqs[best_idx]['answer'], best_combined

        # If no FAQ match, provide general conversation response
        return self._generate_general_response(question)

    def _generate_general_response(self, question: str) -> Tuple[str, float]:
        """Generate general conversation responses for non-FAQ questions."""
        question_lower = question.lower().strip()

        # Greeting responses. FIX: whole-word matching so short greetings such
        # as 'hi' no longer fire inside ordinary words ("this", "which", ...).
        if re.search(r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b', question_lower):
            return "Hello! I'm the PSAU AI assistant. I'm here to help you with questions about university admissions, courses, and general information about Pangasinan State University. How can I assist you today?", 0.8

        # Thank you responses
        if any(phrase in question_lower for phrase in ['thank you', 'thanks', 'thank', 'appreciate']):
            return "You're very welcome! I'm happy to help. Is there anything else you'd like to know about PSAU or university admissions?", 0.9

        # Goodbye responses
        if any(phrase in question_lower for phrase in ['bye', 'goodbye', 'see you', 'farewell']):
            return "Goodbye! It was nice chatting with you. Feel free to come back anytime if you have more questions about PSAU. Good luck with your academic journey!", 0.9

        # How are you responses
        if any(phrase in question_lower for phrase in ['how are you', 'how do you do', 'how is it going']):
            return "I'm doing great, thank you for asking! I'm here and ready to help you with any questions about PSAU admissions, courses, or university life. What would you like to know?", 0.8

        # What can you do responses
        if any(phrase in question_lower for phrase in ['what can you do', 'what do you do', 'what are your capabilities']):
            return "I can help you with:\n• University admission requirements and procedures\n• Course information and recommendations\n• General questions about PSAU\n• Academic guidance and support\n• Information about campus life\n\nWhat specific information are you looking for?", 0.9

        # About PSAU responses
        if any(phrase in question_lower for phrase in ['about psa', 'about psu', 'about pangasinan state', 'tell me about']):
            return "Pangasinan State University (PSAU) is a premier state university in the Philippines offering quality education across various fields. We provide undergraduate and graduate programs in areas like Computer Science, Business, Education, Nursing, and more. We're committed to academic excellence and student success. What would you like to know more about?", 0.8

        # Help responses (loop var renamed: original shadowed the 'help' builtin)
        if any(phrase in question_lower for phrase in ['help', 'assist', 'support']):
            return "I'm here to help! I can assist you with:\n• Admission requirements and deadlines\n• Course information and recommendations\n• Academic programs and majors\n• Campus facilities and services\n• General university information\n\nJust ask me any question and I'll do my best to help you!", 0.9

        # Default general response
        return "I understand you're asking about something, but I'm specifically designed to help with PSAU-related questions like admissions, courses, and university information. Could you rephrase your question to be more specific about what you'd like to know about Pangasinan State University? I'm here to help with academic guidance and university-related inquiries!", 0.6

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Return up to num_suggestions FAQ questions similar to the input.

        Only questions with cosine similarity above 0.3 are suggested,
        ordered most-similar first.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []

        # Compute and normalize embedding for the input question
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]

        # Calculate cosine similarity
        similarities = np.dot(self.faq_embeddings, question_embedding)

        # Get top N similar questions
        top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
        return [self.faqs[idx]['question'] for idx in top_indices if similarities[idx] > 0.3]

    def add_faq(self, question: str, answer: str) -> bool:
        """Insert a new FAQ row and refresh the in-memory embedding index.

        Returns True on success, False when the connection or insert fails.
        """
        connection = self.get_db_connection()
        if connection:
            try:
                cursor = connection.cursor()
                query = "INSERT INTO faqs (question, answer) VALUES (%s, %s)"
                cursor.execute(query, (question, answer))
                connection.commit()
                cursor.close()

                # Reload FAQs to update embeddings
                self.load_faqs()
                return True
            except Error as e:
                print(f"Error adding FAQ: {e}")
                return False
            finally:
                connection.close()
        return False