datbkpro committed
Commit deb8dee · verified · 1 Parent(s): 502e29f

Update core/rag_system.py

Files changed (1):
  1. core/rag_system.py (+82 -85)
core/rag_system.py CHANGED
@@ -2,11 +2,9 @@ import numpy as np
 import faiss
 from typing import List, Dict, Optional
 from sentence_transformers import SentenceTransformer
-import os
-import json
-import pandas as pd
-from typing import List
-import traceback
+from models.schemas import RAGSearchResult
+from config.settings import settings
+from core.multilingual_manager import MultilingualManager
 
 class EnhancedRAGSystem:
     def __init__(self):
@@ -15,12 +13,13 @@ class EnhancedRAGSystem:
         self.embeddings: Optional[np.ndarray] = None
         self.index: Optional[faiss.Index] = None
 
-        # Multilingual support - simplified for now
-        self.current_dimension = 384 # Default dimension
+        # Multilingual support
+        self.multilingual_manager = MultilingualManager()
+        self.current_dimension = settings.EMBEDDING_DIMENSION
 
-        self._initialize_sample_data()
+        self._initialize_sample_data() # SỬA TÊN HÀM
 
-    def _initialize_sample_data(self):
+    def _initialize_sample_data(self): # SỬA TÊN HÀM
         """Khởi tạo dữ liệu mẫu"""
         # Vietnamese sample data
         vietnamese_data = [
@@ -44,7 +43,7 @@
             "The United States has diverse climate zones from tropical to arctic"
         ]
 
-        # Vietnamese metadata
+        # Vietnamese metadata - SỬA LỖI SYNTAX
         vietnamese_metadatas = [
             {"type": "nutrition", "source": "sample", "language": "vi"},
             {"type": "nutrition", "source": "sample", "language": "vi"},
@@ -55,7 +54,7 @@
             {"type": "geography", "source": "sample", "language": "vi"}
         ]
 
-        # English metadata
+        # English metadata - SỬA LỖI SYNTAX
         english_metadatas = [
             {"type": "nutrition", "source": "sample", "language": "en"},
             {"type": "nutrition", "source": "sample", "language": "en"},
@@ -70,15 +69,6 @@
         self.add_documents(vietnamese_data, vietnamese_metadatas)
         self.add_documents(english_data, english_metadatas)
 
-    def _get_embedding_model(self):
-        """Lấy embedding model - simplified version"""
-        try:
-            # Sử dụng model nhỏ để tiết kiệm bộ nhớ
-            return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-        except Exception as e:
-            print(f"❌ Lỗi load embedding model: {e}")
-            return None
-
     def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
         """Thêm documents vào database - ĐÃ SỬA LỖI"""
         print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
@@ -107,11 +97,11 @@
         valid_metadatas = []
 
         for i, doc in enumerate(documents):
-            if doc and isinstance(doc, str) and len(doc.strip()) > 3: # Giảm độ dài tối thiểu
+            if doc and isinstance(doc, str) and len(doc.strip()) > 5: # At least 5 characters
                 valid_documents.append(doc.strip())
                 valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
             else:
-                print(f"⚠️ Bỏ qua document {i}: không hợp lệ - '{doc}'")
+                print(f"⚠️ Bỏ qua document {i}: không hợp lệ")
 
         print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
 
@@ -120,36 +110,30 @@
             return
 
         # Create embeddings
-        embedding_model = self._get_embedding_model()
-        if embedding_model is None:
-            print("❌ Không thể tạo embedding model")
-            # Vẫn thêm documents không có embedding
-            self._add_documents_without_embeddings(valid_documents, valid_metadatas)
-            return
-
         new_embeddings_list = []
         successful_embeddings = 0
 
         for i, doc in enumerate(valid_documents):
             try:
-                # Create embedding - sử dụng model duy nhất
+                language = valid_metadatas[i].get('language', 'vi')
+                embedding_model = self.multilingual_manager.get_embedding_model(language)
+
+                if embedding_model is None:
+                    print(f"⚠️ Không có embedding model cho document {i}")
+                    continue
+
+                # Create embedding
                 doc_embedding = embedding_model.encode([doc])
                 new_embeddings_list.append(doc_embedding[0])
                 successful_embeddings += 1
 
-                if i % 10 == 0: # Log tiến độ
-                    print(f"📊 Đã embedding {i+1}/{len(valid_documents)} documents")
-
             except Exception as e:
                 print(f"❌ Lỗi embedding document {i}: {e}")
-                # Thêm document không có embedding
-                new_embeddings_list.append(np.zeros(self.current_dimension))
 
         print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
 
         if not new_embeddings_list:
-            print("❌ Không tạo được embeddings nào, thêm documents không embedding")
-            self._add_documents_without_embeddings(valid_documents, valid_metadatas)
+            print("❌ Không tạo được embeddings nào")
             return
 
         # Convert to numpy array
@@ -158,7 +142,6 @@
             print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
         except Exception as e:
             print(f"❌ Lỗi tạo embedding matrix: {e}")
-            self._add_documents_without_embeddings(valid_documents, valid_metadatas)
             return
 
         # Handle existing embeddings
@@ -176,26 +159,19 @@
                 # Check dimension compatibility
                 if self.embeddings.shape[1] != new_embeddings.shape[1]:
                     print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
-                    # Resize embeddings để phù hợp
-                    if self.embeddings.shape[1] < new_embeddings.shape[1]:
-                        # Pad existing embeddings
-                        pad_width = new_embeddings.shape[1] - self.embeddings.shape[1]
-                        self.embeddings = np.pad(self.embeddings, ((0,0), (0,pad_width)))
-                    else:
-                        # Truncate new embeddings
-                        new_embeddings = new_embeddings[:, :self.embeddings.shape[1]]
-
-                    print("🔄 Đã điều chỉnh dimension")
-
-                # Compatible dimensions, append
-                self.embeddings = np.vstack([self.embeddings, new_embeddings])
-                self.documents.extend(valid_documents)
-                self.metadatas.extend(valid_metadatas)
-                print("✅ Đã thêm vào system hiện có")
+                    print("🔄 Tạo system mới do dimension không khớp")
+                    self.embeddings = new_embeddings
+                    self.documents = valid_documents
+                    self.metadatas = valid_metadatas
+                else:
+                    # Compatible dimensions, append
+                    self.embeddings = np.vstack([self.embeddings, new_embeddings])
+                    self.documents.extend(valid_documents)
+                    self.metadatas.extend(valid_metadatas)
+                    print("✅ Đã thêm vào system hiện có")
 
             except Exception as e:
                 print(f"❌ Lỗi khi thêm vào system: {e}")
-                self._add_documents_without_embeddings(valid_documents, valid_metadatas)
                 return
 
         # Update FAISS index
@@ -205,16 +181,9 @@
         print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
         print(f"📊 Tổng documents: {new_doc_count}")
 
-    def _add_documents_without_embeddings(self, documents: List[str], metadatas: List[Dict]):
-        """Thêm documents không có embeddings (fallback)"""
-        self.documents.extend(documents)
-        self.metadatas.extend(metadatas)
-        print(f"✅ Đã thêm {len(documents)} documents không có embeddings")
-
     def _update_faiss_index(self):
         """Cập nhật FAISS index với embeddings hiện tại"""
         if self.embeddings is None or len(self.embeddings) == 0:
-            print("⚠️ Không có embeddings để cập nhật index")
            return
 
         try:
 
@@ -229,20 +198,23 @@
         except Exception as e:
             print(f"❌ Lỗi cập nhật FAISS index: {e}")
 
-    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
-        """Tìm kiếm ngữ nghĩa - simplified version"""
+    def semantic_search(self, query: str, top_k: int = None) -> List[RAGSearchResult]:
+        """Tìm kiếm ngữ nghĩa với model phù hợp theo ngôn ngữ"""
         if top_k is None:
-            top_k = 5
+            top_k = settings.TOP_K_RESULTS
 
         if not self.documents or self.index is None:
             return self._fallback_keyword_search(query, top_k)
 
-        embedding_model = self._get_embedding_model()
+        # Detect query language and get appropriate model
+        query_language = self.multilingual_manager.detect_language(query)
+        embedding_model = self.multilingual_manager.get_embedding_model(query_language)
+
         if embedding_model is None:
             return self._fallback_keyword_search(query, top_k)
 
         try:
-            # Encode query
+            # Encode query with appropriate model
             query_embedding = embedding_model.encode([query])
 
             # Normalize query embedding for cosine similarity
@@ -257,27 +229,52 @@
             results = []
             for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
                 if idx < len(self.documents):
-                    results.append({
-                        "id": str(idx),
-                        "text": self.documents[idx],
-                        "similarity": float(similarity),
-                        "metadata": self.metadatas[idx] if idx < len(self.metadatas) else {}
-                    })
+                    results.append(RAGSearchResult(
+                        id=str(idx),
+                        text=self.documents[idx],
+                        similarity=float(similarity),
+                        metadata=self.metadatas[idx] if idx < len(self.metadatas) else {}
+                    ))
 
-            print(f"🔍 Tìm kiếm '{query[:50]}...' - Tìm thấy {len(results)} kết quả")
-            return results
+            # Filter results by language relevance
+            filtered_results = self._filter_by_language_relevance(results, query_language)
+
+            print(f"🔍 Tìm kiếm '{query[:50]}...' (ngôn ngữ: {query_language}) - Tìm thấy {len(filtered_results)} kết quả")
+            return filtered_results
 
         except Exception as e:
             print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
             return self._fallback_keyword_search(query, top_k)
 
-    def _fallback_keyword_search(self, query: str, top_k: int) -> List[Dict]:
+    def _filter_by_language_relevance(self, results: List[RAGSearchResult], query_language: str) -> List[RAGSearchResult]:
+        """Lọc kết quả theo độ liên quan ngôn ngữ"""
+        if not results:
+            return results
+
+        # Boost scores for documents in the same language
+        for result in results:
+            doc_language = result.metadata.get('language', 'vi')
+            if doc_language == query_language:
+                # Boost similarity score for same language documents
+                result.similarity = min(result.similarity * 1.2, 1.0)
+
+        # Re-sort by updated similarity scores
+        results.sort(key=lambda x: x.similarity, reverse=True)
+        return results
+
+    def _fallback_keyword_search(self, query: str, top_k: int) -> List[RAGSearchResult]:
        """Tìm kiếm dự phòng dựa trên từ khóa"""
         query_lower = query.lower()
         results = []
 
         for i, doc in enumerate(self.documents):
             score = 0
+            doc_language = self.metadatas[i].get('language', 'vi') if i < len(self.metadatas) else 'vi'
+            query_language = self.multilingual_manager.detect_language(query)
+
+            # Language matching bonus
+            if doc_language == query_language:
+                score += 0.5
 
             # Keyword matching
             for word in query_lower.split():
@@ -285,18 +282,18 @@
                     score += 1
 
             if score > 0:
-                results.append({
-                    "id": str(i),
-                    "text": doc,
-                    "similarity": min(score / 5, 1.0),
-                    "metadata": self.metadatas[i] if i < len(self.metadatas) else {}
-                })
+                results.append(RAGSearchResult(
+                    id=str(i),
+                    text=doc,
+                    similarity=min(score / 5, 1.0),
+                    metadata=self.metadatas[i] if i < len(self.metadatas) else {}
+                ))
 
-        results.sort(key=lambda x: x["similarity"], reverse=True)
+        results.sort(key=lambda x: x.similarity, reverse=True)
         return results[:top_k]
 
     def get_collection_stats(self) -> Dict:
-        """Lấy thống kê collection"""
+        """Lấy thống kê collection với thông tin đa ngôn ngữ"""
         language_stats = {}
         for metadata in self.metadatas:
             lang = metadata.get('language', 'unknown')
@@ -310,4 +307,4 @@
             'name': 'multilingual_rag_system',
             'status': 'active',
             'has_embeddings': self.embeddings is not None
-        }
+        }
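Note: the new imports (`RAGSearchResult` from `models.schemas`, `settings` from `config.settings`, `MultilingualManager` from `core.multilingual_manager`) come from modules that are not part of this diff. The sketch below is only a guess at their interfaces, inferred from how they are used above; the field names, settings values, and model choice are assumptions, not the repository's actual definitions.

# Hypothetical stand-ins for the modules imported at the top of core/rag_system.py.
# Inferred from usage in the diff; not the project's real code.
from dataclasses import dataclass, field
from typing import Dict

from sentence_transformers import SentenceTransformer


@dataclass
class RAGSearchResult:
    # Matches the keyword arguments used in semantic_search and _fallback_keyword_search
    id: str
    text: str
    similarity: float
    metadata: Dict = field(default_factory=dict)


class Settings:
    # Only the attributes the diff references; the values are placeholders
    EMBEDDING_DIMENSION: int = 384
    TOP_K_RESULTS: int = 5


settings = Settings()


class MultilingualManager:
    """Guessed interface: language detection plus per-language embedding models."""

    def __init__(self):
        self._models: Dict[str, SentenceTransformer] = {}

    def detect_language(self, text: str) -> str:
        # Crude heuristic: Vietnamese diacritics => 'vi', otherwise 'en'
        return "vi" if any(ch in "ăâđêôơưàáảãạ" for ch in text.lower()) else "en"

    def get_embedding_model(self, language: str) -> SentenceTransformer:
        # Lazily load a single multilingual model and reuse it for every language
        if language not in self._models:
            self._models[language] = SentenceTransformer(
                "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
            )
        return self._models[language]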
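The body of `_update_faiss_index` is unchanged and therefore not shown in the hunks above; only the comment about normalizing the query embedding hints at how search works. Below is a minimal sketch of the usual FAISS pattern that comment implies (inner-product index over L2-normalized float32 vectors, so inner product equals cosine similarity); the index type is an assumption, not necessarily what the method actually builds.

# Cosine-similarity search with FAISS via a normalized inner-product index (illustrative only).
import numpy as np
import faiss


def build_cosine_index(embeddings: np.ndarray) -> faiss.Index:
    # FAISS expects contiguous float32; normalize_L2 normalizes the copy in place
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])  # inner product == cosine after normalization
    index.add(vectors)
    return index


def cosine_search(index: faiss.Index, query_vec: np.ndarray, top_k: int = 5):
    # query_vec: shape (1, dim), e.g. the output of model.encode([query])
    query = np.ascontiguousarray(query_vec, dtype="float32")
    faiss.normalize_L2(query)
    similarities, indices = index.search(query, top_k)  # each of shape (1, top_k)
    return similarities[0], indices[0]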
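For completeness, a short usage sketch of the class after this change. It assumes the real `models.schemas`, `config.settings`, and `core.multilingual_manager` modules (or stand-ins like the ones above) are importable; the Vietnamese query string is just an illustrative example.

# Illustrative usage of EnhancedRAGSystem after this commit (not from the repository).
from core.rag_system import EnhancedRAGSystem

rag = EnhancedRAGSystem()  # __init__ seeds the bilingual sample documents

# A Vietnamese query should be routed to the Vietnamese-capable embedding model
for hit in rag.semantic_search("dinh dưỡng cho trẻ em", top_k=3):
    print(hit.similarity, hit.metadata.get("language"), hit.text[:60])

print(rag.get_collection_stats())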