import json
import os
from typing import List

import pandas as pd

from models.schemas import RAGDocument


class WikipediaProcessor:
    """Convert an uploaded .txt, .csv or .json file into a list of text documents."""

    def __init__(self):
        # Lower-case file extensions (leading dot included) this processor handles.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Dispatch an uploaded file to the parser matching its extension.

        Args:
            file_path: Path of the file to read. Only the extension
                (compared case-insensitively) selects the parser.

        Returns:
            The text documents extracted by the selected parser.

        Raises:
            Exception: If the extension is not .txt/.csv/.json, or if the
                selected parser fails. The original error is attached as
                ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        handlers = {
            '.txt': self._process_txt_file,
            '.csv': self._process_csv_file,
            '.json': self._process_json_file,
        }

        try:
            handler = handlers.get(file_ext)
            if handler is None:
                # Raised inside the try on purpose: callers have always seen
                # this message wrapped by the generic prefix below.
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
            return handler(file_path)
        except Exception as e:
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Split a text file into paragraphs.

        Paragraphs are separated by a blank line; each one is stripped and
        kept only if more than 20 characters remain.
        """
        # utf-8-sig reads plain UTF-8 too, and drops a leading BOM that would
        # otherwise stay glued to the first paragraph.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        stripped = (chunk.strip() for chunk in content.split('\n\n'))
        return [paragraph for paragraph in stripped if len(paragraph) > 20]

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Turn every CSV row into one ``"col: value | col: value"`` document.

        Cells that are missing or blank after stripping are skipped; rows with
        no remaining cells produce no document.

        Raises:
            Exception: If the file cannot be read or parsed as CSV. The
                original error is attached as ``__cause__``.
        """
        try:
            df = pd.read_csv(file_path)
            columns = list(df.columns)

            documents = []
            # itertuples keeps each column's own type. iterrows would build a
            # Series per row and upcast, e.g. rendering the integer 5 as "5.0"
            # when another column holds floats.
            for values in df.itertuples(index=False, name=None):
                doc_parts = [
                    f"{col}: {value}"
                    for col, value in zip(columns, values)
                    if pd.notna(value) and str(value).strip()
                ]
                if doc_parts:
                    documents.append(" | ".join(doc_parts))

            return documents
        except Exception as e:
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Collect every string longer than 10 characters from a JSON file.

        Each string is prefixed with the dotted path of dict keys leading to
        it (list indices are not part of the path). Strings found with an
        empty path, e.g. in a top-level list, are emitted without a prefix.

        Raises:
            Exception: If the file cannot be read or parsed as JSON. The
                original error is attached as ``__cause__``.
        """
        try:
            # utf-8-sig: json.load rejects text that still starts with a BOM.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            documents: List[str] = []
            self._collect_json_text(data, "", documents)
            return documents
        except Exception as e:
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e

    def _collect_json_text(self, node, path: str, out: List[str]) -> None:
        """Walk *node* depth-first, appending qualifying strings to *out*."""
        if isinstance(node, dict):
            for key, value in node.items():
                child_path = f"{path}.{key}" if path else key
                self._collect_json_text(value, child_path, out)
        elif isinstance(node, list):
            # List items share their parent's path.
            for item in node:
                self._collect_json_text(item, path, out)
        elif isinstance(node, str):
            text = node.strip()
            if len(text) > 10:
                # Avoid a dangling ": " prefix when there is no key path.
                out.append(f"{path}: {text}" if path else text)