| import os | |
| import json | |
| import pandas as pd | |
| from typing import List | |
| from models.schemas import RAGDocument | |
class WikipediaProcessor:
    """Convert an uploaded Wikipedia data file into a list of text documents.

    The input format is chosen from the file extension. Plain text
    (``.txt``), CSV (``.csv``) and JSON (``.json``) files are handled.
    """

    def __init__(self):
        # Lower-case extensions (with leading dot) that have a handler below.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Process an uploaded file and return the documents extracted from it.

        Args:
            file_path: Path of the file to read. Its extension (compared
                case-insensitively) selects the parser.

        Returns:
            One string per extracted document.

        Raises:
            Exception: If the extension is not supported or the file cannot
                be read/parsed. The original error is attached as
                ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        handlers = {
            '.txt': self._process_txt_file,
            '.csv': self._process_csv_file,
            '.json': self._process_json_file,
        }
        try:
            handler = handlers.get(file_ext)
            if handler is None:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
            return handler(file_path)
        except Exception as e:
            # Callers catch a plain Exception with this message prefix; keep
            # that contract but chain the cause so the traceback stays useful.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Split a text file into paragraphs.

        Paragraphs are separated by a blank line. Each one is stripped of
        surrounding whitespace and kept only if longer than 20 characters.
        """
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise end up glued to the first paragraph.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
        stripped = (chunk.strip() for chunk in content.split('\n\n'))
        return [paragraph for paragraph in stripped if len(paragraph) > 20]

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Turn every CSV row into one ``"col: value | col: value"`` document.

        Cells that are missing (NaN) or blank after stripping are skipped;
        rows with no remaining cells produce no document.

        Raises:
            Exception: If pandas cannot read the file or a row cannot be
                rendered; the original error is chained as the cause.
        """
        try:
            df = pd.read_csv(file_path)
            documents = []
            for _, row in df.iterrows():
                doc_parts = [
                    f"{col}: {row[col]}"
                    for col in df.columns
                    if pd.notna(row[col]) and str(row[col]).strip()
                ]
                if doc_parts:
                    documents.append(" | ".join(doc_parts))
            return documents
        except Exception as e:
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Collect every sufficiently long string value found in a JSON file.

        The structure is walked depth-first. Each string whose stripped
        length exceeds 10 characters becomes ``"<dotted.key.path>: <text>"``.
        List items share the path of the list that contains them.

        Raises:
            Exception: If the file cannot be opened or parsed; the original
                error is chained as the cause.
        """
        try:
            # 'utf-8-sig' accepts files with or without a BOM; json.load
            # rejects text that still starts with the BOM character.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)
            return list(self._iter_json_text(data))
        except Exception as e:
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e

    @staticmethod
    def _iter_json_text(obj, current_path=""):
        """Yield ``"path: text"`` for each long string reachable from *obj*."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                child_path = f"{current_path}.{key}" if current_path else key
                yield from WikipediaProcessor._iter_json_text(value, child_path)
        elif isinstance(obj, list):
            for item in obj:
                yield from WikipediaProcessor._iter_json_text(item, current_path)
        elif isinstance(obj, str) and len(obj.strip()) > 10:
            yield f"{current_path}: {obj.strip()}"