# voicebot/core/wikipedia_processor.py
# Hugging Face file-viewer residue (not part of the module itself):
# uploaded by datbkpro, commit dbf2148 "voicebot offical", file size 2.83 kB.
import os
import json
import pandas as pd
from typing import List
from models.schemas import RAGDocument
class WikipediaProcessor:
    """Convert an uploaded .txt, .csv or .json file into a list of text documents.

    The entry point is :meth:`process_uploaded_file`, which dispatches on the
    file extension to one of the private ``_process_*_file`` helpers. Every
    helper returns a plain ``List[str]`` with one string per extracted document.
    """

    def __init__(self):
        # Extensions (lower-case, with the leading dot) that
        # process_uploaded_file knows how to dispatch.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Process an uploaded file and return the documents extracted from it.

        Args:
            file_path: Path of the file to read. The format is chosen from the
                extension, compared case-insensitively.

        Returns:
            The extracted documents, one string per document.

        Raises:
            Exception: If the extension is not supported or the file cannot be
                processed. The original error is attached as ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Split a UTF-8 text file into paragraphs.

        Paragraphs are separated by a blank line (``'\\n\\n'``). Each one is
        stripped of surrounding whitespace and kept only when the stripped
        text is longer than 20 characters.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        paragraphs = []
        for raw_paragraph in content.split('\n\n'):
            paragraph = raw_paragraph.strip()
            # The length check also rejects empty/whitespace-only paragraphs.
            if len(paragraph) > 20:
                paragraphs.append(paragraph)
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Turn every CSV row into one ``"col: value | col: value"`` document.

        Cells that are missing (NaN) or blank after stripping are skipped, and
        rows with no remaining cells produce no document.

        Raises:
            Exception: If the CSV cannot be read or iterated. The original
                error is attached as ``__cause__``.
        """
        try:
            df = pd.read_csv(file_path)
            documents = []

            for _, row in df.iterrows():
                doc_parts = [
                    f"{col}: {value}"
                    for col, value in row.items()
                    if pd.notna(value) and str(value).strip()
                ]
                if doc_parts:
                    documents.append(" | ".join(doc_parts))

            return documents
        except Exception as e:
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Collect every sufficiently long string found anywhere in a JSON file.

        The JSON value is walked recursively. Dict keys are joined with ``.``
        to build a path; list items inherit the path of the list itself. Each
        string whose stripped length exceeds 10 characters becomes a document
        of the form ``"path: text"``. Strings with no key path (for example
        the items of a top-level array) are emitted as the bare text instead
        of with a dangling ``": "`` prefix.

        Raises:
            Exception: If the file cannot be read or parsed. The original
                error is attached as ``__cause__``.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                # Depth-first walk; non-string scalars are ignored.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for item in obj:
                        extract_text(item, current_path)
                elif isinstance(obj, str):
                    text = obj.strip()
                    if len(text) > 10:
                        documents.append(f"{current_path}: {text}" if current_path else text)

            extract_text(data)
            return documents
        except Exception as e:
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e