# core/wikipedia_processor.py — Wikipedia upload processing for the voicebot project.
import json
import os
import traceback
from typing import List

import pandas as pd
class WikipediaProcessor:
    """Extract text documents from uploaded Wikipedia files.

    Supports plain-text (.txt), CSV (.csv) and JSON (.json) uploads and
    returns their content as a flat list of strings suitable for indexing.
    """

    def __init__(self):
        # Extensions accepted by process_uploaded_file().
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Dispatch an uploaded file to the parser matching its extension.

        Args:
            file_path: Path to the uploaded file.

        Returns:
            List of extracted text documents.

        Raises:
            Exception: If the extension is unsupported or parsing fails.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")
            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            # BUG FIX: `traceback` was used here without being imported,
            # so every failure raised NameError and hid the real error.
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    @staticmethod
    def _split_paragraphs(content: str) -> List[str]:
        """Split raw text into paragraphs, preferring blank-line breaks.

        Falls back to single newlines when no blank-line separator exists;
        fragments of 10 characters or fewer are discarded as noise.
        """
        sep = '\n\n' if '\n\n' in content else '\n'
        return [p.strip() for p in content.split(sep) if len(p.strip()) > 10]

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Parse a plain-text file into paragraphs (UTF-8, latin-1 fallback)."""
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            paragraphs = self._split_paragraphs(content)
            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text")
            return paragraphs
        except UnicodeDecodeError:
            # Some uploads are not UTF-8; retry with latin-1, which accepts
            # any byte sequence. Reuses the same split strategy as above
            # (the original fallback only split on blank lines, silently
            # dropping the single-newline fallback — fixed for consistency).
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            paragraphs = self._split_paragraphs(content)
            print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text (latin-1)")
            return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Flatten each CSV row into one "col: value | col: value" document."""
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []
            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")
            for idx, row in df.iterrows():
                # Keep only non-null, non-blank cells, labelled by column name.
                doc_parts = [
                    f"{col}: {row[col]}"
                    for col in df.columns
                    if pd.notna(row[col]) and str(row[col]).strip()
                ]
                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > 10:  # skip degenerate rows
                        documents.append(full_doc)
                    if idx < 3:  # log a small sample for debugging
                        print(f"📝 Hàng {idx}: {doc_parts}")
            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            # BUG FIX: `traceback` was previously unimported (NameError).
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Recursively flatten a JSON file into "path: value" documents.

        String leaves longer than 10 characters and all numeric/boolean
        leaves are kept, each prefixed with its dotted/indexed JSON path.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            documents: List[str] = []

            def extract_text(obj, current_path=""):
                # Depth-first walk accumulating leaf values into `documents`.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str) and len(obj.strip()) > 10:
                    documents.append(f"{current_path}: {obj.strip()}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            # BUG FIX: `traceback` was previously unimported (NameError).
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e