|
|
import os |
|
|
import json |
|
|
import pandas as pd |
|
|
from typing import List |
|
|
class WikipediaProcessor:
    """Turn an uploaded .txt, .csv or .json file into a list of text documents.

    Each ``_process_*`` method reads one file format and returns a list of
    strings; ``process_uploaded_file`` picks the method from the file
    extension. Progress and errors are reported with ``print``.
    """

    def __init__(self):
        # Extensions (lower-case, with leading dot) this processor can read.
        self.supported_formats = ['.txt', '.csv', '.json']

    @staticmethod
    def _print_traceback(label: str) -> None:
        """Print ``label`` followed by the traceback of the active exception."""
        # Imported locally so this class does not depend on a module-level
        # ``traceback`` import being present.
        import traceback

        print(f"{label}: {traceback.format_exc()}")

    @staticmethod
    def _split_paragraphs(content: str) -> List[str]:
        """Split text into stripped paragraphs longer than 10 characters.

        Blank lines separate paragraphs when the text contains any; otherwise
        every line is treated as its own paragraph.
        """
        separator = '\n\n' if '\n\n' in content else '\n'
        stripped = (chunk.strip() for chunk in content.split(separator))
        return [chunk for chunk in stripped if len(chunk) > 10]

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Process an uploaded file according to its extension.

        Args:
            file_path: Path of the file to read. The extension is compared
                case-insensitively.

        Returns:
            The documents extracted by the format-specific reader.

        Raises:
            Exception: For an unsupported extension or any failure in the
                format-specific reader. The message is prefixed with
                "Lỗi xử lý file: " and the original error is chained as
                ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        readers = {
            '.txt': self._process_txt_file,
            '.csv': self._process_csv_file,
            '.json': self._process_json_file,
        }

        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")
            reader = readers.get(file_ext)
            if reader is None:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
            return reader(file_path)
        except Exception as e:
            self._print_traceback("❌ Lỗi xử lý file")
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Read a text file and return its paragraphs.

        The file is decoded as UTF-8; if that fails it is re-read as latin-1.
        Both encodings go through the same paragraph splitting, so a file
        without blank lines is split per line in either case.

        Args:
            file_path: Path of the text file.

        Returns:
            Stripped paragraphs longer than 10 characters.
        """
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            note = ""
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this read cannot
            # fail on decoding.
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            note = " (latin-1)"

        paragraphs = self._split_paragraphs(content)
        print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text{note}")
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Read a CSV file and return one document per non-empty row.

        Each document joins the row's non-null, non-blank cells as
        ``"column: value"`` parts separated by ``" | "``. Documents of 10
        characters or fewer are dropped.

        Args:
            file_path: Path of the CSV file.

        Returns:
            The row documents, in file order.

        Raises:
            Exception: On any read or parse failure, with the message
                prefixed by "Lỗi đọc CSV: " and the original error chained.
        """
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []

            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")

            for idx, row in df.iterrows():
                doc_parts = [
                    f"{col}: {row[col]}"
                    for col in df.columns
                    if pd.notna(row[col]) and str(row[col]).strip()
                ]
                if not doc_parts:
                    continue

                full_doc = " | ".join(doc_parts)
                if len(full_doc) > 10:
                    documents.append(full_doc)

                # Preview rows whose index label is below 3 (the first three
                # rows under read_csv's default integer index).
                if idx < 3:
                    print(f"📝 Hàng {idx}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            self._print_traceback("❌ Lỗi đọc CSV")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Read a JSON file and return one document per leaf value.

        The parsed structure is walked depth-first. Strings longer than 10
        characters (after stripping) and all numbers and booleans become
        ``"path: value"`` documents, where the path uses ``.`` between dict
        keys and ``[i]`` for list positions. ``null`` values and shorter
        strings are skipped.

        Args:
            file_path: Path of the UTF-8 encoded JSON file.

        Returns:
            The leaf documents, in traversal order.

        Raises:
            Exception: On any read or parse failure, with the message
                prefixed by "Lỗi đọc JSON: " and the original error chained.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                """Append the leaf values found under ``obj`` to ``documents``."""
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        child_path = f"{current_path}.{key}" if current_path else key
                        extract_text(value, child_path)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str):
                    text = obj.strip()
                    if len(text) > 10:
                        documents.append(f"{current_path}: {text}")
                elif isinstance(obj, (bool, int, float)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            self._print_traceback("❌ Lỗi đọc JSON")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e