File size: 4,912 Bytes
502e29f dbf2148 5d11746 dbf2148 6469f7e dbf2148 5d11746 dbf2148 5d11746 dbf2148 5d11746 dbf2148 6469f7e 5d11746 dbf2148 5d11746 6469f7e dbf2148 5d11746 6469f7e dbf2148 5d11746 dbf2148 6469f7e dbf2148 5d11746 6469f7e dbf2148 6469f7e dbf2148 6469f7e dbf2148 6469f7e dbf2148 5d11746 6469f7e dbf2148 5d11746 dbf2148 5d11746 dbf2148 6469f7e 5d11746 6469f7e dbf2148 6469f7e 5d11746 dbf2148 6469f7e dbf2148 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import json
import os
import traceback
from typing import List

import pandas as pd
class WikipediaProcessor:
    """Turn an uploaded .txt, .csv or .json file into a list of text documents.

    Each supported format has its own extraction strategy:

    * ``.txt``  -- the content is split into paragraphs.
    * ``.csv``  -- every row becomes one ``"col: value | col: value"`` string.
    * ``.json`` -- every leaf value becomes one ``"path: value"`` string.

    Fragments of 10 characters or fewer are discarded by the text and CSV
    extractors, and short string leaves are discarded by the JSON extractor.
    """

    def __init__(self):
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Extract documents from an uploaded file, dispatching on its extension.

        Args:
            file_path: Path of the file to read. The extension is compared
                case-insensitively.

        Returns:
            The extracted documents, in file order.

        Raises:
            Exception: If the extension is not supported or the underlying
                reader fails. The original error is attached as ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")
            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            # Chain the cause so callers can still inspect the real failure.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    @staticmethod
    def _split_paragraphs(content: str) -> List[str]:
        """Split raw text into stripped paragraphs longer than 10 characters.

        Blank-line separators are preferred; if the text contains none, each
        non-empty line is treated as its own paragraph.
        """
        separator = '\n\n' if '\n\n' in content else '\n'
        stripped = (chunk.strip() for chunk in content.split(separator))
        return [chunk for chunk in stripped if len(chunk) > 10]

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Read a plain-text file and return its paragraphs.

        The file is decoded as UTF-8; if that fails it is re-read as latin-1.
        Both encodings go through the same paragraph-splitting logic.

        Args:
            file_path: Path of the text file.

        Returns:
            Paragraphs longer than 10 characters, in file order.
        """
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            encoding_note = ""
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this read cannot
            # raise a decode error.
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            encoding_note = " (latin-1)"

        paragraphs = self._split_paragraphs(content)
        print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text{encoding_note}")
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Read a CSV file and return one document string per usable row.

        Each row is rendered as ``"col: value | col: value"`` using only the
        cells that are neither NaN nor blank. Rows whose rendered text is 10
        characters or shorter are dropped.

        Args:
            file_path: Path of the CSV file.

        Returns:
            One document per kept row, in file order.

        Raises:
            Exception: If the file cannot be read or parsed. The original
                error is attached as ``__cause__``.
        """
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []
            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")

            for idx, row in df.iterrows():
                doc_parts = []
                for col in df.columns:
                    if pd.notna(row[col]) and str(row[col]).strip():
                        doc_parts.append(f"{col}: {row[col]}")

                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > 10:  # Ensure minimum length
                        documents.append(full_doc)
                        if idx < 3:  # Log first few rows
                            print(f"📝 Hàng {idx}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Read a JSON file and return one ``"path: value"`` string per leaf.

        The structure is walked depth-first. Paths use dotted keys for dicts
        and ``[i]`` suffixes for list items. String leaves are kept only when
        longer than 10 characters after stripping; int, float and bool leaves
        are always kept; ``null`` leaves are ignored.

        Args:
            file_path: Path of the UTF-8 encoded JSON file.

        Returns:
            The rendered leaves in traversal order.

        Raises:
            Exception: If the file cannot be read or parsed. The original
                error is attached as ``__cause__``.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                # Recurse into containers; record leaves with their path.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str) and len(obj.strip()) > 10:
                    documents.append(f"{current_path}: {obj.strip()}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e