File size: 4,912 Bytes
502e29f
 
 
 
dbf2148
 
 
 
 
5d11746
dbf2148
 
 
6469f7e
 
dbf2148
5d11746
dbf2148
5d11746
dbf2148
5d11746
dbf2148
 
 
6469f7e
5d11746
dbf2148
 
5d11746
6469f7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbf2148
 
5d11746
6469f7e
dbf2148
5d11746
dbf2148
 
6469f7e
 
 
dbf2148
 
 
5d11746
6469f7e
dbf2148
6469f7e
 
 
 
 
 
dbf2148
6469f7e
dbf2148
 
6469f7e
dbf2148
 
 
5d11746
6469f7e
dbf2148
 
 
 
 
 
5d11746
dbf2148
 
5d11746
dbf2148
6469f7e
 
5d11746
 
6469f7e
 
dbf2148
 
6469f7e
5d11746
dbf2148
6469f7e
dbf2148
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import os
import traceback
from typing import List

import pandas as pd
class WikipediaProcessor:
    """Convert an uploaded .txt, .csv or .json file into a list of text documents.

    Each supported format has its own private reader; ``process_uploaded_file``
    picks the reader from the file extension. Progress and errors are reported
    with ``print``.
    """

    def __init__(self):
        # Extensions that process_uploaded_file dispatches on.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Read ``file_path`` and return the documents extracted from it.

        The reader is chosen from the lower-cased file extension.

        Args:
            file_path: Path of the file to read.

        Returns:
            The list of extracted text documents.

        Raises:
            Exception: If the extension is not supported or the reader fails.
                The original error is attached as ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")

            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            # Callers catch the generic Exception type, so keep it, but chain
            # the cause so the underlying error is not lost.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    @staticmethod
    def _split_paragraphs(content: str) -> List[str]:
        """Split text into stripped paragraphs longer than 10 characters.

        Blank-line separation is preferred; when the text has no blank line,
        every line is treated as its own paragraph.
        """
        separator = '\n\n' if '\n\n' in content else '\n'
        stripped = (chunk.strip() for chunk in content.split(separator))
        # len > 10 also rejects empty chunks, so no separate emptiness check.
        return [chunk for chunk in stripped if len(chunk) > 10]

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Read a text file and return its paragraphs.

        The file is decoded as UTF-8; if that fails it is re-read as latin-1.
        Both encodings use the same paragraph-splitting rules.
        """
        print(f"📖 Đọc file text: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            encoding_note = ""
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this read cannot fail
            # on decoding.
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            encoding_note = " (latin-1)"

        paragraphs = self._split_paragraphs(content)
        print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text{encoding_note}")
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Read a CSV file and return one ``"col: value | ..."`` document per row.

        Cells that are NaN or blank after stripping are left out. A row only
        produces a document when the joined text is longer than 10 characters.

        Raises:
            Exception: If the CSV cannot be read or processed. The original
                error is attached as ``__cause__``.
        """
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []

            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")

            # Use the row position rather than the index label for the logging
            # cut-off: the label may be a string when pandas infers an index
            # column, and comparing a string with 3 raises TypeError.
            for position, (_, row) in enumerate(df.iterrows()):
                doc_parts = [
                    f"{col}: {row[col]}"
                    for col in df.columns
                    if pd.notna(row[col]) and str(row[col]).strip()
                ]

                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > 10:  # Ensure minimum length
                        documents.append(full_doc)

                if position < 3:  # Log first few rows
                    print(f"📝 Hàng {position}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Read a JSON file and return one ``"path: value"`` document per leaf.

        The structure is walked recursively. Dict keys are joined with ``.``
        and list positions are written as ``[i]``. Strings are kept only when
        longer than 10 characters after stripping; ints, floats and bools are
        always kept; ``null`` values are skipped.

        Raises:
            Exception: If the file cannot be read or parsed. The original
                error is attached as ``__cause__``.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []

            def extract_text(obj, current_path=""):
                # Depth-first walk that appends leaf values to `documents`.
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str) and len(obj.strip()) > 10:
                    documents.append(f"{current_path}: {obj.strip()}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e