import json
import os
import traceback
from typing import List

import pandas as pd
class WikipediaProcessor:
    """Convert an uploaded ``.txt``, ``.csv`` or ``.json`` file into text documents.

    ``process_uploaded_file`` picks a private reader based on the file
    extension. Progress and error details are reported with ``print``.
    """

    # A piece of text must be strictly longer than this to be kept.
    MIN_TEXT_LENGTH = 10

    def __init__(self):
        # Extensions that ``process_uploaded_file`` knows how to dispatch.
        self.supported_formats = ['.txt', '.csv', '.json']

    def process_uploaded_file(self, file_path: str) -> List[str]:
        """Read ``file_path`` and return the text documents extracted from it.

        Args:
            file_path: Path of the file to read. The extension (compared
                case-insensitively) selects the reader.

        Returns:
            The list of extracted document strings.

        Raises:
            Exception: On any failure, including an unsupported extension.
                The original error is attached as ``__cause__``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            print(f"🔄 Đang xử lý file: {file_path}, định dạng: {file_ext}")

            if file_ext == '.txt':
                return self._process_txt_file(file_path)
            elif file_ext == '.csv':
                return self._process_csv_file(file_path)
            elif file_ext == '.json':
                return self._process_json_file(file_path)
            else:
                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
        except Exception as e:
            print(f"❌ Lỗi xử lý file: {traceback.format_exc()}")
            # Keep the generic Exception type callers already catch, but chain
            # the cause so the original error stays inspectable.
            raise Exception(f"Lỗi xử lý file: {str(e)}") from e

    @classmethod
    def _split_paragraphs(cls, content: str) -> List[str]:
        """Split ``content`` into stripped paragraphs longer than MIN_TEXT_LENGTH.

        Blank-line separators are used when the text contains any; otherwise
        every line is treated as its own paragraph.
        """
        separator = '\n\n' if '\n\n' in content else '\n'
        stripped_parts = (part.strip() for part in content.split(separator))
        return [part for part in stripped_parts if len(part) > cls.MIN_TEXT_LENGTH]

    def _process_txt_file(self, file_path: str) -> List[str]:
        """Read a plain-text file and return its paragraphs.

        The file is decoded as UTF-8 first and as latin-1 if that fails. Both
        paths use the same paragraph splitting rules.
        """
        print(f"📖 Đọc file text: {file_path}")
        try:
            # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
            # BOM, which would otherwise end up inside the first paragraph.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()
            encoding_note = ""
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so decoding cannot fail.
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()
            encoding_note = " (latin-1)"

        paragraphs = self._split_paragraphs(content)
        print(f"✅ Đã trích xuất {len(paragraphs)} đoạn văn từ file text{encoding_note}")
        return paragraphs

    def _process_csv_file(self, file_path: str) -> List[str]:
        """Read a CSV file and return one ``"col: value | ..."`` string per row.

        Cells that are missing or blank after stripping are left out of the
        row's document. Rows whose joined text is not longer than
        MIN_TEXT_LENGTH are dropped.

        Raises:
            Exception: If the file cannot be read or parsed; the original
                error is attached as ``__cause__``.
        """
        print(f"📊 Đọc file CSV: {file_path}")
        try:
            df = pd.read_csv(file_path)
            documents = []

            print(f"📋 CSV có {len(df)} hàng và {len(df.columns)} cột")

            # Use the row position rather than the index label for the preview,
            # so the "first three rows" check does not depend on the index type.
            for position, (_, row) in enumerate(df.iterrows()):
                doc_parts = [
                    f"{col}: {value}"
                    for col, value in row.items()
                    if pd.notna(value) and str(value).strip()
                ]

                if doc_parts:
                    full_doc = " | ".join(doc_parts)
                    if len(full_doc) > self.MIN_TEXT_LENGTH:
                        documents.append(full_doc)

                if position < 3:  # Log the first few rows only
                    print(f"📝 Hàng {position}: {doc_parts}")

            print(f"✅ Đã trích xuất {len(documents)} documents từ CSV")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc CSV: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc CSV: {str(e)}") from e

    def _process_json_file(self, file_path: str) -> List[str]:
        """Read a JSON file and return one ``"path: value"`` string per leaf.

        The structure is walked recursively. Strings longer than
        MIN_TEXT_LENGTH (after stripping) and all numbers and booleans are
        emitted, each prefixed with its dotted/indexed path. Other leaves
        (short strings, ``null``) are skipped.

        Raises:
            Exception: If the file cannot be read or parsed; the original
                error is attached as ``__cause__``.
        """
        print(f"📄 Đọc file JSON: {file_path}")
        try:
            # "utf-8-sig" tolerates a leading BOM, which json.load rejects.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            documents = []
            min_length = self.MIN_TEXT_LENGTH

            def extract_text(obj, current_path=""):
                if isinstance(obj, dict):
                    for key, value in obj.items():
                        extract_text(value, f"{current_path}.{key}" if current_path else key)
                elif isinstance(obj, list):
                    for i, item in enumerate(obj):
                        extract_text(item, f"{current_path}[{i}]")
                elif isinstance(obj, str):
                    text = obj.strip()
                    if len(text) > min_length:
                        documents.append(f"{current_path}: {text}")
                elif isinstance(obj, (int, float, bool)):
                    documents.append(f"{current_path}: {obj}")

            extract_text(data)
            print(f"✅ Đã trích xuất {len(documents)} documents từ JSON")
            return documents
        except Exception as e:
            print(f"❌ Lỗi đọc JSON: {traceback.format_exc()}")
            raise Exception(f"Lỗi đọc JSON: {str(e)}") from e