"""
Data loader for agricultural intervention data.
Handles loading and preprocessing of CSV and Excel files.
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Optional, Union
import os
from datasets import Dataset
from huggingface_hub import HfApi


class AgriculturalDataLoader:
    """Load, preprocess and query agricultural intervention data.

    Files named ``Interventions-*.csv`` / ``Interventions-*.xlsx`` found
    directly inside ``data_path`` are read (their first row is skipped),
    concatenated, preprocessed and cached on the instance.
    """

    # NOTE: machine-specific fallback kept for backward compatibility;
    # callers should pass ``data_path`` explicitly.
    DEFAULT_DATA_PATH = "/Users/tracyandre/Downloads/OneDrive_1_9-17-2025"
    DEFAULT_DATASET_ID = "HackathonCRA/2024"

    # Derived (English) column name -> raw export column it is copied from.
    _COLUMN_ALIASES = {
        'crop_type': 'libelleusag',
        'intervention_type': 'libevenem',
        'product_family': 'familleprod',
        'plot_name': 'nomparc',
        'plot_number': 'numparcell',
        'plot_surface': 'surfparc',
    }

    def __init__(self,
                 data_path: Optional[str] = None,
                 hf_token: Optional[str] = None,
                 dataset_id: Optional[str] = None):
        """Configure the loader.

        Args:
            data_path: Directory holding the intervention files. Falls back
                to ``DEFAULT_DATA_PATH`` when omitted or empty.
            hf_token: Hugging Face token. Falls back to the ``HF_TOKEN``
                environment variable when omitted or empty.
            dataset_id: Target Hugging Face dataset repository. Falls back
                to ``DEFAULT_DATASET_ID`` when omitted or empty.
        """
        self.data_path = data_path or self.DEFAULT_DATA_PATH
        self.hf_token = hf_token or os.environ.get("HF_TOKEN")
        self.dataset_id = dataset_id or self.DEFAULT_DATASET_ID
        self.data_cache = {}

    def load_all_files(self) -> pd.DataFrame:
        """Load all intervention files and combine them.

        The combined frame is cached under ``data_cache['combined_data']``;
        later calls return that same object.

        Returns:
            The preprocessed concatenation of every file that could be read.

        Raises:
            ValueError: If no file matched, or every matching file failed
                to load.
            KeyError: If the combined data lacks a required column (see
                ``_preprocess_data``).
        """
        if 'combined_data' in self.data_cache:
            return self.data_cache['combined_data']

        data_path = Path(self.data_path)
        readers = (
            ("Interventions-*.csv", pd.read_csv),
            ("Interventions-*.xlsx", pd.read_excel),
        )

        all_dataframes = []
        files_found = 0
        for pattern, reader in readers:
            # Sort so that row order does not depend on filesystem order.
            for file_path in sorted(data_path.glob(pattern)):
                files_found += 1
                try:
                    df = reader(file_path, skiprows=1)  # Skip the first header row
                except Exception as e:
                    # Best effort: one unreadable file must not block the rest.
                    print(f"Error loading {file_path}: {e}")
                    continue
                all_dataframes.append(df)
                print(f"Loaded {file_path.name}: {len(df)} rows")

        if not all_dataframes:
            if files_found:
                raise ValueError(
                    f"No data files found: all {files_found} matching "
                    f"file(s) in {data_path} failed to load"
                )
            raise ValueError("No data files found")

        combined_df = pd.concat(all_dataframes, ignore_index=True)
        combined_df = self._preprocess_data(combined_df)
        self.data_cache['combined_data'] = combined_df
        return combined_df

    def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the agricultural data.

        Converts date and numeric columns (when present), adds the derived
        columns ``year``, ``crop_type``, ``intervention_type``,
        ``product_family``, ``plot_name``, ``plot_number``, ``plot_surface``
        and the boolean flags ``is_herbicide``, ``is_fungicide``,
        ``is_insecticide``.

        Args:
            df: Raw combined data. It is modified in place.

        Returns:
            The same ``df`` object, with the conversions and new columns.

        Raises:
            KeyError: If any raw column needed for the derived columns is
                missing.
        """
        required = ['millesime', *self._COLUMN_ALIASES.values()]
        missing = [col for col in required if col not in df.columns]
        if missing:
            # Fail before mutating anything, naming every absent column.
            raise KeyError(f"Missing required column(s): {missing}")

        # Convert date columns; unparseable values become NaT.
        for col in ('datedebut', 'datefin'):
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], format='%d/%m/%y', errors='coerce')

        # Convert numeric columns; unparseable values become NaN.
        numeric_columns = ['surfparc', 'quantitetot', 'neffqte', 'peffqte', 'kqte',
                           'teneurn', 'teneurp', 'teneurk', 'keq', 'volumebo']
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Year is coerced to numeric so that it agrees with the integer
        # years returned by get_years_available / accepted by filter_data.
        df['year'] = pd.to_numeric(df['millesime'], errors='coerce')
        for derived, source in self._COLUMN_ALIASES.items():
            df[derived] = df[source]

        # Flag rows by product family (plain substring match, missing -> False).
        family = df['familleprod']
        df['is_herbicide'] = family.str.contains('Herbicides', na=False, regex=False)
        df['is_fungicide'] = family.str.contains('Fongicides', na=False, regex=False)
        df['is_insecticide'] = family.str.contains('Insecticides', na=False, regex=False)

        return df

    def get_years_available(self) -> List[int]:
        """Return the sorted distinct years present in the data."""
        df = self.load_all_files()
        return sorted(df['year'].dropna().unique().astype(int).tolist())

    def get_plots_available(self) -> List[str]:
        """Return the sorted distinct plot names present in the data."""
        df = self.load_all_files()
        return sorted(df['plot_name'].dropna().unique().tolist())

    def get_crops_available(self) -> List[str]:
        """Return the sorted distinct crop types present in the data."""
        df = self.load_all_files()
        return sorted(df['crop_type'].dropna().unique().tolist())

    def filter_data(self,
                    years: Optional[List[int]] = None,
                    plots: Optional[List[str]] = None,
                    crops: Optional[List[str]] = None,
                    intervention_types: Optional[List[str]] = None) -> pd.DataFrame:
        """Filter the data based on criteria.

        Each criterion that is ``None`` or empty is ignored; the others are
        combined with AND.

        Args:
            years: Keep rows whose ``year`` is in this list.
            plots: Keep rows whose ``plot_name`` is in this list.
            crops: Keep rows whose ``crop_type`` is in this list.
            intervention_types: Keep rows whose ``intervention_type`` is in
                this list.

        Returns:
            The matching rows. When no criterion is given this is the cached
            frame itself, so callers should copy before mutating it.
        """
        df = self.load_all_files()

        criteria = (
            ('year', years),
            ('plot_name', plots),
            ('crop_type', crops),
            ('intervention_type', intervention_types),
        )
        for column, allowed in criteria:
            if allowed:
                df = df[df[column].isin(allowed)]

        return df

    def get_herbicide_usage(self, years: Optional[List[int]] = None) -> pd.DataFrame:
        """Get herbicide usage data for weed pressure analysis.

        Args:
            years: Optional list of years to restrict the summary to.

        Returns:
            One row per (``plot_name``, ``year``, ``crop_type``) with:
            ``total_quantity`` (sum of ``quantitetot``),
            ``num_applications`` (count of non-null ``produit``),
            ``plot_surface`` (first ``surfparc`` of the group) and
            ``ift_herbicide`` (``num_applications / plot_surface``; NaN when
            the surface is missing or not strictly positive).
        """
        df = self.filter_data(years=years)
        herbicide_rows = df[df['is_herbicide']]

        # Named aggregation ties each output column to its source explicitly
        # instead of relying on positional renaming.
        usage_summary = herbicide_rows.groupby(['plot_name', 'year', 'crop_type']).agg(
            total_quantity=('quantitetot', 'sum'),
            num_applications=('produit', 'count'),  # Number of herbicide applications
            plot_surface=('surfparc', 'first'),
        ).reset_index()

        # Avoid inf from a zero surface: non-positive surfaces become NaN.
        surface = usage_summary['plot_surface']
        usage_summary['ift_herbicide'] = (
            usage_summary['num_applications'] / surface.where(surface > 0)
        )

        return usage_summary

    def upload_to_huggingface(self, private: bool = False) -> str:
        """Upload the combined data to the configured Hugging Face dataset.

        Args:
            private: Passed to ``push_to_hub``. Defaults to ``False``, which
                matches the previous hard-coded value.

        Returns:
            A confirmation message naming the dataset repository.

        Raises:
            ValueError: If no Hugging Face token is configured.
        """
        if not self.hf_token:
            raise ValueError("HF_TOKEN not provided")

        df = self.load_all_files()
        dataset = Dataset.from_pandas(df)

        # Upload to Hugging Face
        dataset.push_to_hub(
            repo_id=self.dataset_id,
            token=self.hf_token,
            private=private
        )

        return f"Data uploaded to {self.dataset_id}"