""" Data loader for agricultural intervention data. Handles loading and preprocessing of CSV and Excel files. """ import pandas as pd import numpy as np from pathlib import Path from typing import List, Dict, Optional, Union import os from datasets import Dataset from huggingface_hub import HfApi class AgriculturalDataLoader: """Loads and preprocesses agricultural intervention data.""" def __init__(self, data_path: str = None, hf_token: str = None, dataset_id: str = None): self.data_path = data_path or "/Users/tracyandre/Downloads/OneDrive_1_9-17-2025" self.hf_token = hf_token or os.environ.get("HF_TOKEN") self.dataset_id = dataset_id or "HackathonCRA/2024" self.data_cache = {} def load_all_files(self) -> pd.DataFrame: """Load all intervention files and combine them.""" if 'combined_data' in self.data_cache: return self.data_cache['combined_data'] data_files = [] data_path = Path(self.data_path) # Get all CSV and Excel files csv_files = list(data_path.glob("Interventions-*.csv")) xlsx_files = list(data_path.glob("Interventions-*.xlsx")) all_dataframes = [] # Load CSV files for file_path in csv_files: try: df = pd.read_csv(file_path, skiprows=1) # Skip the first header row all_dataframes.append(df) print(f"Loaded {file_path.name}: {len(df)} rows") except Exception as e: print(f"Error loading {file_path}: {e}") # Load Excel files for file_path in xlsx_files: try: df = pd.read_excel(file_path, skiprows=1) # Skip the first header row all_dataframes.append(df) print(f"Loaded {file_path.name}: {len(df)} rows") except Exception as e: print(f"Error loading {file_path}: {e}") # Combine all dataframes if all_dataframes: combined_df = pd.concat(all_dataframes, ignore_index=True) combined_df = self._preprocess_data(combined_df) self.data_cache['combined_data'] = combined_df return combined_df else: raise ValueError("No data files found") def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame: """Preprocess the agricultural data.""" # Convert date columns date_columns = ['datedebut', 'datefin'] for col in date_columns: if col in df.columns: df[col] = pd.to_datetime(df[col], format='%d/%m/%y', errors='coerce') # Convert numeric columns numeric_columns = ['surfparc', 'quantitetot', 'neffqte', 'peffqte', 'kqte', 'teneurn', 'teneurp', 'teneurk', 'keq', 'volumebo'] for col in numeric_columns: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce') # Add derived columns df['year'] = df['millesime'] df['crop_type'] = df['libelleusag'] df['intervention_type'] = df['libevenem'] df['product_family'] = df['familleprod'] df['plot_name'] = df['nomparc'] df['plot_number'] = df['numparcell'] df['plot_surface'] = df['surfparc'] # Calculate IFT (Treatment Frequency Index) for herbicides df['is_herbicide'] = df['familleprod'].str.contains('Herbicides', na=False) df['is_fungicide'] = df['familleprod'].str.contains('Fongicides', na=False) df['is_insecticide'] = df['familleprod'].str.contains('Insecticides', na=False) return df def get_years_available(self) -> List[int]: """Get list of available years in the data.""" df = self.load_all_files() return sorted(df['year'].dropna().unique().astype(int).tolist()) def get_plots_available(self) -> List[str]: """Get list of available plots.""" df = self.load_all_files() return sorted(df['plot_name'].dropna().unique().tolist()) def get_crops_available(self) -> List[str]: """Get list of available crop types.""" df = self.load_all_files() return sorted(df['crop_type'].dropna().unique().tolist()) def filter_data(self, years: Optional[List[int]] = None, plots: Optional[List[str]] = None, crops: Optional[List[str]] = None, intervention_types: Optional[List[str]] = None) -> pd.DataFrame: """Filter the data based on criteria.""" df = self.load_all_files() if years: df = df[df['year'].isin(years)] if plots: df = df[df['plot_name'].isin(plots)] if crops: df = df[df['crop_type'].isin(crops)] if intervention_types: df = df[df['intervention_type'].isin(intervention_types)] return df def get_herbicide_usage(self, years: Optional[List[int]] = None) -> pd.DataFrame: """Get herbicide usage data for weed pressure analysis.""" df = self.filter_data(years=years) herbicide_data = df[df['is_herbicide'] == True].copy() # Group by plot, year, and crop usage_summary = herbicide_data.groupby(['plot_name', 'year', 'crop_type']).agg({ 'quantitetot': 'sum', 'produit': 'count', # Number of herbicide applications 'surfparc': 'first' }).reset_index() usage_summary.columns = ['plot_name', 'year', 'crop_type', 'total_quantity', 'num_applications', 'plot_surface'] usage_summary['ift_herbicide'] = usage_summary['num_applications'] / usage_summary['plot_surface'] return usage_summary def upload_to_huggingface(self) -> str: """Upload data to Hugging Face dataset.""" if not self.hf_token: raise ValueError("HF_TOKEN not provided") df = self.load_all_files() dataset = Dataset.from_pandas(df) # Upload to Hugging Face dataset.push_to_hub( repo_id=self.dataset_id, token=self.hf_token, private=False ) return f"Data uploaded to {self.dataset_id}"