Tracy André committed on
Commit
8ecf5f5
·
1 Parent(s): 5977115

Remove local file loading - HF dataset only

Browse files

πŸ—‘οΈ Removed Features:
- Removed local file loading capabilities
- Removed automatic fallback mechanism
- Removed factory methods for local loading
- Removed data_path parameter
- Removed use_hf parameter

✨ Simplified Architecture:
- AgriculturalDataLoader now loads exclusively from HF datasets
- Simplified constructor (only hf_token and dataset_id)
- Direct load_from_huggingface() call
- Clear error messages when HF loading fails

🔧 Updated Components:
- Updated gradio_app.py for HF-only usage
- Simplified data_loader.py interface
- Added test_hf_only.py validation script
- Removed unused imports (pathlib, etc.)

🎯 Benefits:
- Cleaner, simpler codebase
- Production-ready for HF Spaces
- No local file dependencies
- Clear separation of concerns
- Easier deployment and maintenance

Files changed (3) hide show
  1. data_loader.py +13 -105
  2. gradio_app.py +3 -7
  3. test_hf_only.py +155 -0
data_loader.py CHANGED
@@ -1,44 +1,33 @@
1
  """
2
  Data loader for agricultural intervention data.
3
- Handles loading and preprocessing of CSV and Excel files.
4
  """
5
 
6
  import pandas as pd
7
  import numpy as np
8
- from pathlib import Path
9
- from typing import List, Dict, Optional, Union
10
  import os
11
  from datasets import Dataset, load_dataset
12
  from huggingface_hub import HfApi
13
 
14
 
15
  class AgriculturalDataLoader:
16
- """Loads and preprocesses agricultural intervention data."""
17
 
18
- def __init__(self, data_path: str = None, hf_token: str = None, dataset_id: str = None, use_hf: bool = False):
19
- self.data_path = data_path or "/Users/tracyandre/Downloads/OneDrive_1_9-17-2025"
20
  self.hf_token = hf_token or os.environ.get("HF_TOKEN")
21
  self.dataset_id = dataset_id or "HackathonCRA/2024"
22
- self.use_hf = use_hf # Flag to use Hugging Face dataset
23
  self.data_cache = {}
24
 
25
  def load_all_files(self) -> pd.DataFrame:
26
- """Load all intervention files and combine them."""
27
  if 'combined_data' in self.data_cache:
28
  return self.data_cache['combined_data']
29
 
30
- # Try to load from Hugging Face first if enabled
31
- if self.use_hf:
32
- try:
33
- df = self.load_from_huggingface()
34
- self.data_cache['combined_data'] = df
35
- return df
36
- except Exception as e:
37
- print(f"Failed to load from Hugging Face: {e}")
38
- print("Falling back to local files...")
39
-
40
- # Load from local files (original method)
41
- return self._load_local_files()
42
 
43
  def load_from_huggingface(self) -> pd.DataFrame:
44
  """Load data from Hugging Face dataset."""
@@ -62,45 +51,6 @@ class AgriculturalDataLoader:
62
  except Exception as e:
63
  raise ValueError(f"Failed to load dataset from Hugging Face: {e}")
64
 
65
- def _load_local_files(self) -> pd.DataFrame:
66
- """Load data from local CSV/Excel files."""
67
- print(f"πŸ“ Loading local files from: {self.data_path}")
68
-
69
- data_files = []
70
- data_path = Path(self.data_path)
71
-
72
- # Get all CSV and Excel files
73
- csv_files = list(data_path.glob("Interventions-*.csv"))
74
- xlsx_files = list(data_path.glob("Interventions-*.xlsx"))
75
-
76
- all_dataframes = []
77
-
78
- # Load CSV files
79
- for file_path in csv_files:
80
- try:
81
- df = pd.read_csv(file_path, skiprows=1) # Skip the first header row
82
- all_dataframes.append(df)
83
- print(f"Loaded {file_path.name}: {len(df)} rows")
84
- except Exception as e:
85
- print(f"Error loading {file_path}: {e}")
86
-
87
- # Load Excel files
88
- for file_path in xlsx_files:
89
- try:
90
- df = pd.read_excel(file_path, skiprows=1) # Skip the first header row
91
- all_dataframes.append(df)
92
- print(f"Loaded {file_path.name}: {len(df)} rows")
93
- except Exception as e:
94
- print(f"Error loading {file_path}: {e}")
95
-
96
- # Combine all dataframes
97
- if all_dataframes:
98
- combined_df = pd.concat(all_dataframes, ignore_index=True)
99
- combined_df = self._preprocess_data(combined_df)
100
- return combined_df
101
- else:
102
- raise ValueError("No data files found")
103
-
104
  def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
105
  """Preprocess the agricultural data."""
106
  # Convert date columns
@@ -200,49 +150,7 @@ class AgriculturalDataLoader:
200
 
201
  return f"Data uploaded to {self.dataset_id}"
202
 
203
- def set_data_source(self, use_hf: bool = True, clear_cache: bool = True):
204
- """
205
- Switch between Hugging Face and local file data sources.
206
-
207
- Args:
208
- use_hf: If True, use Hugging Face dataset. If False, use local files.
209
- clear_cache: If True, clear cached data to force reload from new source.
210
- """
211
- self.use_hf = use_hf
212
- if clear_cache:
213
- self.data_cache.clear()
214
- print(f"πŸ“‹ Switched to {'Hugging Face' if use_hf else 'local files'} data source")
215
-
216
- @classmethod
217
- def create_hf_loader(cls, dataset_id: str = "HackathonCRA/2024", hf_token: str = None):
218
- """
219
- Factory method to create a loader configured for Hugging Face.
220
-
221
- Args:
222
- dataset_id: Hugging Face dataset identifier
223
- hf_token: Hugging Face token (optional, will use environment variable)
224
-
225
- Returns:
226
- AgriculturalDataLoader configured for HF
227
- """
228
- return cls(
229
- dataset_id=dataset_id,
230
- hf_token=hf_token,
231
- use_hf=True
232
- )
233
-
234
- @classmethod
235
- def create_local_loader(cls, data_path: str):
236
- """
237
- Factory method to create a loader configured for local files.
238
-
239
- Args:
240
- data_path: Path to local data directory
241
-
242
- Returns:
243
- AgriculturalDataLoader configured for local files
244
- """
245
- return cls(
246
- data_path=data_path,
247
- use_hf=False
248
- )
 
1
  """
2
  Data loader for agricultural intervention data.
3
+ Loads data exclusively from Hugging Face datasets.
4
  """
5
 
6
  import pandas as pd
7
  import numpy as np
8
+ from typing import List, Optional
 
9
  import os
10
  from datasets import Dataset, load_dataset
11
  from huggingface_hub import HfApi
12
 
13
 
14
  class AgriculturalDataLoader:
15
+ """Loads and preprocesses agricultural intervention data from Hugging Face datasets."""
16
 
17
def __init__(self, hf_token: str = None, dataset_id: str = None):
    """Configure the loader for a Hugging Face dataset.

    Args:
        hf_token: Hugging Face access token; when omitted, falls back to
            the HF_TOKEN environment variable.
        dataset_id: Dataset identifier on the Hub; when omitted, defaults
            to "HackathonCRA/2024".
    """
    # Explicit argument wins; otherwise consult the environment.
    self.hf_token = hf_token if hf_token else os.environ.get("HF_TOKEN")
    self.dataset_id = dataset_id if dataset_id else "HackathonCRA/2024"
    # Cache of previously loaded dataframes, keyed by logical name.
    self.data_cache = {}
21
 
22
def load_all_files(self) -> pd.DataFrame:
    """Return the combined intervention data, loading from Hugging Face.

    The first call fetches the dataset via ``load_from_huggingface()``;
    the result is memoized in ``self.data_cache`` so later calls are free.
    """
    cache_key = 'combined_data'
    if cache_key not in self.data_cache:
        # Cache miss: fetch the full dataset from the Hub (HF only,
        # no local fallback).
        self.data_cache[cache_key] = self.load_from_huggingface()
    return self.data_cache[cache_key]
 
 
 
 
 
 
 
 
31
 
32
  def load_from_huggingface(self) -> pd.DataFrame:
33
  """Load data from Hugging Face dataset."""
 
51
  except Exception as e:
52
  raise ValueError(f"Failed to load dataset from Hugging Face: {e}")
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
55
  """Preprocess the agricultural data."""
56
  # Convert date columns
 
150
 
151
  return f"Data uploaded to {self.dataset_id}"
152
 
153
def clear_cache(self):
    """Clear cached data to force reload from Hugging Face.

    Empties ``self.data_cache`` so the next ``load_all_files()`` call hits
    the Hub again instead of returning the memoized frame.
    """
    self.data_cache.clear()
    # Fix: the emoji in this message was mis-encoded (mojibake) in the
    # original; restored to proper UTF-8.
    print("📋 Cache cleared - will reload from Hugging Face on next access")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio_app.py CHANGED
@@ -15,13 +15,9 @@ from analysis_tools import AgriculturalAnalyzer
15
 
16
 
17
  # Initialize components
18
- # Try to use Hugging Face first, fallback to local files
19
- try:
20
- data_loader = AgriculturalDataLoader(use_hf=True)
21
- print("πŸ€— Configured to use Hugging Face dataset with local fallback")
22
- except:
23
- data_loader = AgriculturalDataLoader(use_hf=False)
24
- print("πŸ“ Configured to use local files only")
25
 
26
  analyzer = AgriculturalAnalyzer(data_loader)
27
 
 
15
 
16
 
17
  # Initialize components
18
+ # Use Hugging Face dataset exclusively
19
+ data_loader = AgriculturalDataLoader()
20
+ print("πŸ€— Configured to use Hugging Face dataset exclusively")
 
 
 
 
21
 
22
  analyzer = AgriculturalAnalyzer(data_loader)
23
 
test_hf_only.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to validate Hugging Face only loading.
4
+ """
5
+
6
+ import os
7
+ import warnings
8
+ warnings.filterwarnings('ignore')
9
+
10
def test_hf_only_loading():
    """Test that the loader only works with Hugging Face.

    Returns True when the configured dataset loads successfully, False
    otherwise (expected when the dataset is not yet on the Hub).
    """
    # Fix applied throughout: emoji in printed strings were mis-encoded
    # (mojibake); restored to proper UTF-8.
    print("🤗 TESTING HUGGING FACE ONLY LOADING")
    print("=" * 50)

    from data_loader import AgriculturalDataLoader

    # Check if HF token is available
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("⚠️ No HF_TOKEN found in environment variables")
        print("💡 Set HF_TOKEN to test Hugging Face loading")
        print("🔧 For this test, we'll try without token (may fail)")

    try:
        # Create loader (HF only)
        loader = AgriculturalDataLoader(
            dataset_id="HackathonCRA/2024",
            hf_token=hf_token
        )

        print(f"🤗 Attempting to load from dataset: {loader.dataset_id}")

        # Load data
        df = loader.load_all_files()

        print(f"✅ Success! Loaded {len(df):,} records from Hugging Face")
        print(f"📊 Years: {sorted(df['year'].unique())}")
        print(f"🌱 Crops: {df['crop_type'].nunique()}")
        # NOTE(review): this glyph was garbled in the original; 📍 assumed
        # from context — confirm against the pre-mangled file.
        print(f"📍 Plots: {df['plot_name'].nunique()}")
        print(f"💊 Herbicide applications: {df['is_herbicide'].sum()}")

        return True

    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script that
        # reports failure rather than crashing.
        print(f"❌ Failed to load from Hugging Face: {e}")
        print("💡 This is expected if the dataset doesn't exist yet")
        print("🔧 Make sure to upload your dataset to HF Hub first")
        return False
49
+
50
def test_no_local_fallback():
    """Test that there's no local fallback.

    Loads a deliberately nonexistent dataset and returns True only when
    the loader raises (proving no silent local-file fallback remains).
    """
    # Fix: emoji in printed strings were mis-encoded (mojibake); restored
    # to proper UTF-8.
    print("\n🚫 TESTING NO LOCAL FALLBACK")
    print("=" * 50)

    from data_loader import AgriculturalDataLoader

    try:
        # Create loader with non-existent dataset
        loader = AgriculturalDataLoader(
            dataset_id="nonexistent/dataset"
        )

        # This should fail without falling back to local
        df = loader.load_all_files()

        print(f"❌ Unexpected success - loaded {len(df)} records")
        print("⚠️ This suggests local fallback is still active")
        return False

    except Exception as e:
        # Failure is the expected, passing outcome for this test.
        print(f"✅ Expected failure: {e}")
        print("✅ Confirmed: No local fallback, HF only")
        return True
74
+
75
def test_simple_usage():
    """Test simple usage pattern.

    Prints the recommended usage snippet, then verifies that a loader can
    be constructed without touching the network. Returns True on success.
    """
    # Fix: emoji in printed strings were mis-encoded (mojibake); restored
    # to proper UTF-8.
    # NOTE(review): the leading glyph was garbled in the original; 📝
    # assumed from context — confirm against the pre-mangled file.
    print("\n📝 SIMPLE USAGE EXAMPLE")
    print("=" * 50)

    print("💡 Recommended usage pattern:")
    print()

    usage_code = '''
from data_loader import AgriculturalDataLoader

# Simple HF-only loader
loader = AgriculturalDataLoader(dataset_id="HackathonCRA/2024")

# Load data (will use HF_TOKEN from environment)
df = loader.load_all_files()

# Analyze data
print(f"Loaded {len(df)} records from Hugging Face")
'''

    print(usage_code)

    try:
        from data_loader import AgriculturalDataLoader
        loader = AgriculturalDataLoader(dataset_id="HackathonCRA/2024")
        print("✅ Loader created successfully")
        print(f"🎯 Target dataset: {loader.dataset_id}")
        print(f"🔑 Using token: {'Yes' if loader.hf_token else 'No (from env)'}")

        return True

    except Exception as e:
        print(f"❌ Failed to create loader: {e}")
        return False
109
+ return False
110
+
111
def main():
    """Run all tests and print a summary plus a deployment checklist."""
    # Fix: emoji in printed strings were mis-encoded (mojibake); restored
    # to proper UTF-8.
    print("🚜 HUGGING FACE ONLY - VALIDATION TESTS")
    print("=" * 60)
    print()

    results = []

    # Test 1: HF loading
    results.append(("HF Only Loading", test_hf_only_loading()))

    # Test 2: No local fallback
    results.append(("No Local Fallback", test_no_local_fallback()))

    # Test 3: Simple usage
    results.append(("Simple Usage", test_simple_usage()))

    # Summary
    print("\n📋 TEST SUMMARY")
    print("=" * 30)

    passed = 0
    for test_name, result in results:
        status = "✅ PASS" if result else "❌ FAIL"
        print(f"{test_name:<20} {status}")
        if result:
            passed += 1

    print(f"\n🎯 Results: {passed}/{len(results)} tests passed")

    if passed >= 2:  # Allow HF loading to fail if dataset doesn't exist
        print("🎉 Validation successful! Loader is HF-only.")
    else:
        print("⚠️ Validation issues detected.")

    print("\n🚀 DEPLOYMENT CHECKLIST:")
    print("✅ Remove local file dependencies")
    print("✅ HF-only data loading")
    print("✅ No fallback mechanisms")
    print("🔲 Upload dataset to HF Hub")
    print("🔲 Set HF_TOKEN in production")
    print("🔲 Test with real HF dataset")

if __name__ == "__main__":
    main()