Spaces:
Running
Running
fixes linter errors in setup hf dataset
Browse files
scripts/dataset_tonic/setup_hf_dataset.py
CHANGED
|
@@ -32,7 +32,7 @@ def get_username_from_token(token: str) -> Optional[str]:
|
|
| 32 |
user_info = api.whoami()
|
| 33 |
username = user_info.get("name", user_info.get("username"))
|
| 34 |
|
| 35 |
-
|
| 36 |
except Exception as e:
|
| 37 |
print(f"❌ Error getting username from token: {e}")
|
| 38 |
return None
|
|
@@ -162,20 +162,20 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
| 162 |
if not token:
|
| 163 |
print("⚠️ No token available for uploading data")
|
| 164 |
return False
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
| 170 |
'name': 'smollm3-finetune-demo',
|
| 171 |
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
| 172 |
'created_at': datetime.now().isoformat(),
|
| 173 |
'status': 'completed',
|
| 174 |
-
|
| 175 |
-
|
| 176 |
'timestamp': datetime.now().isoformat(),
|
| 177 |
-
|
| 178 |
-
|
| 179 |
'loss': 1.15,
|
| 180 |
'grad_norm': 10.5,
|
| 181 |
'learning_rate': 5e-6,
|
|
@@ -191,13 +191,13 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
| 191 |
'gpu_memory_allocated': 15.2,
|
| 192 |
'gpu_memory_reserved': 70.1,
|
| 193 |
'gpu_utilization': 85.2,
|
| 194 |
-
|
| 195 |
-
|
|
|
|
| 196 |
}
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
| 201 |
'max_seq_length': 4096,
|
| 202 |
'batch_size': 2,
|
| 203 |
'learning_rate': 5e-6,
|
|
@@ -208,8 +208,8 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
| 208 |
'mixed_precision': True,
|
| 209 |
'gradient_checkpointing': True,
|
| 210 |
'flash_attention': True
|
| 211 |
-
|
| 212 |
-
|
| 213 |
'logs': json.dumps([
|
| 214 |
{
|
| 215 |
'timestamp': datetime.now().isoformat(),
|
|
@@ -227,10 +227,10 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
| 227 |
'message': 'Dataset loaded and preprocessed'
|
| 228 |
}
|
| 229 |
]),
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
# Create dataset and upload
|
| 235 |
from datasets import Dataset
|
| 236 |
|
|
@@ -347,21 +347,19 @@ This dataset is public by default for easier sharing and collaboration. Only non
|
|
| 347 |
```json
|
| 348 |
{{
|
| 349 |
"experiment_id": "exp_20250720_130853",
|
| 350 |
-
"name": "
|
| 351 |
-
"description": "SmolLM3 fine-tuning experiment
|
| 352 |
-
"created_at": "2025-07-
|
| 353 |
-
"status": "
|
| 354 |
-
"metrics": "
|
| 355 |
-
"parameters": "{{
|
| 356 |
"artifacts": "[]",
|
| 357 |
-
"logs": "
|
| 358 |
-
"last_updated": "2025-07-
|
| 359 |
}}
|
| 360 |
```
|
| 361 |
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
|
| 365 |
"""
|
| 366 |
|
| 367 |
# Upload README to the dataset repository
|
|
|
|
| 32 |
user_info = api.whoami()
|
| 33 |
username = user_info.get("name", user_info.get("username"))
|
| 34 |
|
| 35 |
+
return username
|
| 36 |
except Exception as e:
|
| 37 |
print(f"❌ Error getting username from token: {e}")
|
| 38 |
return None
|
|
|
|
| 162 |
if not token:
|
| 163 |
print("⚠️ No token available for uploading data")
|
| 164 |
return False
|
| 165 |
+
|
| 166 |
+
# Initial experiment data
|
| 167 |
+
initial_experiments = [
|
| 168 |
+
{
|
| 169 |
'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
| 170 |
'name': 'smollm3-finetune-demo',
|
| 171 |
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
| 172 |
'created_at': datetime.now().isoformat(),
|
| 173 |
'status': 'completed',
|
| 174 |
+
'metrics': json.dumps([
|
| 175 |
+
{
|
| 176 |
'timestamp': datetime.now().isoformat(),
|
| 177 |
+
'step': 100,
|
| 178 |
+
'metrics': {
|
| 179 |
'loss': 1.15,
|
| 180 |
'grad_norm': 10.5,
|
| 181 |
'learning_rate': 5e-6,
|
|
|
|
| 191 |
'gpu_memory_allocated': 15.2,
|
| 192 |
'gpu_memory_reserved': 70.1,
|
| 193 |
'gpu_utilization': 85.2,
|
| 194 |
+
'cpu_percent': 2.7,
|
| 195 |
+
'memory_percent': 10.1
|
| 196 |
+
}
|
| 197 |
}
|
| 198 |
+
]),
|
| 199 |
+
'parameters': json.dumps({
|
| 200 |
+
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
|
|
|
| 201 |
'max_seq_length': 4096,
|
| 202 |
'batch_size': 2,
|
| 203 |
'learning_rate': 5e-6,
|
|
|
|
| 208 |
'mixed_precision': True,
|
| 209 |
'gradient_checkpointing': True,
|
| 210 |
'flash_attention': True
|
| 211 |
+
}),
|
| 212 |
+
'artifacts': json.dumps([]),
|
| 213 |
'logs': json.dumps([
|
| 214 |
{
|
| 215 |
'timestamp': datetime.now().isoformat(),
|
|
|
|
| 227 |
'message': 'Dataset loaded and preprocessed'
|
| 228 |
}
|
| 229 |
]),
|
| 230 |
+
'last_updated': datetime.now().isoformat()
|
| 231 |
+
}
|
| 232 |
+
]
|
| 233 |
+
|
| 234 |
# Create dataset and upload
|
| 235 |
from datasets import Dataset
|
| 236 |
|
|
|
|
| 347 |
```json
|
| 348 |
{{
|
| 349 |
"experiment_id": "exp_20250720_130853",
|
| 350 |
+
"name": "smollm3-finetune-demo",
|
| 351 |
+
"description": "SmolLM3 fine-tuning experiment demo",
|
| 352 |
+
"created_at": "2025-07-20T13:08:53",
|
| 353 |
+
"status": "completed",
|
| 354 |
+
"metrics": "{{...}}",
|
| 355 |
+
"parameters": "{{...}}",
|
| 356 |
"artifacts": "[]",
|
| 357 |
+
"logs": "{{...}}",
|
| 358 |
+
"last_updated": "2025-07-20T13:08:53"
|
| 359 |
}}
|
| 360 |
```
|
| 361 |
|
| 362 |
+
This dataset is maintained by the Trackio monitoring system and automatically updated during training runs.
|
|
|
|
|
|
|
| 363 |
"""
|
| 364 |
|
| 365 |
# Upload README to the dataset repository
|