SmolFactory / scripts / dataset_tonic / setup_hf_dataset.py
#!/usr/bin/env python3
"""
Setup script for Hugging Face Dataset repository for Trackio experiments
"""
import os
import json
from datetime import datetime
from datasets import Dataset
from huggingface_hub import HfApi


def setup_trackio_dataset():
"""Set up the Trackio experiments dataset on Hugging Face Hub"""
# Configuration - get from environment variables with fallbacks
dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
hf_token = os.environ.get('HF_TOKEN')
if not hf_token:
print("❌ HF_TOKEN not found. Please set the HF_TOKEN environment variable.")
print("You can get your token from: https://huggingface.co/settings/tokens")
return False
print(f"πŸš€ Setting up Trackio dataset: {dataset_repo}")
print(f"πŸ”§ Using dataset repository: {dataset_repo}")
# Initial experiment data
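    # Nested fields (metrics, parameters, artifacts, logs) are serialized with
    # json.dumps so every column is a flat string; consumers of the dataset are
    # expected to decode them with json.loads.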
initial_experiments = [
{
'experiment_id': 'exp_20250720_130853',
'name': 'petite-elle-l-aime-3',
'description': 'SmolLM3 fine-tuning experiment',
'created_at': '2025-07-20T11:20:01.780908',
'status': 'running',
'metrics': json.dumps([
{
'timestamp': '2025-07-20T11:20:01.780908',
'step': 25,
'metrics': {
'loss': 1.1659,
'grad_norm': 10.3125,
'learning_rate': 7e-08,
'num_tokens': 1642080.0,
'mean_token_accuracy': 0.75923578992486,
'epoch': 0.004851130919895701
}
},
{
'timestamp': '2025-07-20T11:26:39.042155',
'step': 50,
'metrics': {
'loss': 1.165,
'grad_norm': 10.75,
'learning_rate': 1.4291666666666667e-07,
'num_tokens': 3324682.0,
'mean_token_accuracy': 0.7577659255266189,
'epoch': 0.009702261839791402
}
},
{
'timestamp': '2025-07-20T11:33:16.203045',
'step': 75,
'metrics': {
'loss': 1.1639,
'grad_norm': 10.6875,
'learning_rate': 2.1583333333333334e-07,
'num_tokens': 4987941.0,
'mean_token_accuracy': 0.7581205774843692,
'epoch': 0.014553392759687101
}
},
{
'timestamp': '2025-07-20T11:39:53.453917',
'step': 100,
'metrics': {
'loss': 1.1528,
'grad_norm': 10.75,
'learning_rate': 2.8875e-07,
'num_tokens': 6630190.0,
'mean_token_accuracy': 0.7614579878747463,
'epoch': 0.019404523679582803
}
}
]),
'parameters': json.dumps({
'model_name': 'HuggingFaceTB/SmolLM3-3B',
'max_seq_length': 12288,
'use_flash_attention': True,
'use_gradient_checkpointing': False,
'batch_size': 8,
'gradient_accumulation_steps': 16,
'learning_rate': 3.5e-06,
'weight_decay': 0.01,
'warmup_steps': 1200,
'max_iters': 18000,
'eval_interval': 1000,
'log_interval': 25,
'save_interval': 2000,
'optimizer': 'adamw_torch',
'beta1': 0.9,
'beta2': 0.999,
'eps': 1e-08,
'scheduler': 'cosine',
'min_lr': 3.5e-07,
'fp16': False,
'bf16': True,
'ddp_backend': 'nccl',
'ddp_find_unused_parameters': False,
'save_steps': 2000,
'eval_steps': 1000,
'logging_steps': 25,
'save_total_limit': 5,
'eval_strategy': 'steps',
'metric_for_best_model': 'eval_loss',
'greater_is_better': False,
'load_best_model_at_end': True,
'data_dir': None,
'train_file': None,
'validation_file': None,
'test_file': None,
'use_chat_template': True,
'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
'enable_tracking': True,
'trackio_url': 'https://tonic-test-trackio-test.hf.space',
'trackio_token': None,
'log_artifacts': True,
'log_metrics': True,
'log_config': True,
'experiment_name': 'petite-elle-l-aime-3',
'dataset_name': 'legmlai/openhermes-fr',
'dataset_split': 'train',
'input_field': 'prompt',
'target_field': 'accepted_completion',
'filter_bad_entries': True,
'bad_entry_field': 'bad_entry',
'packing': False,
'max_prompt_length': 12288,
'max_completion_length': 8192,
'truncation': True,
'dataloader_num_workers': 10,
'dataloader_pin_memory': True,
'dataloader_prefetch_factor': 3,
'max_grad_norm': 1.0,
'group_by_length': True
}),
'artifacts': json.dumps([]),
'logs': json.dumps([]),
'last_updated': datetime.now().isoformat()
},
{
'experiment_id': 'exp_20250720_134319',
'name': 'petite-elle-l-aime-3-1',
'description': 'SmolLM3 fine-tuning experiment',
'created_at': '2025-07-20T11:54:31.993219',
'status': 'running',
'metrics': json.dumps([
{
'timestamp': '2025-07-20T11:54:31.993219',
'step': 25,
'metrics': {
'loss': 1.166,
'grad_norm': 10.375,
'learning_rate': 7e-08,
'num_tokens': 1642080.0,
'mean_token_accuracy': 0.7590958896279335,
'epoch': 0.004851130919895701
}
},
{
'timestamp': '2025-07-20T11:54:33.589487',
'step': 25,
'metrics': {
'gpu_0_memory_allocated': 17.202261447906494,
'gpu_0_memory_reserved': 75.474609375,
'gpu_0_utilization': 0,
'cpu_percent': 2.7,
'memory_percent': 10.1
}
}
]),
'parameters': json.dumps({
'model_name': 'HuggingFaceTB/SmolLM3-3B',
'max_seq_length': 12288,
'use_flash_attention': True,
'use_gradient_checkpointing': False,
'batch_size': 8,
'gradient_accumulation_steps': 16,
'learning_rate': 3.5e-06,
'weight_decay': 0.01,
'warmup_steps': 1200,
'max_iters': 18000,
'eval_interval': 1000,
'log_interval': 25,
'save_interval': 2000,
'optimizer': 'adamw_torch',
'beta1': 0.9,
'beta2': 0.999,
'eps': 1e-08,
'scheduler': 'cosine',
'min_lr': 3.5e-07,
'fp16': False,
'bf16': True,
'ddp_backend': 'nccl',
'ddp_find_unused_parameters': False,
'save_steps': 2000,
'eval_steps': 1000,
'logging_steps': 25,
'save_total_limit': 5,
'eval_strategy': 'steps',
'metric_for_best_model': 'eval_loss',
'greater_is_better': False,
'load_best_model_at_end': True,
'data_dir': None,
'train_file': None,
'validation_file': None,
'test_file': None,
'use_chat_template': True,
'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
'enable_tracking': True,
'trackio_url': 'https://tonic-test-trackio-test.hf.space',
'trackio_token': None,
'log_artifacts': True,
'log_metrics': True,
'log_config': True,
'experiment_name': 'petite-elle-l-aime-3-1',
'dataset_name': 'legmlai/openhermes-fr',
'dataset_split': 'train',
'input_field': 'prompt',
'target_field': 'accepted_completion',
'filter_bad_entries': True,
'bad_entry_field': 'bad_entry',
'packing': False,
'max_prompt_length': 12288,
'max_completion_length': 8192,
'truncation': True,
'dataloader_num_workers': 10,
'dataloader_pin_memory': True,
'dataloader_prefetch_factor': 3,
'max_grad_norm': 1.0,
'group_by_length': True
}),
'artifacts': json.dumps([]),
'logs': json.dumps([]),
'last_updated': datetime.now().isoformat()
}
]
try:
# Create dataset
dataset = Dataset.from_list(initial_experiments)
# Push to HF Hub
api = HfApi(token=hf_token)
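        # Note: Dataset.push_to_hub below creates the repository and uploads the
        # data itself using the token; the HfApi client is not used further here.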
dataset.push_to_hub(
dataset_repo,
token=hf_token,
private=True # Make it private for security
)
print(f"βœ… Successfully created dataset: {dataset_repo}")
print(f"πŸ“Š Added {len(initial_experiments)} experiments")
print("πŸ”’ Dataset is private (only accessible with your token)")
print("\n🎯 Next steps:")
print("1. Set HF_TOKEN in your Hugging Face Space environment")
print("2. Deploy the updated app.py to your Space")
print("3. The app will now load experiments from the dataset")
return True
except Exception as e:
print(f"❌ Failed to create dataset: {e}")
return False


if __name__ == "__main__":
    # Exit non-zero when setup fails so shell callers and CI can detect the failure.
    raise SystemExit(0 if setup_trackio_dataset() else 1)
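# Example (sketch, commented out): reading the experiments back from the Hub.
# This assumes the push above succeeded and that HF_TOKEN / TRACKIO_DATASET_REPO
# are still set; the nested columns are JSON strings and must be decoded.
#
#     from datasets import load_dataset
#     repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
#     ds = load_dataset(repo, split='train', token=os.environ.get('HF_TOKEN'))
#     metrics = json.loads(ds[0]['metrics'])
#     print(metrics[0]['metrics']['loss'])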