Adds harmony format support, configurable GPT-OSS parameters, launch.sh logic, improved templates for legml GPT-OSS training, a dynamic results directory, and improved model pushing
- config/train_gpt_oss_custom.py +388 -0
- config/train_gpt_oss_openhermes_fr.py +174 -0
- config/train_gpt_oss_openhermes_fr_memory_optimized.py +233 -0
- docs/output.svg +1 -0
- launch.sh +328 -11
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +79 -5
- scripts/model_tonic/push_to_huggingface.py +83 -5
- scripts/training/train_gpt_oss.py +313 -24
- templates/spaces/demo_gpt/README.md +1 -1
config/train_gpt_oss_custom.py
ADDED
@@ -0,0 +1,388 @@
"""
GPT-OSS Custom Training Configuration
Based on OpenAI's GPT-OSS fine-tuning tutorial
Fully customizable configuration for any dataset format

Supports specialized datasets like:
- legmlai/openhermes-fr (French instruction dataset)
- HuggingFaceH4/Multilingual-Thinking
- Custom prompt/completion formats
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, List, Union

@dataclass
class GPTOSSEnhancedCustomConfig:
    """Enhanced custom configuration for GPT-OSS fine-tuning with maximum flexibility"""

    # ============================================================================
    # CORE MODEL CONFIGURATION
    # ============================================================================
    trainer_type: str = "sft"  # "sft" or "dpo"
    model_name: str = "openai/gpt-oss-20b"
    max_seq_length: int = 2048  # Customizable: 512, 1024, 2048, 4096, 8192
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # ============================================================================
    # TRAINING HYPERPARAMETERS - FULLY CUSTOMIZABLE
    # ============================================================================
    # Batch Configuration
    batch_size: int = 4  # Per-device batch size (1-32 depending on GPU memory)
    gradient_accumulation_steps: int = 4  # Effective batch = batch_size * accumulation * num_gpus
    eval_batch_size: Optional[int] = None  # If None, uses batch_size

    # Learning Rate Configuration
    learning_rate: float = 2e-4  # Main learning rate (1e-5 to 5e-4 typical range)
    min_lr: float = 2e-5  # Minimum learning rate for scheduler
    warmup_ratio: float = 0.03  # Fraction of steps for warmup (0.01-0.1)
    warmup_steps: Optional[int] = None  # If set, overrides warmup_ratio

    # Training Duration
    num_train_epochs: float = 1.0  # Number of epochs (0.5, 1.0, 2.0, 3.0)
    max_steps: Optional[int] = None  # If set, overrides num_train_epochs
    max_iters: Optional[int] = None  # Legacy compatibility

    # Regularization
    weight_decay: float = 0.01  # L2 regularization (0.0-0.1)
    max_grad_norm: float = 1.0  # Gradient clipping (0.5-2.0)

    # ============================================================================
    # OPTIMIZER CONFIGURATION
    # ============================================================================
    optimizer: str = "adamw_torch"  # "adamw_torch", "adamw_hf", "sgd"
    beta1: float = 0.9  # Adam beta1 parameter
    beta2: float = 0.95  # Adam beta2 parameter (0.95-0.999)
    eps: float = 1e-8  # Adam epsilon

    # ============================================================================
    # SCHEDULER CONFIGURATION
    # ============================================================================
    scheduler: str = "cosine_with_min_lr"  # "linear", "cosine", "cosine_with_min_lr", "constant"
    lr_scheduler_kwargs: Optional[Dict] = None

    # ============================================================================
    # MIXED PRECISION & DISTRIBUTED TRAINING
    # ============================================================================
    fp16: bool = False  # Use FP16 (not recommended for GPT-OSS)
    bf16: bool = True  # Use BF16 (recommended for GPT-OSS)
    tf32: Optional[bool] = None  # Use TF32 on A100/H100
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # ============================================================================
    # LOGGING, EVALUATION & CHECKPOINTING
    # ============================================================================
    # Logging
    logging_steps: int = 10  # Log every N steps
    log_level: str = "info"  # "debug", "info", "warning", "error"

    # Evaluation
    eval_strategy: str = "steps"  # "no", "steps", "epoch"
    eval_steps: int = 100  # Evaluate every N steps
    eval_delay: float = 0  # Delay evaluation for N steps/epochs
    eval_accumulation_steps: Optional[int] = None  # Accumulate eval outputs

    # Checkpointing
    save_strategy: str = "steps"  # "no", "steps", "epoch"
    save_steps: int = 500  # Save checkpoint every N steps
    save_total_limit: Optional[int] = 3  # Keep only N best checkpoints
    save_only_model: bool = False  # Save only model weights

    # Model Selection
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # ============================================================================
    # DATASET CONFIGURATION - ENHANCED FOR CUSTOM FORMATS
    # ============================================================================
    # Dataset Source
    dataset_name: str = "legmlai/openhermes-fr"  # Default to French OpenHermes
    dataset_split: str = "train"  # Dataset split to use
    dataset_config: Optional[str] = None  # Dataset configuration name

    # Field Mapping - Customize for your dataset format
    input_field: str = "prompt"  # Field containing the input/prompt
    target_field: str = "accepted_completion"  # Field containing the target/completion

    # OpenHermes-FR specific fields
    filter_bad_entries: bool = True  # Filter entries marked as bad
    bad_entry_field: str = "bad_entry"  # Field indicating bad entries
    bad_prompt_field: str = "bad_prompt_detected"  # Field for bad prompts
    bad_response_field: str = "bad_response_detected"  # Field for bad responses

    # Data Processing Options
    concatenate_fields: bool = True  # Combine input and target fields for training
    field_separator: str = "\n\n### Response:\n"  # Separator between input and target
    add_eos_token: bool = True  # Add EOS token at the end

    # Dataset Filtering & Sampling
    max_samples: Optional[int] = None  # Limit dataset size (e.g., 100000 for testing)
    min_length: int = 10  # Minimum sequence length
    max_length: Optional[int] = None  # Maximum sequence length (None = use max_seq_length)

    # Custom Dataset Formats Support
    dataset_format: str = "openhermes_fr"  # "openhermes_fr", "messages", "text", "custom"

    # GPT-OSS Harmony Format Configuration
    use_harmony_format: bool = True  # Enable GPT-OSS harmony format
    use_chat_template: bool = False  # Set to True for messages format
    chat_template_kwargs: Optional[Dict] = None

    # ============================================================================
    # TRACKIO MONITORING CONFIGURATION
    # ============================================================================
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # ============================================================================
    # HUGGING FACE INTEGRATION
    # ============================================================================
    hf_token: Optional[str] = None
    dataset_repo: Optional[str] = None
    push_to_hub: bool = False  # Push model to HF Hub after training
    hub_model_id: Optional[str] = None  # HF Hub model ID
    hub_private_repo: bool = False  # Make HF repo private

    # ============================================================================
    # GPT-OSS SPECIFIC CONFIGURATIONS
    # ============================================================================
    # LoRA Configuration
    use_lora: bool = True
    lora_config: Optional[Dict] = None

    # Quantization Configuration
    use_quantization: bool = True
    quantization_config: Optional[Dict] = None

    # Model Loading Configuration
    model_kwargs: Optional[Dict] = None

    # Generation Configuration (for evaluation/testing)
    generation_config: Optional[Dict] = None

    # ============================================================================
    # MULTILINGUAL & DOMAIN SPECIFIC SETTINGS
    # ============================================================================
    # Language Support (for multilingual datasets)
    primary_language: str = "fr"  # Primary language code
    reasoning_languages: Optional[List[str]] = None  # Supported languages for reasoning

    # Domain-specific settings
    domain_focus: Optional[str] = None  # "reasoning", "conversation", "instruction", "general"

    # ============================================================================
    # PERFORMANCE & MEMORY OPTIMIZATION
    # ============================================================================
    # Data Loading
    dataloader_num_workers: int = 4  # Number of data loading workers
    dataloader_pin_memory: bool = True  # Pin memory for faster GPU transfer
    dataloader_prefetch_factor: int = 2  # Prefetch factor for data loading

    # Memory Management
    max_memory_per_gpu: Optional[str] = None  # e.g., "80GB", "40GB"
    low_cpu_mem_usage: bool = True  # Use low CPU memory loading

    # Performance Optimizations
    group_by_length: bool = True  # Group sequences by length
    length_column_name: str = "length"  # Column name for sequence lengths
    remove_unused_columns: bool = True  # Remove unused dataset columns

    def __post_init__(self):
        """Initialize default values and validate configuration"""

        # LoRA configuration defaults
        if self.lora_config is None:
            self.lora_config = {
                "r": 16,  # Rank (4, 8, 16, 32, 64) - higher = more parameters
                "lora_alpha": 32,  # Scaling factor (usually 2*r)
                "target_modules": "all-linear",  # Apply LoRA to all linear layers
                "target_parameters": [
                    "7.mlp.experts.gate_up_proj",
                    "7.mlp.experts.down_proj",
                    "15.mlp.experts.gate_up_proj",
                    "15.mlp.experts.down_proj",
                    "23.mlp.experts.gate_up_proj",
                    "23.mlp.experts.down_proj",
                ],
                "bias": "none",  # "none", "all", "lora_only"
                "task_type": "CAUSAL_LM",
                "lora_dropout": 0.05,  # LoRA dropout rate
            }

        # Quantization configuration defaults
        if self.quantization_config is None:
            self.quantization_config = {
                "dequantize": True,  # Use Mxfp4Config as per GPT-OSS tutorial
                "load_in_4bit": False,  # Set to True for extreme memory optimization
                "bnb_4bit_compute_dtype": "bfloat16",  # For 4-bit quantization
                "bnb_4bit_use_double_quant": True,  # Double quantization
                "bnb_4bit_quant_type": "nf4",  # Quantization type
            }

        # Model loading configuration defaults
        if self.model_kwargs is None:
            self.model_kwargs = {
                "attn_implementation": "eager",  # "eager", "flash_attention_2"
                "torch_dtype": "auto",  # "auto", "bfloat16", "float16"
                "use_cache": False,  # Disable KV cache for training
                "device_map": "auto",  # Automatic device mapping
                "low_cpu_mem_usage": self.low_cpu_mem_usage,
            }

        # Add memory constraints if specified
        if self.max_memory_per_gpu:
            self.model_kwargs["max_memory"] = {0: self.max_memory_per_gpu}

        # Generation configuration defaults
        if self.generation_config is None:
            self.generation_config = {
                "max_new_tokens": 512,  # Maximum tokens to generate
                "do_sample": True,  # Use sampling
                "temperature": 0.7,  # Sampling temperature
                "top_p": 0.9,  # Nucleus sampling
                "top_k": 50,  # Top-k sampling
                "repetition_penalty": 1.1,  # Repetition penalty
                "pad_token_id": None,  # Will be set from tokenizer
                "eos_token_id": None,  # Will be set from tokenizer
            }

        # Language configuration defaults
        if self.reasoning_languages is None:
            if self.primary_language == "fr":
                self.reasoning_languages = [
                    "French", "English", "Spanish", "Italian", "German"
                ]
            else:
                self.reasoning_languages = [
                    "English", "Spanish", "French", "Italian", "German",
                    "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
                ]

        # Scheduler configuration defaults
        if self.lr_scheduler_kwargs is None:
            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}

        # Chat template configuration defaults (GPT-OSS harmony format)
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "add_generation_prompt": True,
                "tokenize": False,
                "auto_insert_role": True,
                # GPT-OSS harmony format specific settings
                "reasoning_effort": "medium",  # low, medium, high
                "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
                "builtin_tools": [],  # Can include "browser" and/or "python"
            }

        # Validation and computed values
        # Compute effective batch size
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps

        # Set warmup steps if not provided
        if self.warmup_steps is None and self.max_steps:
            self.warmup_steps = int(self.max_steps * self.warmup_ratio)

        # Set max_length for dataset filtering
        if self.max_length is None:
            self.max_length = self.max_seq_length

        # Validate configuration
        self._validate_config()

        # Print comprehensive configuration summary
        self._print_config_summary(effective_batch_size)

    def _validate_config(self):
        """Validate configuration parameters"""

        # Validate batch configuration
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        # Validate learning rate
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be > 0")
        if self.min_lr >= self.learning_rate:
            raise ValueError("min_lr must be < learning_rate")

        # Validate sequence length
        if self.max_seq_length < 1:
            raise ValueError("max_seq_length must be >= 1")

        # Validate dataset format
        valid_formats = ["openhermes_fr", "messages", "text", "custom"]
        if self.dataset_format not in valid_formats:
            raise ValueError(f"dataset_format must be one of {valid_formats}")

    def _print_config_summary(self, effective_batch_size):
        """Print detailed configuration summary"""

        print("\n" + "="*80)
        print("🚀 GPT-OSS ENHANCED CUSTOM CONFIGURATION")
        print("="*80)

        print("📊 Model & Training:")
        print(f"   • Model: {self.model_name}")
        print(f"   • Dataset: {self.dataset_name} ({self.dataset_format})")
        print(f"   • Primary Language: {self.primary_language}")
        print(f"   • Sequence Length: {self.max_seq_length}")
        print(f"   • Epochs: {self.num_train_epochs}")

        print("\n🔄 Batch Configuration:")
        print(f"   • Per-device Batch Size: {self.batch_size}")
        print(f"   • Gradient Accumulation: {self.gradient_accumulation_steps}")
        print(f"   • Effective Batch Size: {effective_batch_size}")

        print("\n📈 Learning Configuration:")
        print(f"   • Learning Rate: {self.learning_rate}")
        print(f"   • Min Learning Rate: {self.min_lr}")
        print(f"   • Weight Decay: {self.weight_decay}")
        print(f"   • Warmup Ratio: {self.warmup_ratio}")

        print("\n🎛️ LoRA Configuration:")
        print(f"   • Rank: {self.lora_config['r']}")
        print(f"   • Alpha: {self.lora_config['lora_alpha']}")
        print(f"   • Target Modules: {self.lora_config['target_modules']}")

        print("\n📁 Dataset Configuration:")
        print(f"   • Input Field: {self.input_field}")
        print(f"   • Target Field: {self.target_field}")
        print(f"   • Filter Bad Entries: {self.filter_bad_entries}")
        print(f"   • Max Samples: {self.max_samples or 'All'}")

        print("\n💾 Memory & Performance:")
        print(f"   • Mixed Precision: {'BF16' if self.bf16 else 'FP32'}")
        print(f"   • Gradient Checkpointing: {self.use_gradient_checkpointing}")
        print(f"   • Data Workers: {self.dataloader_num_workers}")
        print(f"   • Group by Length: {self.group_by_length}")

        print("="*80 + "\n")

# Create the config instance with OpenHermes-FR optimized defaults
config = GPTOSSEnhancedCustomConfig()
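A quick usage sketch for the class above: every field has a default and __post_init__ derives the rest, so a caller overrides only what it needs. The override values here are illustrative, not tested recommendations.

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# Override only what differs from the defaults; __post_init__ fills in the
# LoRA/quantization/generation dicts, validates, and prints the summary.
my_config = GPTOSSEnhancedCustomConfig(
    dataset_name="HuggingFaceH4/Multilingual-Thinking",
    dataset_format="messages",
    use_chat_template=True,          # messages-format data goes through the chat template
    batch_size=2,
    gradient_accumulation_steps=8,   # effective batch = 2 * 8 = 16 per GPU
    max_seq_length=4096,
)
assert my_config.max_length == 4096      # auto-set from max_seq_length
assert my_config.lora_config["r"] == 16  # LoRA defaults still apply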
config/train_gpt_oss_openhermes_fr.py
ADDED
@@ -0,0 +1,174 @@
"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Quality filtering using OpenHermes-FR metadata
    filter_bad_entries=True,  # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Data processing optimized for French with GPT-OSS harmony format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling (use all 800K examples by default)
    max_samples=None,  # Use full dataset
    min_length=20,  # Minimum for meaningful French text
    max_length=None,  # Auto-set to max_seq_length

    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,  # 1.5 epochs optimal for large dataset
    batch_size=6,  # Balanced for most GPUs
    gradient_accumulation_steps=6,  # Effective batch size: 36

    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual
    min_lr=2.5e-5,  # 10% of max learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping

    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better for GPT-OSS

    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },

    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },

    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for large dataset
    dataloader_num_workers=6,  # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency

    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching
    remove_unused_columns=True,

    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=20,  # Log every 20 steps

    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Keep 3 best checkpoints

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following

    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS harmony format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by launch script
    hub_private_repo=False,

    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)

# Print configuration summary on import
print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"📊 Dataset: {config.dataset_name}")
print(f"🗣️ Language: French (with {config.dataset_format} format)")
print(f"📈 Training: {config.num_train_epochs} epochs")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"📏 Sequence Length: {config.max_seq_length}")
print(f"🔍 Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)
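The config above only toggles use_harmony_format; the rendering itself happens in scripts/training/train_gpt_oss.py (not shown in this commit view). As a rough sketch of what harmony-format rendering of one prompt/completion pair looks like — the special tokens follow OpenAI's published harmony spec, but render_harmony_example and its exact layout are illustrative, not this repository's implementation:

# Illustrative only: tokens per OpenAI's harmony spec; helper name is ours.
def render_harmony_example(prompt: str, completion: str,
                           model_identity: str = "You are GPT-Tonic, a large language model trained by TonicAI.",
                           reasoning_effort: str = "medium") -> str:
    system = (f"<|start|>system<|message|>{model_identity}\n"
              f"Reasoning: {reasoning_effort}<|end|>")
    user = f"<|start|>user<|message|>{prompt}<|end|>"
    # Training targets close with <|return|> rather than <|end|>
    assistant = f"<|start|>assistant<|channel|>final<|message|>{completion}<|return|>"
    return system + user + assistant

print(render_harmony_example("Explique la photosynthèse.",
                             "La photosynthèse est le processus par lequel..."))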
config/train_gpt_oss_openhermes_fr_memory_optimized.py
ADDED
@@ -0,0 +1,233 @@
"""
GPT-OSS OpenHermes-FR Memory-Optimized Configuration
Combines memory optimization best practices with the OpenHermes-FR dataset
Optimized for GPT-OSS harmony format and MXFP4 quantization
Based on OpenAI GPT-OSS specifications and memory optimization principles
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# Memory-optimized OpenHermes-FR configuration for GPT-OSS
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR with Harmony Format
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping optimized for harmony format
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Enhanced quality filtering for memory-constrained training
    filter_bad_entries=True,  # Critical for memory efficiency
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Memory-optimized data processing with GPT-OSS harmony format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,  # Required for proper training
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling optimized for memory constraints
    max_samples=200000,  # Reduced from 800K for memory efficiency
    min_length=15,  # Slightly higher minimum for quality
    max_length=2048,  # Explicit max length for memory control

    # ============================================================================
    # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
    # ============================================================================
    # Batch configuration following memory optimization principles
    num_train_epochs=1.0,  # Single epoch to reduce memory pressure
    batch_size=2,  # Reduced from 6 for memory efficiency
    gradient_accumulation_steps=16,  # Increased to maintain effective batch size 32

    # Learning rate optimized for single epoch + memory constraints
    learning_rate=2e-4,  # Standard GPT-OSS learning rate
    min_lr=2e-5,  # 10% of max learning rate
    warmup_ratio=0.03,  # Reduced warmup for memory efficiency
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping for stability

    # ============================================================================
    # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=1024,  # Reduced from 3072 for memory optimization
    use_flash_attention=True,  # Critical for memory efficiency
    use_gradient_checkpointing=True,  # Essential for memory optimization

    # Mixed precision optimized for GPT-OSS MXFP4
    fp16=False,  # Not recommended for GPT-OSS
    bf16=True,  # Required for GPT-OSS stability
    tf32=True,  # Enable TF32 for A100/H100 efficiency

    # ============================================================================
    # LORA CONFIGURATION - Memory Optimized for GPT-OSS MoE
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 8,  # Reduced rank for memory efficiency
        "lora_alpha": 16,  # 2x rank scaling (memory optimized)
        "lora_dropout": 0.1,  # Higher dropout for better generalization
        "target_modules": "all-linear",  # Apply to all linear layers
        "target_parameters": [
            # GPT-OSS specific MoE expert targeting
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",  # No bias adaptation for memory efficiency
        "task_type": "CAUSAL_LM",
        "modules_to_save": [],  # Don't save additional modules for memory
    },

    # ============================================================================
    # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # Use native MXFP4 as per GPT-OSS specs
        "load_in_4bit": False,  # Don't use BNB 4-bit with MXFP4
        "mxfp4_config": {  # Native GPT-OSS MXFP4 settings
            "enabled": True,
            "block_size": 32,  # Optimized block size for MoE
        }
    },

    # ============================================================================
    # MEMORY OPTIMIZATION CONFIGURATION
    # ============================================================================
    # Model loading with memory constraints
    model_kwargs={
        "attn_implementation": "eager",  # Memory-safe attention
        "torch_dtype": "auto",  # Let model decide (MXFP4 compatible)
        "use_cache": False,  # Disable KV cache for training
        "device_map": "auto",  # Automatic device mapping
        "low_cpu_mem_usage": True,  # Critical for memory optimization
        "max_memory": {0: "75GB"},  # Reserve memory for other processes
    },

    # Data loading optimized for memory efficiency
    dataloader_num_workers=2,  # Reduced workers to save memory
    dataloader_pin_memory=False,  # Disable to save memory
    dataloader_prefetch_factor=1,  # Minimal prefetch for memory

    # Memory management optimizations
    max_memory_per_gpu="75GB",  # Explicit memory limit
    low_cpu_mem_usage=True,  # Essential for large models
    group_by_length=True,  # Efficient batching for memory
    remove_unused_columns=True,  # Remove unnecessary data

    # ============================================================================
    # EVALUATION & LOGGING - Memory Efficient
    # ============================================================================
    eval_strategy="steps",
    eval_steps=500,  # Less frequent evaluation for memory
    logging_steps=50,  # Reduced logging frequency

    save_strategy="steps",
    save_steps=1000,  # Less frequent saves for memory/storage
    save_total_limit=2,  # Keep only 2 checkpoints for memory
    save_only_model=True,  # Save only model weights

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Evaluation memory optimization
    eval_accumulation_steps=4,  # Accumulate eval outputs to save memory
    eval_batch_size=1,  # Smaller eval batch size

    # ============================================================================
    # GPT-OSS HARMONY FORMAT OPTIMIZATION
    # ============================================================================
    # Chat template for harmony format compatibility (following exact template)
    use_chat_template=False,  # Use custom harmony format instead
    chat_template_kwargs={
        "add_generation_prompt": True,
        "tokenize": False,
        # GPT-OSS harmony format specific settings (exact template format)
        "reasoning_effort": "medium",  # low, medium, high
        "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
        "builtin_tools": [],  # Can include "browser" and/or "python"
    },

    # Generation config optimized for GPT-OSS harmony format (exact template compliance)
    generation_config={
        "max_new_tokens": 256,  # Reduced for memory efficiency
        "do_sample": True,
        "temperature": 0.6,  # Slightly lower for more focused training
        "top_p": 0.9,
        "top_k": 40,  # Reduced for memory efficiency
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS harmony format specific settings (exact template format)
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # MULTILINGUAL & REASONING OPTIMIZATION
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning capability
    domain_focus="reasoning",  # Align with GPT-OSS reasoning focus

    # ============================================================================
    # OPTIMIZER & SCHEDULER - Memory Optimized
    # ============================================================================
    optimizer="adamw_torch",  # Memory-efficient optimizer
    beta1=0.9,
    beta2=0.95,  # GPT-OSS optimized beta2
    eps=1e-8,

    scheduler="cosine_with_min_lr",  # Stable scheduler for single epoch
    lr_scheduler_kwargs={
        "min_lr_rate": 0.1,
        "warmup_steps": None,  # Use warmup_ratio instead
    },

    # ============================================================================
    # MONITORING & HUB INTEGRATION
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=False,  # Disable to save memory/storage
    log_metrics=True,
    log_config=True,

    push_to_hub=False,  # Set to True after successful training
    hub_model_id=None,
    hub_private_repo=False,
)

# Configuration validation and optimization tips
print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
print("=" * 60)
print(f"📊 Dataset: {config.dataset_name} (200K samples)")
print("🗣️ Language: French with GPT-OSS Harmony Format")
print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']} (memory optimized)")
print(f"📏 Sequence Length: {config.max_seq_length} (memory optimized)")
print(f"💾 Memory Limit: {config.max_memory_per_gpu}")
print("⚡ Quantization: MXFP4 (GPT-OSS native)")
print("🔍 Quality Filtering: Enabled")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 60)
print("\n💡 Memory Optimization Features:")
print("   • Native MXFP4 quantization for GPT-OSS MoE layers")
print("   • Reduced batch size with increased gradient accumulation")
print("   • Limited sequence length for memory efficiency")
print("   • Reduced LoRA rank while maintaining effectiveness")
print("   • Dataset sampling (200K from 800K) for faster training")
print("   • Gradient checkpointing and efficient data loading")
print("   • Exact GPT-OSS Harmony format with <|return|> tokens")
print("=" * 60)
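The filter_* fields above map directly onto boolean columns of legmlai/openhermes-fr, so the quality filter reduces to a datasets filter call. A minimal sketch of the intended effect (the helper name is ours; the real logic lives in scripts/training/train_gpt_oss.py):

from datasets import load_dataset

def load_filtered_openhermes_fr(max_samples=None):
    """Drop rows flagged by the dataset's built-in quality columns."""
    ds = load_dataset("legmlai/openhermes-fr", split="train")
    ds = ds.filter(
        lambda ex: not (
            ex["bad_entry"]
            or ex["bad_prompt_detected"]
            or ex["bad_response_detected"]
        )
    )
    if max_samples is not None:  # e.g. 200000 in the memory-optimized config
        ds = ds.select(range(min(max_samples, len(ds))))
    return ds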
docs/output.svg
ADDED
launch.sh
CHANGED
|
@@ -234,7 +234,34 @@ show_training_configs() {
|
|
| 234 |
echo " - 4-bit quantization + reduced LoRA"
|
| 235 |
echo " - Optimized for limited GPU memory"
|
| 236 |
echo ""
|
| 237 |
-
echo "9.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
echo " - User-defined parameters"
|
| 239 |
echo ""
|
| 240 |
}
|
|
@@ -325,12 +352,142 @@ get_training_config() {
|
|
| 325 |
MAX_SEQ_LENGTH=1024
|
| 326 |
CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
|
| 327 |
;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
"Custom Configuration")
|
| 329 |
get_custom_config
|
| 330 |
;;
|
| 331 |
esac
|
| 332 |
}
|
| 333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# Function to get custom configuration
|
| 335 |
get_custom_config() {
|
| 336 |
print_step "Custom Configuration Setup"
|
|
@@ -352,6 +509,136 @@ get_custom_config() {
|
|
| 352 |
fi
|
| 353 |
}
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
# Function to create training configuration file
|
| 356 |
create_training_config() {
|
| 357 |
local config_file="$1"
|
|
@@ -499,7 +786,7 @@ print_step "Step 2: Training Configuration"
|
|
| 499 |
echo "=================================="
|
| 500 |
|
| 501 |
show_training_configs
|
| 502 |
-
select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
|
| 503 |
|
| 504 |
get_training_config "$TRAINING_CONFIG_TYPE"
|
| 505 |
|
|
@@ -836,13 +1123,25 @@ print_info "Dataset: $DATASET_NAME"
|
|
| 836 |
print_info "Batch size: $BATCH_SIZE"
|
| 837 |
print_info "Learning rate: $LEARNING_RATE"
|
| 838 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
# Step 15: Start training
|
| 840 |
print_step "Step 15: Starting Training"
|
| 841 |
echo "=============================="
|
| 842 |
|
| 843 |
print_info "Starting training with configuration: $CONFIG_FILE"
|
| 844 |
print_info "Experiment: $EXPERIMENT_NAME"
|
| 845 |
-
print_info "Output:
|
| 846 |
print_info "Trackio: $TRACKIO_URL"
|
| 847 |
|
| 848 |
# Ensure environment variables are available for training
|
|
@@ -852,6 +1151,7 @@ export HF_TOKEN="$HF_TOKEN"
|
|
| 852 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 853 |
export HF_USERNAME="$HF_USERNAME"
|
| 854 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
|
| 855 |
|
| 856 |
# Run the appropriate training script based on model type
|
| 857 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
|
@@ -859,7 +1159,7 @@ if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
|
| 859 |
python scripts/training/train_gpt_oss.py \
|
| 860 |
--config "$CONFIG_FILE" \
|
| 861 |
--experiment-name "$EXPERIMENT_NAME" \
|
| 862 |
-
--output-dir
|
| 863 |
--trackio-url "$TRACKIO_URL" \
|
| 864 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
| 865 |
else
|
|
@@ -867,7 +1167,7 @@ else
|
|
| 867 |
python scripts/training/train.py \
|
| 868 |
--config "$CONFIG_FILE" \
|
| 869 |
--experiment-name "$EXPERIMENT_NAME" \
|
| 870 |
-
--output-dir
|
| 871 |
--trackio-url "$TRACKIO_URL" \
|
| 872 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
| 873 |
fi
|
|
@@ -877,7 +1177,7 @@ print_step "Step 16: Pushing Model to HF Hub"
|
|
| 877 |
echo "====================================="
|
| 878 |
|
| 879 |
print_info "Pushing model to: $REPO_NAME"
|
| 880 |
-
print_info "Checkpoint:
|
| 881 |
|
| 882 |
# Ensure environment variables are available for model push
|
| 883 |
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
|
@@ -886,26 +1186,43 @@ export HF_TOKEN="$HF_TOKEN"
|
|
| 886 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 887 |
export HF_USERNAME="$HF_USERNAME"
|
| 888 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
|
| 889 |
|
| 890 |
# Run the appropriate push script based on model type
|
| 891 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
| 892 |
print_info "Using GPT-OSS specialized push script..."
|
| 893 |
-
python scripts/model_tonic/push_gpt_oss_to_huggingface.py
|
| 894 |
--token "$HF_TOKEN" \
|
| 895 |
--trackio-url "$TRACKIO_URL" \
|
| 896 |
--experiment-name "$EXPERIMENT_NAME" \
|
| 897 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
| 898 |
--author-name "$AUTHOR_NAME" \
|
| 899 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
else
|
| 901 |
print_info "Using standard SmolLM3 push script..."
|
| 902 |
-
python scripts/model_tonic/push_to_huggingface.py
|
| 903 |
--token "$HF_TOKEN" \
|
| 904 |
--trackio-url "$TRACKIO_URL" \
|
| 905 |
--experiment-name "$EXPERIMENT_NAME" \
|
| 906 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
| 907 |
--author-name "$AUTHOR_NAME" \
|
| 908 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
fi
|
| 910 |
|
| 911 |
# Step 16.5: Switch Trackio Space to Read Token (Security)
|
|
@@ -1018,7 +1335,7 @@ fi)
|
|
| 1018 |
|
| 1019 |
## Files Created
|
| 1020 |
- Training configuration: \`$CONFIG_FILE\`
|
| 1021 |
-
- Model checkpoint:
|
| 1022 |
- Training logs: \`training.log\`
|
| 1023 |
- Summary report: \`training_summary.md\`
|
| 1024 |
EOF
|
|
|
|
| 234 |
echo " - 4-bit quantization + reduced LoRA"
|
| 235 |
echo " - Optimized for limited GPU memory"
|
| 236 |
echo ""
|
| 237 |
+
echo "9. GPT-OSS OpenHermes-FR (Recommended)"
|
| 238 |
+
echo " - Model: openai/gpt-oss-20b"
|
| 239 |
+
echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
|
| 240 |
+
echo " - Epochs: 1.5"
|
| 241 |
+
echo " - Batch Size: 6 (effective 36 with accumulation)"
|
| 242 |
+
echo " - Learning Rate: 2.5e-4"
|
| 243 |
+
echo " - Optimized for French language training"
|
| 244 |
+
echo " - Quality filtering enabled"
|
| 245 |
+
echo ""
|
| 246 |
+
echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
|
| 247 |
+
echo " - Model: openai/gpt-oss-20b"
|
| 248 |
+
echo " - Dataset: legmlai/openhermes-fr (200K samples)"
|
| 249 |
+
echo " - Epochs: 1"
|
| 250 |
+
echo " - Batch Size: 2 (effective 32 with accumulation)"
|
| 251 |
+
echo " - Learning Rate: 2e-4"
|
| 252 |
+
echo " - Native MXFP4 quantization"
|
| 253 |
+
echo " - Memory optimized for 40-80GB GPUs"
|
| 254 |
+
echo " - Harmony format compatible"
|
| 255 |
+
echo ""
|
| 256 |
+
echo "10. GPT-OSS Custom Dataset"
|
| 257 |
+
echo " - Model: openai/gpt-oss-20b"
|
| 258 |
+
echo " - Dataset: User-defined (fully customizable)"
|
| 259 |
+
echo " - Epochs: Configurable"
|
| 260 |
+
echo " - Batch Size: Configurable"
|
| 261 |
+
echo " - Learning Rate: Configurable"
|
| 262 |
+
echo " - Maximum flexibility with all parameters"
|
| 263 |
+
echo ""
|
| 264 |
+
echo "11. Custom Configuration"
|
| 265 |
echo " - User-defined parameters"
|
| 266 |
echo ""
|
| 267 |
}
|
|
|
|

             MAX_SEQ_LENGTH=1024
             CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
             ;;
+        "GPT-OSS OpenHermes-FR (Recommended)")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"
+            MAX_EPOCHS=1.5
+            BATCH_SIZE=6
+            GRADIENT_ACCUMULATION_STEPS=6
+            LEARNING_RATE=2.5e-4
+            MAX_SEQ_LENGTH=3072
+            CONFIG_FILE="config/train_gpt_oss_openhermes_fr.py"
+            ;;
+        "GPT-OSS OpenHermes-FR Memory Optimized")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"
+            MAX_EPOCHS=1
+            BATCH_SIZE=2
+            GRADIENT_ACCUMULATION_STEPS=16
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=1024
+            CONFIG_FILE="config/train_gpt_oss_openhermes_fr_memory_optimized.py"
+            ;;
+        "GPT-OSS Custom Dataset")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="legmlai/openhermes-fr"  # Will be customizable
+            MAX_EPOCHS=1
+            BATCH_SIZE=4
+            GRADIENT_ACCUMULATION_STEPS=4
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=2048
+            CONFIG_FILE="config/train_gpt_oss_custom.py"
+            get_custom_dataset_config
+            ;;
         "Custom Configuration")
             get_custom_config
             ;;
     esac
 }

+# Function to get custom dataset configuration
+get_custom_dataset_config() {
+    print_step "GPT-OSS Custom Configuration"
+    echo "======================================"
+
+    echo "Configure your GPT-OSS training:"
+    echo ""
+
+    # Dataset Configuration
+    print_info "📊 Dataset Configuration"
+    get_input "Dataset name (HuggingFace format: username/dataset)" "legmlai/openhermes-fr" DATASET_NAME
+    get_input "Dataset split" "train" DATASET_SPLIT
+
+    echo ""
+    echo "Dataset format options:"
+    echo "1. OpenHermes-FR (prompt + accepted_completion fields)"
+    echo "2. Messages format (chat conversations)"
+    echo "3. Text format (plain text field)"
+    echo "4. Custom format (specify field names)"
+    echo ""
+
+    select_option "Select dataset format:" "OpenHermes-FR" "Messages format" "Text format" "Custom format" DATASET_FORMAT
+
+    case "$DATASET_FORMAT" in
+        "OpenHermes-FR")
+            INPUT_FIELD="prompt"
+            TARGET_FIELD="accepted_completion"
+            DATASET_FORMAT_CODE="openhermes_fr"
+            FILTER_BAD_ENTRIES="True"
+            ;;
+        "Messages format")
+            INPUT_FIELD="messages"
+            TARGET_FIELD=""
+            DATASET_FORMAT_CODE="messages"
+            FILTER_BAD_ENTRIES="False"
+            ;;
+        "Text format")
+            INPUT_FIELD="text"
+            TARGET_FIELD=""
+            DATASET_FORMAT_CODE="text"
+            FILTER_BAD_ENTRIES="False"
+            ;;
+        "Custom format")
+            get_input "Input field name" "prompt" INPUT_FIELD
+            get_input "Target field name (leave empty if not needed)" "accepted_completion" TARGET_FIELD
+            DATASET_FORMAT_CODE="custom"
+            get_input "Filter bad entries? (True/False)" "False" FILTER_BAD_ENTRIES
+            ;;
+    esac
+
+    # Dataset Filtering Options
+    echo ""
+    print_info "🔍 Dataset Filtering Options"
+    get_input "Maximum samples to use (leave empty for all)" "" MAX_SAMPLES
+    get_input "Minimum sequence length" "10" MIN_LENGTH
+    get_input "Maximum sequence length (leave empty for auto)" "" MAX_LENGTH
+
+    # Training Hyperparameters
+    echo ""
+    print_info "⚙️ Training Hyperparameters"
+    get_input "Number of epochs" "1.0" NUM_EPOCHS
+    get_input "Batch size per device" "4" BATCH_SIZE
+    get_input "Gradient accumulation steps" "4" GRAD_ACCUM_STEPS
+    get_input "Learning rate" "2e-4" LEARNING_RATE
+    get_input "Minimum learning rate" "2e-5" MIN_LR
+    get_input "Weight decay" "0.01" WEIGHT_DECAY
+    get_input "Warmup ratio" "0.03" WARMUP_RATIO
+
+    # Sequence Length
+    echo ""
+    print_info "📏 Sequence Configuration"
+    get_input "Maximum sequence length" "2048" MAX_SEQ_LENGTH
+
+    # LoRA Configuration
+    echo ""
+    print_info "🎛️ LoRA Configuration"
+    get_input "LoRA rank" "16" LORA_RANK
+    get_input "LoRA alpha" "32" LORA_ALPHA
+    get_input "LoRA dropout" "0.05" LORA_DROPOUT
+
+    # Memory & Performance
+    echo ""
+    print_info "💾 Memory & Performance"
+    select_option "Mixed precision:" "BF16 (recommended)" "FP16" "FP32" MIXED_PRECISION
+    get_input "Data loading workers" "4" NUM_WORKERS
+    select_option "Quantization:" "MXFP4 (default)" "4-bit BNB" "None" QUANTIZATION_TYPE
+
+    # Advanced Options
+    echo ""
+    echo "Advanced options (press Enter for defaults):"
+    get_input "Max gradient norm" "1.0" MAX_GRAD_NORM
+    get_input "Logging steps" "10" LOGGING_STEPS
+    get_input "Evaluation steps" "100" EVAL_STEPS
+    get_input "Save steps" "500" SAVE_STEPS
+
+    # Update the custom config file with user's choices
+    update_enhanced_gpt_oss_config
+}
+
 # Function to get custom configuration
 get_custom_config() {
     print_step "Custom Configuration Setup"

     fi
 }
 
+# Function to update enhanced GPT-OSS config with user choices
+update_enhanced_gpt_oss_config() {
+    print_info "Generating enhanced custom GPT-OSS configuration..."
+
+    # Process mixed precision setting
+    case "$MIXED_PRECISION" in
+        "BF16 (recommended)")
+            FP16="False"
+            BF16="True"
+            ;;
+        "FP16")
+            FP16="True"
+            BF16="False"
+            ;;
+        "FP32")
+            FP16="False"
+            BF16="False"
+            ;;
+    esac
+
+    # Process quantization setting
+    case "$QUANTIZATION_TYPE" in
+        "MXFP4 (default)")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": True, "load_in_4bit": False}'
+            ;;
+        "4-bit BNB")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
+            ;;
+        "None")
+            USE_QUANTIZATION="False"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": False}'
+            ;;
+    esac
+
+    # Create enhanced config file with all user choices
+    cat > "$CONFIG_FILE" << EOF
+"""
+GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
+Dataset: $DATASET_NAME ($DATASET_FORMAT)
+Optimized for: ${DATASET_FORMAT} format with full customization
+"""
+
+from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+# Create enhanced config with all customizations
+config = GPTOSSEnhancedCustomConfig(
+    # ============================================================================
+    # DATASET CONFIGURATION
+    # ============================================================================
+    dataset_name="$DATASET_NAME",
+    dataset_split="$DATASET_SPLIT",
+    dataset_format="$DATASET_FORMAT_CODE",
+    input_field="$INPUT_FIELD",
+    target_field=$(if [ -n "$TARGET_FIELD" ]; then echo "\"$TARGET_FIELD\""; else echo "None"; fi),
+    filter_bad_entries=$FILTER_BAD_ENTRIES,
+    max_samples=$(if [ -n "$MAX_SAMPLES" ]; then echo "$MAX_SAMPLES"; else echo "None"; fi),
+    min_length=$MIN_LENGTH,
+    max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
+
+    # ============================================================================
+    # TRAINING HYPERPARAMETERS
+    # ============================================================================
+    num_train_epochs=$NUM_EPOCHS,
+    batch_size=$BATCH_SIZE,
+    gradient_accumulation_steps=$GRAD_ACCUM_STEPS,
+    learning_rate=$LEARNING_RATE,
+    min_lr=$MIN_LR,
+    weight_decay=$WEIGHT_DECAY,
+    warmup_ratio=$WARMUP_RATIO,
+    max_grad_norm=$MAX_GRAD_NORM,
+
+    # ============================================================================
+    # MODEL CONFIGURATION
+    # ============================================================================
+    max_seq_length=$MAX_SEQ_LENGTH,
+
+    # ============================================================================
+    # MIXED PRECISION
+    # ============================================================================
+    fp16=$FP16,
+    bf16=$BF16,
+
+    # ============================================================================
+    # LORA CONFIGURATION
+    # ============================================================================
+    lora_config={
+        "r": $LORA_RANK,
+        "lora_alpha": $LORA_ALPHA,
+        "lora_dropout": $LORA_DROPOUT,
+        "target_modules": "all-linear",
+        "bias": "none",
+        "task_type": "CAUSAL_LM",
+    },
+
+    # ============================================================================
+    # QUANTIZATION CONFIGURATION
+    # ============================================================================
+    use_quantization=$USE_QUANTIZATION,
+    quantization_config=$QUANTIZATION_CONFIG,
+
+    # ============================================================================
+    # PERFORMANCE CONFIGURATION
+    # ============================================================================
+    dataloader_num_workers=$NUM_WORKERS,
+    dataloader_pin_memory=True,
+    group_by_length=True,
+
+    # ============================================================================
+    # LOGGING & EVALUATION
+    # ============================================================================
+    logging_steps=$LOGGING_STEPS,
+    eval_steps=$EVAL_STEPS,
+    save_steps=$SAVE_STEPS,
+
+    # ============================================================================
+    # RUNTIME CONFIGURATION
+    # ============================================================================
+    experiment_name="$EXPERIMENT_NAME",
+    trackio_url="$TRACKIO_URL",
+    dataset_repo="$TRACKIO_DATASET_REPO",
+    enable_tracking=True,
+)
+EOF
+
+    print_status "Enhanced GPT-OSS configuration generated successfully!"
+    print_info "Configuration saved to: $CONFIG_FILE"
+}
+
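The heredoc above writes an ordinary Python module whose top level builds a `config` object, so any consumer only needs to import that module and read its `config` attribute. A minimal sketch of loading it by path (the loader mechanics here are illustrative and assume the repo root is the working directory; the repo's training scripts may resolve configs differently):

    # Illustrative sketch: load a launch.sh-generated config module by path
    # and read the `config` object it builds at import time.
    import importlib.util

    spec = importlib.util.spec_from_file_location(
        "generated_config", "config/train_gpt_oss_custom.py"
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # runs the GPTOSSEnhancedCustomConfig(...) call

    config = module.config
    print(config.dataset_name, config.learning_rate)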
 # Function to create training configuration file
 create_training_config() {
     local config_file="$1"

     echo "=================================="
 
     show_training_configs
+    select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE
 
     get_training_config "$TRAINING_CONFIG_TYPE"

 print_info "Batch size: $BATCH_SIZE"
 print_info "Learning rate: $LEARNING_RATE"
 
+# Step 14.5: Define Output Directory
+print_step "Step 14.5: Output Directory Configuration"
+echo "============================================="
+
+# Define the output directory for training results
+OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
+print_info "Training output directory: $OUTPUT_DIR"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+print_status "Output directory created: $OUTPUT_DIR"
+
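The dynamic results directory is just the experiment name plus a timestamp, so every run gets its own folder. For reference, the same scheme expressed in Python (the experiment name is a placeholder):

    # Mirrors OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
    # followed by mkdir -p "$OUTPUT_DIR".
    from datetime import datetime
    from pathlib import Path

    experiment_name = "openhermes_fr_run"  # placeholder value
    output_dir = Path("./outputs") / f"{experiment_name}_{datetime.now():%Y%m%d_%H%M%S}"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(output_dir)  # e.g. outputs/openhermes_fr_run_20250807_142500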
 # Step 15: Start training
 print_step "Step 15: Starting Training"
 echo "=============================="
 
 print_info "Starting training with configuration: $CONFIG_FILE"
 print_info "Experiment: $EXPERIMENT_NAME"
+print_info "Output: $OUTPUT_DIR"
 print_info "Trackio: $TRACKIO_URL"
 
 # Ensure environment variables are available for training
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+export OUTPUT_DIR="$OUTPUT_DIR"
 
 # Run the appropriate training script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
     python scripts/training/train_gpt_oss.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
+        --output-dir "$OUTPUT_DIR" \
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 else
     python scripts/training/train.py \
         --config "$CONFIG_FILE" \
         --experiment-name "$EXPERIMENT_NAME" \
+        --output-dir "$OUTPUT_DIR" \
         --trackio-url "$TRACKIO_URL" \
         --trainer-type "$TRAINER_TYPE_LOWER"
 fi

 echo "====================================="
 
 print_info "Pushing model to: $REPO_NAME"
+print_info "Checkpoint: $OUTPUT_DIR"
 
 # Ensure environment variables are available for model push
 export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
 export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
 export HF_USERNAME="$HF_USERNAME"
 export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+export OUTPUT_DIR="$OUTPUT_DIR"
 
 # Run the appropriate push script based on model type
 if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
     print_info "Using GPT-OSS specialized push script..."
+    python scripts/model_tonic/push_gpt_oss_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
-        --model-description "$MODEL_DESCRIPTION"
+        --model-description "$MODEL_DESCRIPTION" \
+        --training-config-type "$TRAINING_CONFIG_TYPE" \
+        --model-name "$MODEL_NAME" \
+        --dataset-name "$DATASET_NAME" \
+        --batch-size "$BATCH_SIZE" \
+        --learning-rate "$LEARNING_RATE" \
+        --max-epochs "$MAX_EPOCHS" \
+        --max-seq-length "$MAX_SEQ_LENGTH" \
+        --trainer-type "$TRAINER_TYPE"
 else
     print_info "Using standard SmolLM3 push script..."
-    python scripts/model_tonic/push_to_huggingface.py
+    python scripts/model_tonic/push_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
         --token "$HF_TOKEN" \
         --trackio-url "$TRACKIO_URL" \
         --experiment-name "$EXPERIMENT_NAME" \
         --dataset-repo "$TRACKIO_DATASET_REPO" \
         --author-name "$AUTHOR_NAME" \
-        --model-description "$MODEL_DESCRIPTION"
+        --model-description "$MODEL_DESCRIPTION" \
+        --training-config-type "$TRAINING_CONFIG_TYPE" \
+        --model-name "$MODEL_NAME" \
+        --dataset-name "$DATASET_NAME" \
+        --batch-size "$BATCH_SIZE" \
+        --learning-rate "$LEARNING_RATE" \
+        --max-epochs "$MAX_EPOCHS" \
+        --max-seq-length "$MAX_SEQ_LENGTH" \
+        --trainer-type "$TRAINER_TYPE"
 fi
 
 # Step 16.5: Switch Trackio Space to Read Token (Security)
 
@@ -1018,7 +1335,7 @@ fi)
 ## Files Created
 - Training configuration: \`$CONFIG_FILE\`
-- Model checkpoint:
+- Model checkpoint: \`$OUTPUT_DIR/\`
 - Training logs: \`training.log\`
 - Summary report: \`training_summary.md\`
 EOF
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED

@@ -43,8 +43,59 @@ def merge_lora_weights(checkpoint_path, base_model_name, output_path):
 
     return model, tokenizer
 
-def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
-    """Create a comprehensive model card for GPT-OSS models"""
+def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description, training_config_type=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
+    """Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
+
+    try:
+        # Import the model card generator
+        import sys
+        import os
+        sys.path.append(os.path.join(os.path.dirname(__file__)))
+        from generate_model_card import ModelCardGenerator, create_default_variables
+
+        # Create generator
+        generator = ModelCardGenerator()
+
+        # Create variables for the model card
+        variables = create_default_variables()
+
+        # Update with GPT-OSS specific values
+        variables.update({
+            "repo_name": model_name,
+            "model_name": model_name.split('/')[-1],
+            "experiment_name": experiment_name or "gpt_oss_finetune",
+            "dataset_repo": dataset_repo,
+            "author_name": author_name or "GPT-OSS Fine-tuner",
+            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
+            "training_config_type": training_config_type or "GPT-OSS Configuration",
+            "base_model": "openai/gpt-oss-20b",
+            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
+            "trainer_type": trainer_type or "SFTTrainer",
+            "batch_size": str(batch_size) if batch_size else "4",
+            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
+            "max_epochs": str(max_epochs) if max_epochs else "1",
+            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
+            "hardware_info": "GPU (H100/A100)",
+            "trackio_url": trackio_url or "N/A",
+            "training_loss": "N/A",
+            "validation_loss": "N/A",
+            "perplexity": "N/A",
+            "quantized_models": False
+        })
+
+        # Generate the model card
+        model_card_content = generator.generate_model_card(variables)
+
+        print("✅ Model card generated using generate_model_card.py")
+        return model_card_content
+
+    except Exception as e:
+        print(f"❌ Failed to generate model card with generator: {e}")
+        print("🔄 Falling back to original GPT-OSS model card")
+        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
+
+def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
+    """Create the original GPT-OSS model card as fallback"""
 
     card_content = f"""---
 language:

@@ -196,7 +247,7 @@ This model is licensed under the MIT License.
 
     return card_content
 
-def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
+def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description, training_config_type=None, model_name=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
     """Push GPT-OSS model to Hugging Face Hub"""
 
     print("=== GPT-OSS Model Push Pipeline ===")

@@ -230,7 +281,14 @@ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experi
         trackio_url=trackio_url,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=training_config_type,
+        dataset_name=dataset_name,
+        batch_size=batch_size,
+        learning_rate=learning_rate,
+        max_epochs=max_epochs,
+        max_seq_length=max_seq_length,
+        trainer_type=trainer_type
     )
 
     # Save model card

@@ -291,6 +349,14 @@ def main():
     parser.add_argument("--dataset-repo", help="Dataset repository")
     parser.add_argument("--author-name", help="Author name")
     parser.add_argument("--model-description", help="Model description")
+    parser.add_argument("--training-config-type", help="Training configuration type")
+    parser.add_argument("--model-name", help="Base model name")
+    parser.add_argument("--dataset-name", help="Dataset name")
+    parser.add_argument("--batch-size", help="Batch size")
+    parser.add_argument("--learning-rate", help="Learning rate")
+    parser.add_argument("--max-epochs", help="Maximum epochs")
+    parser.add_argument("--max-seq-length", help="Maximum sequence length")
+    parser.add_argument("--trainer-type", help="Trainer type")
 
     args = parser.parse_args()

@@ -308,7 +374,15 @@ def main():
         experiment_name=experiment_name,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     sys.exit(0 if success else 1)
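The generator-with-fallback flow in create_gpt_oss_model_card reduces to one reusable pattern: try the shared generate_model_card.py machinery, and on any failure fall back to the legacy inline template. A condensed, self-contained sketch of just that pattern (the function names below are illustrative, not from the repo):

    # Condensed sketch of the fallback pattern used by create_gpt_oss_model_card.
    # `build_legacy_card` stands in for _create_original_gpt_oss_model_card.
    def build_card(variables, build_legacy_card):
        try:
            from generate_model_card import ModelCardGenerator  # may be absent at runtime
            return ModelCardGenerator().generate_model_card(variables)
        except Exception as exc:
            print(f"Card generator unavailable ({exc}); using legacy template")
            return build_legacy_card(variables)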
scripts/model_tonic/push_to_huggingface.py
CHANGED

@@ -62,7 +62,15 @@ class HuggingFacePusher:
         dataset_repo: Optional[str] = None,
         hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
-        model_description: Optional[str] = None
+        model_description: Optional[str] = None,
+        training_config_type: Optional[str] = None,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        batch_size: Optional[str] = None,
+        learning_rate: Optional[str] = None,
+        max_epochs: Optional[str] = None,
+        max_seq_length: Optional[str] = None,
+        trainer_type: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         self.repo_name = repo_name

@@ -73,6 +81,16 @@ class HuggingFacePusher:
         self.author_name = author_name
         self.model_description = model_description
 
+        # Training configuration details for model card generation
+        self.training_config_type = training_config_type
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_epochs = max_epochs
+        self.max_seq_length = max_seq_length
+        self.trainer_type = trainer_type
+
         # HF Datasets configuration
         self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         self.hf_token = hf_token or os.getenv('HF_TOKEN')

@@ -156,9 +174,53 @@ class HuggingFacePusher:
         return True
 
     def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
-        """Create a comprehensive model card using the
-
-
+        """Create a comprehensive model card using the generate_model_card.py script"""
+        try:
+            # Import the model card generator
+            import sys
+            sys.path.append(os.path.join(os.path.dirname(__file__)))
+            from generate_model_card import ModelCardGenerator, create_default_variables
+
+            # Create generator
+            generator = ModelCardGenerator()
+
+            # Create variables for the model card
+            variables = create_default_variables()
+
+            # Update with actual values
+            variables.update({
+                "repo_name": self.repo_name,
+                "model_name": self.repo_name.split('/')[-1],
+                "experiment_name": self.experiment_name or "model_push",
+                "dataset_repo": self.dataset_repo,
+                "author_name": self.author_name or "Model Author",
+                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
+                "training_config_type": self.training_config_type or "Custom Configuration",
+                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
+                "dataset_name": self.dataset_name or "Custom Dataset",
+                "trainer_type": self.trainer_type or "SFTTrainer",
+                "batch_size": str(self.batch_size) if self.batch_size else "8",
+                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
+                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
+                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
+                "hardware_info": self._get_hardware_info(),
+                "trackio_url": self.trackio_url or "N/A",
+                "training_loss": str(results.get('train_loss', 'N/A')),
+                "validation_loss": str(results.get('eval_loss', 'N/A')),
+                "perplexity": str(results.get('perplexity', 'N/A')),
+                "quantized_models": False  # Set to True if quantized models are available
+            })
+
+            # Generate the model card
+            model_card_content = generator.generate_model_card(variables)
+
+            logger.info("✅ Model card generated using generate_model_card.py")
+            return model_card_content
+
+        except Exception as e:
+            logger.error(f"❌ Failed to generate model card with generator: {e}")
+            logger.info("🔄 Falling back to simple model card")
+            return self._create_simple_model_card(training_config, results)
 
     def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
         """Create a simple model card without complex YAML to avoid formatting issues"""

@@ -531,6 +593,14 @@ def parse_args():
    parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
    parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
+    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
+    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
+    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
+    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
+    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
 
     return parser.parse_args()

@@ -558,7 +628,15 @@ def main():
         dataset_repo=args.dataset_repo,
         hf_token=args.hf_token,
         author_name=args.author_name,
-        model_description=args.model_description
+        model_description=args.model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     # Push model
scripts/training/train_gpt_oss.py
CHANGED

@@ -95,12 +95,215 @@ def setup_lora_for_gpt_oss(model, config):
 
     return peft_model
 
-def
-    """Load
-
-
-
+def load_dataset_from_config(config):
+    """Load dataset based on configuration"""
+
+    dataset_name = getattr(config, 'dataset_name', 'HuggingFaceH4/Multilingual-Thinking')
+    dataset_split = getattr(config, 'dataset_split', 'train')
+    dataset_config = getattr(config, 'dataset_config', None)
+
+    print(f"Loading dataset: {dataset_name}")
+    print(f"Dataset split: {dataset_split}")
+    if dataset_config:
+        print(f"Dataset config: {dataset_config}")
+
+    # Load the dataset
+    if dataset_config:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+    else:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    print(f"Original dataset size: {len(dataset)} examples")
+
+    # Apply filtering based on configuration
+    dataset = apply_dataset_filtering(dataset, config)
+
+    # Apply dataset processing based on format
+    dataset = process_dataset_format(dataset, config)
+
+    print(f"Final dataset size: {len(dataset)} examples")
+
+    return dataset
+
+def apply_dataset_filtering(dataset, config):
+    """Apply filtering based on configuration"""
+
+    # Filter bad entries if specified
+    if getattr(config, 'filter_bad_entries', False):
+        bad_entry_field = getattr(config, 'bad_entry_field', 'bad_entry')
+        bad_prompt_field = getattr(config, 'bad_prompt_field', 'bad_prompt_detected')
+        bad_response_field = getattr(config, 'bad_response_field', 'bad_response_detected')
+
+        original_size = len(dataset)
+
+        # Filter out bad entries
+        if bad_entry_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_entry_field, False))
+            print(f"Filtered {original_size - len(dataset)} bad entries")
+
+        # Filter out bad prompts
+        if bad_prompt_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_prompt_field, False))
+            print(f"Filtered bad prompts, remaining: {len(dataset)} examples")
+
+        # Filter out bad responses
+        if bad_response_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_response_field, False))
+            print(f"Filtered bad responses, remaining: {len(dataset)} examples")
+
+    # Apply length filtering
+    min_length = getattr(config, 'min_length', 10)
+    max_length = getattr(config, 'max_length', None)
+
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+
+    if min_length > 0 or max_length:
+        def length_filter(example):
+            input_len = len(example.get(input_field, ''))
+            target_len = len(example.get(target_field, ''))
+            total_len = input_len + target_len
+
+            if total_len < min_length:
+                return False
+            if max_length and total_len > max_length:
+                return False
+            return True
+
+        original_size = len(dataset)
+        dataset = dataset.filter(length_filter)
+        print(f"Length filtering: {original_size} -> {len(dataset)} examples")
+
+    # Apply sampling if specified
+    max_samples = getattr(config, 'max_samples', None)
+    if max_samples and len(dataset) > max_samples:
+        dataset = dataset.shuffle(seed=42).select(range(max_samples))
+        print(f"Sampled {max_samples} examples from dataset")
+
+    return dataset
+
+def format_gpt_oss_harmony(prompt, completion, add_eos_token=True):
+    """
+    Format data for GPT-OSS Harmony format following the exact template structure.
+    Based on: https://huggingface.co/openai/gpt-oss-20b/raw/main/chat_template.jinja
+    """
+    # GPT-OSS Harmony format structure (exact template compliance)
+    # User message: <|start|>user<|message|>content<|end|>
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|end|> (inference)
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|return|> (training)
+
+    harmony_text = f"<|start|>user<|message|>{prompt}<|end|><|start|>assistant<|channel|>final<|message|>{completion}"
+
+    if add_eos_token:
+        # Use <|return|> for training as per template specification
+        # This indicates the end of generation in training
+        harmony_text += "<|return|>"
+    else:
+        # Use <|end|> for inference
+        harmony_text += "<|end|>"
+
+    return harmony_text
+
+def process_dataset_format(dataset, config):
+    """Process dataset based on format configuration with exact GPT-OSS Harmony compliance"""
+
+    dataset_format = getattr(config, 'dataset_format', 'openhermes_fr')
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+    concatenate_fields = getattr(config, 'concatenate_fields', True)
+    field_separator = getattr(config, 'field_separator', '\n\n### Response:\n')
+    add_eos_token = getattr(config, 'add_eos_token', True)
+    use_harmony_format = getattr(config, 'use_harmony_format', True)
+
+    print(f"Processing dataset format: {dataset_format}")
+    print(f"Input field: {input_field}, Target field: {target_field}")
+    print(f"GPT-OSS Harmony Format: {'Enabled' if use_harmony_format else 'Disabled'}")
+
+    if dataset_format == "openhermes_fr":
+        # Process OpenHermes-FR format: prompt + accepted_completion
+        def format_openhermes_fr(example):
+            prompt = example.get(input_field, '')
+            completion = example.get(target_field, '')
+
+            if concatenate_fields:
+                if use_harmony_format:
+                    # Use exact GPT-OSS Harmony format from template
+                    text = format_gpt_oss_harmony(prompt, completion, add_eos_token)
+                else:
+                    # Fallback to standard format with separator
+                    text = prompt + field_separator + completion
+                    if add_eos_token:
+                        text += "</s>"
+
+                return {"text": text}
+            else:
+                # Keep separate for more advanced training setups
+                return {
+                    "input": prompt,
+                    "output": completion
+                }
+
+        dataset = dataset.map(format_openhermes_fr, remove_columns=dataset.column_names)
+
+    elif dataset_format == "messages":
+        # Process messages format (like HuggingFaceH4/Multilingual-Thinking)
+        def format_messages(example):
+            messages = example.get(input_field, [])
+
+            if use_harmony_format and len(messages) >= 2:
+                # Extract user and assistant messages for harmony format
+                user_message = ""
+                assistant_message = ""
+
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+
+                    if role == "user":
+                        user_message = content
+                    elif role == "assistant":
+                        assistant_message = content
+
+                if user_message and assistant_message:
+                    # Use GPT-OSS Harmony format
+                    text = format_gpt_oss_harmony(user_message, assistant_message, add_eos_token)
+                else:
+                    # Fallback to simple concatenation
+                    text = ""
+                    for message in messages:
+                        role = message.get("role", "")
+                        content = message.get("content", "")
+                        text += f"{role}: {content}\n"
+                    if add_eos_token:
+                        text += "</s>"
+            else:
+                # Standard format - convert messages to simple text
+                text = ""
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+                    text += f"{role}: {content}\n"
+                if add_eos_token:
+                    text += "</s>"
+
+            return {"text": text}
+
+        dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
+
+    elif dataset_format == "text":
+        # Process plain text format
+        text_field = input_field
+        def format_text(example):
+            text = example.get(text_field, '')
+            if add_eos_token:
+                text += "</s>"
+            return {"text": text}
+
+        dataset = dataset.map(format_text, remove_columns=dataset.column_names)
+
+    elif dataset_format == "custom":
+        # Custom format - user handles this in their config
+        print("Using custom dataset format - no automatic processing")
 
     return dataset
 

@@ -127,25 +330,111 @@ def setup_trackio_tracking(config):
 
     return trackio_client
 
-def create_sft_config(config):
-    """Create SFTConfig for GPT-OSS training"""
-
-    print("Creating SFT configuration...")
+def create_sft_config(config, output_dir):
+    """Create enhanced SFTConfig for GPT-OSS training"""
+
+    print("Creating enhanced SFT configuration...")
+
+    # Extract training parameters from config with enhanced defaults
+    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
+    max_steps = getattr(config, 'max_steps', None)
+    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
+    warmup_steps = getattr(config, 'warmup_steps', None)
+
+    # Learning rate configuration
+    learning_rate = config.learning_rate
+    lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
+    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})
+
+    # Batch configuration
+    per_device_train_batch_size = config.batch_size
+    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
+    gradient_accumulation_steps = config.gradient_accumulation_steps
+
+    # Evaluation and logging
+    eval_strategy = getattr(config, 'eval_strategy', 'steps')
+    eval_steps = getattr(config, 'eval_steps', 100)
+    logging_steps = getattr(config, 'logging_steps', 10)
+
+    # Saving configuration
+    save_strategy = getattr(config, 'save_strategy', 'steps')
+    save_steps = getattr(config, 'save_steps', 500)
+    save_total_limit = getattr(config, 'save_total_limit', 3)
+
+    # Mixed precision
+    fp16 = getattr(config, 'fp16', False)
+    bf16 = getattr(config, 'bf16', True)
+
+    # Regularization
+    weight_decay = getattr(config, 'weight_decay', 0.01)
+    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
+
+    # HuggingFace Hub integration
+    push_to_hub = getattr(config, 'push_to_hub', False)
+
+    print(f" • Epochs: {num_train_epochs}")
+    print(f" • Learning rate: {learning_rate}")
+    print(f" • Batch size: {per_device_train_batch_size}")
+    print(f" • Gradient accumulation: {gradient_accumulation_steps}")
+    print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")
 
     sft_config = SFTConfig(
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Training duration
+        num_train_epochs=num_train_epochs,
+        max_steps=max_steps,
+
+        # Learning rate
+        learning_rate=learning_rate,
+        lr_scheduler_type=lr_scheduler_type,
+        lr_scheduler_kwargs=lr_scheduler_kwargs,
+        warmup_ratio=warmup_ratio,
+        warmup_steps=warmup_steps,
+
+        # Batch configuration
+        per_device_train_batch_size=per_device_train_batch_size,
+        per_device_eval_batch_size=per_device_eval_batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+
+        # Model configuration
+        max_seq_length=config.max_seq_length,
+        gradient_checkpointing=getattr(config, 'use_gradient_checkpointing', True),
+
+        # Mixed precision
+        fp16=fp16,
+        bf16=bf16,
+
+        # Regularization
+        weight_decay=weight_decay,
+        max_grad_norm=max_grad_norm,
+
+        # Evaluation
+        evaluation_strategy=eval_strategy,
+        eval_steps=eval_steps,
+
+        # Logging
+        logging_steps=logging_steps,
+
+        # Saving
+        save_strategy=save_strategy,
+        save_steps=save_steps,
+        save_total_limit=save_total_limit,
+
+        # Output
+        output_dir=output_dir,
+
+        # Data loading
+        dataloader_num_workers=getattr(config, 'dataloader_num_workers', 4),
+        dataloader_pin_memory=getattr(config, 'dataloader_pin_memory', True),
+
+        # Performance
+        group_by_length=getattr(config, 'group_by_length', True),
+        remove_unused_columns=getattr(config, 'remove_unused_columns', True),
+
+        # HuggingFace Hub
+        push_to_hub=push_to_hub,
+
+        # Monitoring
+        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
     )
 
     return sft_config
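The "Effective batch size" printed by create_sft_config is simply the per-device batch size multiplied by the gradient-accumulation steps (and by the device count on multi-GPU runs). Checking the arithmetic against the OpenHermes-FR preset from launch.sh:

    # Effective batch size behind create_sft_config's print statements.
    per_device_train_batch_size = 6  # "GPT-OSS OpenHermes-FR (Recommended)" preset
    gradient_accumulation_steps = 6
    num_devices = 1                  # assumed single-GPU run
    print(per_device_train_batch_size * gradient_accumulation_steps * num_devices)  # 36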

@@ -193,13 +482,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     peft_model = setup_lora_for_gpt_oss(model, config)
 
     # Load dataset
-    dataset =
+    dataset = load_dataset_from_config(config)
 
     # Setup Trackio tracking
     trackio_client = setup_trackio_tracking(config)
 
     # Create SFT configuration
-    sft_config = create_sft_config(config)
+    sft_config = create_sft_config(config, output_dir)
 
     # Create trainer
     print("Creating SFT trainer...")
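To make the Harmony formatting concrete, this is the exact string format_gpt_oss_harmony builds for a toy prompt/completion pair in training mode (add_eos_token=True); inference mode swaps the trailing <|return|> for <|end|>:

    # Output of format_gpt_oss_harmony("Bonjour, ça va ?", "Très bien, merci !", True):
    harmony_text = (
        "<|start|>user<|message|>Bonjour, ça va ?<|end|>"
        "<|start|>assistant<|channel|>final<|message|>Très bien, merci !<|return|>"
    )
    print(harmony_text)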
templates/spaces/demo_gpt/README.md
CHANGED

@@ -6,7 +6,7 @@ colorTo: pink
 sdk: gradio
 sdk_version: 5.40.0
 app_file: app.py
-pinned:
+pinned: false
 short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
 ---
 