sets min_lr
config/train_gpt_oss_openhermes_fr_memory_optimized.py CHANGED

@@ -194,7 +194,7 @@ config = GPTOSSEnhancedCustomConfig(
 
     scheduler="cosine_with_min_lr",  # Stable scheduler for single epoch
     lr_scheduler_kwargs={
-        "
+        "min_lr": 2e-6,  # Explicit absolute floor (matches min_lr above)
        "warmup_steps": None,  # Use warmup_ratio instead
     },
 
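With lr_scheduler_type="cosine_with_min_lr", transformers requires exactly one of min_lr (an absolute floor) or min_lr_rate (a fraction of the peak learning rate) in the scheduler kwargs, and raises a ValueError at scheduler creation if neither is present. Below is a minimal sketch of what the new config entry does, not part of this commit, assuming a transformers version (>= 4.38) that ships this scheduler; the model and step counts are stand-ins:

import torch
from transformers import get_scheduler

model = torch.nn.Linear(8, 8)  # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# "cosine_with_min_lr" decays from 2e-4 toward the 2e-6 floor after warmup.
scheduler = get_scheduler(
    "cosine_with_min_lr",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=100,
    scheduler_specific_kwargs={"min_lr": 2e-6},  # absolute floor, as in the config
)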
scripts/training/train_gpt_oss.py CHANGED

@@ -139,6 +139,24 @@ def load_dataset_from_config(config):
 
     return dataset
 
+def build_scheduler_kwargs(config):
+    """Construct lr_scheduler_kwargs ensuring one of min_lr or min_lr_rate is set.
+    Falls back to config.min_lr or a default rate of 0.1.
+    """
+    skw = getattr(config, 'lr_scheduler_kwargs', {}) or {}
+    if not isinstance(skw, dict):
+        skw = {}
+    min_lr_cfg = getattr(config, 'min_lr', 1e-6)
+    if 'min_lr' not in skw and 'min_lr_rate' not in skw:
+        try:
+            if min_lr_cfg is not None:
+                skw['min_lr'] = float(min_lr_cfg)
+            else:
+                skw['min_lr_rate'] = 0.1
+        except Exception:
+            skw['min_lr_rate'] = 0.001
+    return skw
+
 def apply_dataset_filtering(dataset, config):
     """Apply filtering based on configuration"""
 
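A quick illustration (not part of the diff) of the helper's fallback behaviour, using types.SimpleNamespace as a stand-in for the real config object; note that the except branch falls back to a rate of 0.001, smaller than the 0.1 the docstring mentions:

from types import SimpleNamespace
# Assumes build_scheduler_kwargs from the hunk above is in scope.

cfg = SimpleNamespace(lr_scheduler_kwargs={"min_lr_rate": 0.05}, min_lr=2e-6)
print(build_scheduler_kwargs(cfg))  # {'min_lr_rate': 0.05} -- explicit kwargs win

cfg = SimpleNamespace(lr_scheduler_kwargs=None, min_lr=2e-6)
print(build_scheduler_kwargs(cfg))  # {'min_lr': 2e-06} -- config.min_lr becomes the absolute floor

cfg = SimpleNamespace(lr_scheduler_kwargs={}, min_lr=None)
print(build_scheduler_kwargs(cfg))  # {'min_lr_rate': 0.1} -- an explicit None falls back to a relative rate

The remaining hunks wire the helper into create_sft_config: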
@@ -490,6 +508,7 @@ def create_sft_config(config, output_dir):
     # Learning rate configuration
     learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
+    lr_scheduler_kwargs = build_scheduler_kwargs(config)
 
     # Batch configuration
     per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
@@ -533,6 +552,7 @@ def create_sft_config(config, output_dir):
         # Learning rate
         "learning_rate": learning_rate,
         "lr_scheduler_type": lr_scheduler_type,
+        "lr_scheduler_kwargs": lr_scheduler_kwargs,
         "warmup_ratio": warmup_ratio,
         "warmup_steps": warmup_steps,
         # Batch configuration
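Downstream, the new "lr_scheduler_kwargs" entry is presumably unpacked into the training arguments. A sketch under that assumption (not part of the commit), using trl's SFTConfig, which subclasses transformers.TrainingArguments and therefore accepts lr_scheduler_kwargs directly; the output_dir and warmup_ratio values are hypothetical:

from trl import SFTConfig

sft_config = SFTConfig(
    output_dir="outputs/run",              # hypothetical path
    learning_rate=2e-4,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr": 2e-6},  # guaranteed non-empty by build_scheduler_kwargs
    warmup_ratio=0.03,                     # hypothetical value
)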