Tonic committed
Commit 7f45871 · 1 parent: 4e59f6d

sets min_lr

config/train_gpt_oss_openhermes_fr_memory_optimized.py CHANGED
@@ -194,7 +194,7 @@ config = GPTOSSEnhancedCustomConfig(
 
     scheduler="cosine_with_min_lr",  # Stable scheduler for single epoch
     lr_scheduler_kwargs={
-        "min_lr_rate": 0.1,
+        "min_lr": 2e-6,  # Explicit absolute floor (matches min_lr above)
         "warmup_steps": None,  # Use warmup_ratio instead
     },
 
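For context, a runnable sketch of what this change does, assuming transformers >= 4.38 (which ships the "cosine_with_min_lr" scheduler); the toy model and optimizer below are illustrative stand-ins, not code from this repo:

# Sketch: effect of the config change above. Assumes transformers >= 4.38.
import torch
from transformers import get_scheduler

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

scheduler = get_scheduler(
    "cosine_with_min_lr",
    optimizer=optimizer,
    num_warmup_steps=10,
    num_training_steps=100,
    # Before this commit: {"min_lr_rate": 0.1} -> floor = 0.1 * 2e-4 = 2e-5.
    # After: an absolute floor of 2e-6, independent of the peak LR.
    scheduler_specific_kwargs={"min_lr": 2e-6},
)

for _ in range(100):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())  # ~[2e-6]: the decay bottoms out at min_lr, not 0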
scripts/training/train_gpt_oss.py CHANGED
@@ -139,6 +139,24 @@ def load_dataset_from_config(config):
 
     return dataset
 
+def build_scheduler_kwargs(config):
+    """Construct lr_scheduler_kwargs ensuring one of min_lr or min_lr_rate is set.
+    Falls back to config.min_lr or a default rate of 0.1.
+    """
+    skw = getattr(config, 'lr_scheduler_kwargs', {}) or {}
+    if not isinstance(skw, dict):
+        skw = {}
+    min_lr_cfg = getattr(config, 'min_lr', 1e-6)
+    if 'min_lr' not in skw and 'min_lr_rate' not in skw:
+        try:
+            if min_lr_cfg is not None:
+                skw['min_lr'] = float(min_lr_cfg)
+            else:
+                skw['min_lr_rate'] = 0.1
+        except Exception:
+            skw['min_lr_rate'] = 0.001
+    return skw
+
 def apply_dataset_filtering(dataset, config):
     """Apply filtering based on configuration"""
 
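Since the helper's behavior hinges on its fallback order, a quick sketch of it in isolation; SimpleNamespace is just a stand-in test double here, not the repo's config class:

# Quick check of build_scheduler_kwargs' fallback order (illustrative only).
from types import SimpleNamespace

# Explicit kwargs win: nothing is injected.
cfg = SimpleNamespace(lr_scheduler_kwargs={'min_lr_rate': 0.05}, min_lr=2e-6)
print(build_scheduler_kwargs(cfg))  # {'min_lr_rate': 0.05}

# No floor in kwargs: config.min_lr is promoted to an absolute floor.
cfg = SimpleNamespace(lr_scheduler_kwargs={}, min_lr=2e-6)
print(build_scheduler_kwargs(cfg))  # {'min_lr': 2e-06}

# min_lr explicitly None: fall back to a relative floor of 0.1.
cfg = SimpleNamespace(lr_scheduler_kwargs=None, min_lr=None)
print(build_scheduler_kwargs(cfg))  # {'min_lr_rate': 0.1}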
 
@@ -490,6 +508,7 @@ def create_sft_config(config, output_dir):
     # Learning rate configuration
     learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
+    lr_scheduler_kwargs = build_scheduler_kwargs(config)
 
     # Batch configuration
     per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
 
@@ -533,6 +552,7 @@ def create_sft_config(config, output_dir):
         # Learning rate
         "learning_rate": learning_rate,
         "lr_scheduler_type": lr_scheduler_type,
+        "lr_scheduler_kwargs": lr_scheduler_kwargs,
         "warmup_ratio": warmup_ratio,
         "warmup_steps": warmup_steps,
         # Batch configuration
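Downstream, these three values are unpacked into the trainer's configuration. A minimal sketch of the equivalent direct construction, assuming a recent transformers where TrainingArguments accepts lr_scheduler_kwargs; the output_dir and warmup_ratio values are placeholders, not taken from this repo:

# Sketch: what the wiring above amounts to once the kwargs dict is unpacked
# (the surrounding create_sft_config plumbing is elided).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",                          # placeholder
    learning_rate=2e-4,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr": 2e-6},      # as produced by build_scheduler_kwargs
    warmup_ratio=0.03,                         # placeholder
)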