Spaces:
Running
Running
coerce all numeric config values to safe values
Browse files
scripts/trackio_tonic/deploy_trackio_space.py
CHANGED
|
@@ -411,8 +411,8 @@ class TrackioSpaceDeployer:
|
|
| 411 |
|
| 412 |
# Wait a bit for the space to build
|
| 413 |
import time
|
| 414 |
-
print("Waiting
|
| 415 |
-
time.sleep(
|
| 416 |
|
| 417 |
# Try to access the space
|
| 418 |
response = requests.get(self.space_url, timeout=30)
|
|
|
|
| 411 |
|
| 412 |
# Wait a bit for the space to build
|
| 413 |
import time
|
| 414 |
+
print("Waiting 120 seconds for Space to build...")
|
| 415 |
+
time.sleep(120)
|
| 416 |
|
| 417 |
# Try to access the space
|
| 418 |
response = requests.get(self.space_url, timeout=30)
|
scripts/training/train_gpt_oss.py
CHANGED
|
@@ -345,38 +345,60 @@ def create_sft_config(config, output_dir):
|
|
| 345 |
|
| 346 |
print("Creating enhanced SFT configuration...")
|
| 347 |
|
| 348 |
-
#
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
|
| 354 |
# Learning rate configuration
|
| 355 |
-
learning_rate = config
|
| 356 |
lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
|
| 357 |
|
| 358 |
# Batch configuration
|
| 359 |
-
per_device_train_batch_size = config
|
| 360 |
-
per_device_eval_batch_size = getattr(config, 'eval_batch_size',
|
| 361 |
-
gradient_accumulation_steps = config
|
| 362 |
|
| 363 |
# Evaluation and logging
|
| 364 |
eval_strategy = getattr(config, 'eval_strategy', 'steps')
|
| 365 |
-
eval_steps = getattr(config, 'eval_steps', 100)
|
| 366 |
-
|
|
|
|
| 367 |
|
| 368 |
# Saving configuration
|
| 369 |
save_strategy = getattr(config, 'save_strategy', 'steps')
|
| 370 |
-
save_steps = getattr(config, 'save_steps', 500)
|
| 371 |
-
save_total_limit = getattr(config, 'save_total_limit', 3)
|
| 372 |
|
| 373 |
# Mixed precision
|
| 374 |
-
fp16 = getattr(config, 'fp16', False)
|
| 375 |
-
bf16 = getattr(config, 'bf16', True)
|
|
|
|
| 376 |
|
| 377 |
# Regularization
|
| 378 |
-
weight_decay = getattr(config, 'weight_decay', 0.01)
|
| 379 |
-
max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
|
| 380 |
|
| 381 |
# HuggingFace Hub integration
|
| 382 |
push_to_hub = getattr(config, 'push_to_hub', False)
|
|
@@ -406,12 +428,15 @@ def create_sft_config(config, output_dir):
|
|
| 406 |
# Mixed precision
|
| 407 |
"fp16": fp16,
|
| 408 |
"bf16": bf16,
|
|
|
|
|
|
|
| 409 |
# Regularization
|
| 410 |
"weight_decay": weight_decay,
|
| 411 |
"max_grad_norm": max_grad_norm,
|
| 412 |
# Evaluation (name may vary across versions)
|
| 413 |
"evaluation_strategy": eval_strategy,
|
| 414 |
"eval_steps": eval_steps,
|
|
|
|
| 415 |
# Logging
|
| 416 |
"logging_steps": logging_steps,
|
| 417 |
# Saving
|
|
@@ -421,8 +446,10 @@ def create_sft_config(config, output_dir):
|
|
| 421 |
# Output
|
| 422 |
"output_dir": output_dir,
|
| 423 |
# Data loading
|
| 424 |
-
"dataloader_num_workers": getattr(config, 'dataloader_num_workers', 4),
|
| 425 |
"dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
|
|
|
|
|
|
|
| 426 |
# Performance
|
| 427 |
"group_by_length": getattr(config, 'group_by_length', True),
|
| 428 |
"remove_unused_columns": getattr(config, 'remove_unused_columns', True),
|
|
@@ -432,6 +459,9 @@ def create_sft_config(config, output_dir):
|
|
| 432 |
"report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
|
| 433 |
}
|
| 434 |
|
|
|
|
|
|
|
|
|
|
| 435 |
# Adapt to transformers versions where 'evaluation_strategy' was renamed
|
| 436 |
try:
|
| 437 |
ta_sig = inspect.signature(TrainingArguments.__init__)
|
|
|
|
| 345 |
|
| 346 |
print("Creating enhanced SFT configuration...")
|
| 347 |
|
| 348 |
+
# Helper coercion utilities to guarantee numeric types
|
| 349 |
+
def _as_int(value, default):
|
| 350 |
+
if value is None:
|
| 351 |
+
return int(default)
|
| 352 |
+
try:
|
| 353 |
+
return int(value)
|
| 354 |
+
except Exception:
|
| 355 |
+
return int(default)
|
| 356 |
+
|
| 357 |
+
def _as_float(value, default):
|
| 358 |
+
if value is None:
|
| 359 |
+
return float(default)
|
| 360 |
+
try:
|
| 361 |
+
return float(value)
|
| 362 |
+
except Exception:
|
| 363 |
+
return float(default)
|
| 364 |
+
|
| 365 |
+
# Extract training parameters from config with enhanced defaults and coercion
|
| 366 |
+
num_train_epochs = _as_float(getattr(config, 'num_train_epochs', 1.0), 1.0)
|
| 367 |
+
# Transformers expects max_steps default -1 (disabled). Some code compares > 0
|
| 368 |
+
raw_max_steps = getattr(config, 'max_steps', None)
|
| 369 |
+
max_steps = _as_int(raw_max_steps if raw_max_steps is not None else -1, -1)
|
| 370 |
+
warmup_ratio = _as_float(getattr(config, 'warmup_ratio', 0.03), 0.03)
|
| 371 |
+
# Ensure warmup_steps is an int; default 0 to avoid None comparisons in schedulers
|
| 372 |
+
warmup_steps = _as_int(getattr(config, 'warmup_steps', 0), 0)
|
| 373 |
|
| 374 |
# Learning rate configuration
|
| 375 |
+
learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
|
| 376 |
lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
|
| 377 |
|
| 378 |
# Batch configuration
|
| 379 |
+
per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
|
| 380 |
+
per_device_eval_batch_size = _as_int(getattr(config, 'eval_batch_size', per_device_train_batch_size), per_device_train_batch_size)
|
| 381 |
+
gradient_accumulation_steps = _as_int(getattr(config, 'gradient_accumulation_steps', 1), 1)
|
| 382 |
|
| 383 |
# Evaluation and logging
|
| 384 |
eval_strategy = getattr(config, 'eval_strategy', 'steps')
|
| 385 |
+
eval_steps = _as_int(getattr(config, 'eval_steps', 100), 100)
|
| 386 |
+
eval_accumulation_steps = _as_int(getattr(config, 'eval_accumulation_steps', 1), 1)
|
| 387 |
+
logging_steps = _as_int(getattr(config, 'logging_steps', 10), 10)
|
| 388 |
|
| 389 |
# Saving configuration
|
| 390 |
save_strategy = getattr(config, 'save_strategy', 'steps')
|
| 391 |
+
save_steps = _as_int(getattr(config, 'save_steps', 500), 500)
|
| 392 |
+
save_total_limit = _as_int(getattr(config, 'save_total_limit', 3), 3)
|
| 393 |
|
| 394 |
# Mixed precision
|
| 395 |
+
fp16 = bool(getattr(config, 'fp16', False))
|
| 396 |
+
bf16 = bool(getattr(config, 'bf16', True))
|
| 397 |
+
tf32 = bool(getattr(config, 'tf32', False))
|
| 398 |
|
| 399 |
# Regularization
|
| 400 |
+
weight_decay = _as_float(getattr(config, 'weight_decay', 0.01), 0.01)
|
| 401 |
+
max_grad_norm = _as_float(getattr(config, 'max_grad_norm', 1.0), 1.0)
|
| 402 |
|
| 403 |
# HuggingFace Hub integration
|
| 404 |
push_to_hub = getattr(config, 'push_to_hub', False)
|
|
|
|
| 428 |
# Mixed precision
|
| 429 |
"fp16": fp16,
|
| 430 |
"bf16": bf16,
|
| 431 |
+
# Some versions support tf32
|
| 432 |
+
"tf32": tf32 if 'tf32' in TrainingArguments.__init__.__code__.co_varnames else None,
|
| 433 |
# Regularization
|
| 434 |
"weight_decay": weight_decay,
|
| 435 |
"max_grad_norm": max_grad_norm,
|
| 436 |
# Evaluation (name may vary across versions)
|
| 437 |
"evaluation_strategy": eval_strategy,
|
| 438 |
"eval_steps": eval_steps,
|
| 439 |
+
"eval_accumulation_steps": eval_accumulation_steps,
|
| 440 |
# Logging
|
| 441 |
"logging_steps": logging_steps,
|
| 442 |
# Saving
|
|
|
|
| 446 |
# Output
|
| 447 |
"output_dir": output_dir,
|
| 448 |
# Data loading
|
| 449 |
+
"dataloader_num_workers": _as_int(getattr(config, 'dataloader_num_workers', 4), 4),
|
| 450 |
"dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
|
| 451 |
+
# Optional in some versions
|
| 452 |
+
"dataloader_prefetch_factor": _as_int(getattr(config, 'dataloader_prefetch_factor', 2), 2),
|
| 453 |
# Performance
|
| 454 |
"group_by_length": getattr(config, 'group_by_length', True),
|
| 455 |
"remove_unused_columns": getattr(config, 'remove_unused_columns', True),
|
|
|
|
| 459 |
"report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
|
| 460 |
}
|
| 461 |
|
| 462 |
+
# Drop any None-valued kwargs
|
| 463 |
+
ta_kwargs = {k: v for k, v in ta_kwargs.items() if v is not None}
|
| 464 |
+
|
| 465 |
# Adapt to transformers versions where 'evaluation_strategy' was renamed
|
| 466 |
try:
|
| 467 |
ta_sig = inspect.signature(TrainingArguments.__init__)
|