adds a100 memory optimized
config/train_gpt_oss_openhermes_fr_memory_optimized.py
CHANGED
@@ -41,9 +41,9 @@ config = GPTOSSEnhancedCustomConfig(
     # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
     # ============================================================================
     # Batch configuration following memory optimization principles
-    num_train_epochs=1.0,
-    batch_size=
-    gradient_accumulation_steps=
+    num_train_epochs=1.0,               # Single epoch to reduce memory pressure
+    batch_size=2,                       # A100-safe per-device batch size
+    gradient_accumulation_steps=16,     # Maintain reasonable effective batch size

     # Learning rate optimized for single epoch + memory constraints
     learning_rate=2e-4,                 # Standard GPT-OSS learning rate

@@ -56,7 +56,7 @@ config = GPTOSSEnhancedCustomConfig(
     # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
     # ============================================================================
     model_name="openai/gpt-oss-20b",
-    max_seq_length=4096,
+    max_seq_length=4096,                # Maximize sequence length for A100 VRAM utilization
     use_flash_attention=True,           # Critical for memory efficiency
     use_gradient_checkpointing=True,    # Essential for memory optimization

@@ -92,6 +92,7 @@ config = GPTOSSEnhancedCustomConfig(
     # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
     # ============================================================================
     use_quantization=True,
+    # MXFP4 per tutorial: https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers
     quantization_config={
         "dequantize": True,             # Use native MXFP4 as per GPT-OSS specs
         "load_in_4bit": False,          # Don't use BNB 4-bit with MXFP4

@@ -106,40 +107,39 @@ config = GPTOSSEnhancedCustomConfig(
     # ============================================================================
     # Model loading with memory constraints
     model_kwargs={
-
+        # Rely on training script to set eager + bf16 for MXFP4
         "torch_dtype": "auto",          # Let model decide (MXFP4 compatible)
         "use_cache": False,             # Disable KV cache for training
         "device_map": "auto",           # Automatic device mapping
         "low_cpu_mem_usage": True,      # Critical for memory optimization
-        "max_memory": {0: "75GB"},      # Reserve memory for other processes
     },

     # Data loading optimized for throughput
     dataloader_num_workers=4,           # More workers for faster loading
     dataloader_pin_memory=True,         # Pin memory for faster host->GPU copies
-    dataloader_prefetch_factor=
+    dataloader_prefetch_factor=1,       # Lower prefetch to keep VRAM headroom

     # Memory management optimizations
-    max_memory_per_gpu=
+    max_memory_per_gpu=None,            # No explicit memory limit; use as much VRAM as available
     low_cpu_mem_usage=True,             # Essential for large models
     group_by_length=True,               # Efficient batching for memory
     remove_unused_columns=True,         # Remove unnecessary data

     # ============================================================================
-    # EVALUATION & LOGGING -
+    # EVALUATION & LOGGING - Memory Safe
     # ============================================================================
     eval_strategy="steps",
-    eval_steps=
-    logging_steps=
+    eval_steps=200,
+    logging_steps=10,

     save_strategy="steps",
-    save_steps=
+    save_steps=500,                     # Less frequent saves for memory/storage
     save_total_limit=3,                 # Keep only 2 checkpoints for memory
     save_only_model=True,               # Save only model weights

     metric_for_best_model="eval_loss",
     greater_is_better=False,
-    load_best_model_at_end=
+    load_best_model_at_end=False,       # Skip best model selection to save memory

     # Evaluation memory optimization
     eval_accumulation_steps=4,          # Accumulate eval outputs to save memory

@@ -164,7 +164,7 @@ config = GPTOSSEnhancedCustomConfig(

     # Generation config optimized for GPT-OSS harmony format (exact template compliance)
     generation_config={
-        "max_new_tokens":
+        "max_new_tokens": 1024,
         "do_sample": True,
         "temperature": 0.6,             # Slightly lower for more focused training
         "top_p": 0.9,

@@ -214,7 +214,7 @@ config = GPTOSSEnhancedCustomConfig(
 # Configuration validation and optimization tips
 print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
 print("=" * 60)
-print(f"Dataset: {config.dataset_name} (
+print(f"Dataset: {config.dataset_name} (600K samples)")
 print(f"🗣️ Language: French with GPT-OSS Harmony Format")
 print(f"Training: {config.num_train_epochs} epoch (memory optimized)")
 print(f"Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")

@@ -230,7 +230,7 @@ print(" • Native MXFP4 quantization for GPT-OSS MoE layers")
 print(" • Reduced batch size with increased gradient accumulation")
 print(" • Limited sequence length for memory efficiency")
 print(" • Reduced LoRA rank while maintaining effectiveness")
-print(" • Dataset sampling (
+print(" • Dataset sampling (600K from 800K) for faster training")
 print(" • Gradient checkpointing and efficient data loading")
 print(" • Exact GPT-OSS Harmony format with <|return|> tokens")
 print("=" * 60)
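Reviewer note: a quick sanity check of what the new batch settings imply. This is a minimal sketch using only the values visible in this diff (batch_size=2, gradient_accumulation_steps=16, max_seq_length=4096); it is not part of the config file itself.

```python
# Back-of-the-envelope check of the memory-optimized batch settings in this commit.
batch_size = 2                    # per-device micro-batch (from this diff)
gradient_accumulation_steps = 16  # micro-batches per optimizer step (from this diff)
max_seq_length = 4096             # upper bound on tokens per sample (from this diff)

effective_batch_size = batch_size * gradient_accumulation_steps        # 32
max_tokens_per_optimizer_step = effective_batch_size * max_seq_length  # 131072

print(f"Effective batch size: {effective_batch_size}")
print(f"Max tokens per optimizer step: {max_tokens_per_optimizer_step:,}")
```

So each optimizer step still sees an effective batch of 32, with at most ~131K tokens per step when sequences are padded or packed to the 4096-token limit.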
scripts/training/train_gpt_oss.py
CHANGED
@@ -28,6 +28,10 @@ config_dir = project_root / "config"
 if str(config_dir) not in sys.path:
     sys.path.insert(0, str(config_dir))

+# Reduce tokenizer thread contention and improve CUDA allocator behavior
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
 def load_gpt_oss_model_and_tokenizer(config):
     """Load GPT-OSS model and tokenizer with proper configuration"""

@@ -48,7 +52,13 @@ def load_gpt_oss_model_and_tokenizer(config):
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4"
         )
-    elif config.quantization_config and
+    elif config.quantization_config and (
+        config.quantization_config.get("dequantize")
+        or (
+            isinstance(config.quantization_config.get("mxfp4_config"), dict)
+            and config.quantization_config["mxfp4_config"].get("enabled", False)
+        )
+    ):
         # Try to use Mxfp4Config if available (as per tutorial)
         try:
             from transformers import Mxfp4Config

@@ -75,11 +85,40 @@ def load_gpt_oss_model_and_tokenizer(config):
         model_kwargs = {**default_model_kwargs, **cfg_model_kwargs}
     else:
         model_kwargs = default_model_kwargs.copy()
+
+    # Normalize torch_dtype if provided as a string in config
+    if isinstance(model_kwargs.get("torch_dtype"), str):
+        dtype_str = str(model_kwargs["torch_dtype"]).lower()
+        if dtype_str in {"bf16", "bfloat16"}:
+            model_kwargs["torch_dtype"] = torch.bfloat16
+        elif dtype_str in {"fp16", "float16", "half"}:
+            model_kwargs["torch_dtype"] = torch.float16
+        elif dtype_str == "auto":
+            # Leave as-is for HF to decide
+            pass
+        else:
+            # Fallback to bfloat16 for safer memory footprint on A100/H100
+            model_kwargs["torch_dtype"] = torch.bfloat16
+
+    # Ensure we have an offload folder for tight-memory setups
+    model_kwargs.setdefault("offload_folder", os.path.join(str(project_root), "offload"))

     # Only add quantization_config if it's not None
     if quantization_config is not None:
         model_kwargs["quantization_config"] = quantization_config

+    # If using MXFP4, follow tutorial exactly: eager attention + bf16
+    try:
+        from transformers import Mxfp4Config as _Mxfp4Config
+        if isinstance(quantization_config, _Mxfp4Config):
+            model_kwargs["attn_implementation"] = "eager"
+            model_kwargs["torch_dtype"] = torch.bfloat16
+            model_kwargs["use_cache"] = False
+            model_kwargs["device_map"] = model_kwargs.get("device_map", "auto")
+            model_kwargs["quantization_config"] = quantization_config
+    except Exception:
+        pass
+
     model = AutoModelForCausalLM.from_pretrained(config.model_name, **model_kwargs)

     return model, tokenizer
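For context on the MXFP4 branch above: this is roughly the load path the tutorial linked in the config describes (dequantized MXFP4, eager attention, bf16 compute, KV cache off). A minimal sketch, assuming a transformers release recent enough to ship `Mxfp4Config`; it is not a verbatim excerpt of this repo's training script.

```python
# Sketch of the tutorial-style GPT-OSS load path that the training script mirrors.
# Assumes a recent transformers release with gpt-oss support (Mxfp4Config available)
# and an A100/H100-class GPU; exact versions are not pinned by this diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config

model_name = "openai/gpt-oss-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantization_config = Mxfp4Config(dequantize=True)  # native MXFP4, dequantized for training
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",   # tutorial pairs MXFP4 with eager attention
    torch_dtype=torch.bfloat16,    # bf16 compute
    use_cache=False,               # KV cache off during training
    device_map="auto",
    quantization_config=quantization_config,
)
```

The `isinstance(quantization_config, Mxfp4Config)` guard in the script enforces exactly these kwargs whenever the MXFP4 path is taken, so a config-level "torch_dtype": "auto" is overridden to bf16 in that case.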