attempts to resolve training argument issue

Files changed:
- model.py (+1, -5)
- trainer.py (+16, -13)
model.py CHANGED

@@ -149,22 +149,18 @@ class SmolLM3Model:
             "fp16": self.config.fp16,
             "bf16": self.config.bf16,
             "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
-            "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
             "report_to": None,
-            "remove_unused_columns": False,
             "dataloader_pin_memory": getattr(self.config, 'dataloader_pin_memory', True),
             # Removed group_by_length as it's causing issues with newer transformers versions
             # Removed length_column_name as it might conflict with data collator
             "seed": 42,
-            "data_seed": 42,
             "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
             "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
             "optim": self.config.optimizer,
             "lr_scheduler_type": self.config.scheduler,
-            "warmup_ratio": 0.1,
             "save_strategy": "steps",
             "logging_strategy": "steps",
-
+            # Removed prediction_loss_only as it might cause issues
         }

         # Override with kwargs
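The keys removed here (ddp_find_unused_parameters, remove_unused_columns, data_seed, warmup_ratio), like the earlier group_by_length and length_column_name removals noted in the comments, are being dropped one at a time to find whichever argument the installed transformers release rejects. A less trial-and-error alternative, sketched below as a suggestion rather than code from this repo, is to filter the kwargs dict against whatever the installed TrainingArguments actually accepts; filter_training_kwargs and the commented usage line are hypothetical names.

import inspect
import logging

from transformers import TrainingArguments

logger = logging.getLogger(__name__)


def filter_training_kwargs(kwargs: dict) -> dict:
    """Drop keys that the installed TrainingArguments does not accept.

    Avoids hard-coding which arguments exist in a given transformers release:
    unsupported keys are logged and skipped instead of raising a TypeError
    when TrainingArguments(**kwargs) is constructed.
    """
    accepted = set(inspect.signature(TrainingArguments.__init__).parameters)
    filtered = {k: v for k, v in kwargs.items() if k in accepted}
    dropped = sorted(set(kwargs) - set(filtered))
    if dropped:
        logger.warning("Dropping unsupported TrainingArguments keys: %s", dropped)
    return filtered


# Hypothetical usage with the dict built above in model.py:
# training_args = TrainingArguments(**filter_training_kwargs(training_kwargs))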
trainer.py CHANGED

@@ -104,22 +104,25 @@ class SmolLM3Trainer:
         # Add monitoring callbacks
         callbacks = []

+        # Temporarily disable callbacks to debug the bool object is not callable error
         # Add simple console callback
-        callbacks.append(SimpleConsoleCallback())
-        logger.info("Added simple console monitoring callback")
+        # callbacks.append(SimpleConsoleCallback())
+        # logger.info("Added simple console monitoring callback")

         # Try to add Trackio callback if available
-        if self.monitor and self.monitor.enable_tracking:
-            try:
-                trackio_callback = self.monitor.create_monitoring_callback()
-                if trackio_callback:
-                    callbacks.append(trackio_callback)
-                    logger.info("Added Trackio monitoring callback")
-                else:
-                    logger.warning("Failed to create Trackio callback")
-            except Exception as e:
-                logger.error(f"Error creating Trackio callback: {e}")
-                logger.info("Continuing with console monitoring only")
+        # if self.monitor and self.monitor.enable_tracking:
+        #     try:
+        #         trackio_callback = self.monitor.create_monitoring_callback()
+        #         if trackio_callback:
+        #             callbacks.append(trackio_callback)
+        #             logger.info("Added Trackio monitoring callback")
+        #         else:
+        #             logger.warning("Failed to create Trackio callback")
+        #     except Exception as e:
+        #         logger.error(f"Error creating Trackio callback: {e}")
+        #         logger.info("Continuing with console monitoring only")
+
+        logger.info("Callbacks disabled for debugging")

         # Try standard Trainer first (more stable with callbacks)
         logger.info("Creating Trainer with training arguments...")
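Commenting out the callbacks is a bisection step: if the "'bool' object is not callable" error still appears with an empty callbacks list, the callbacks are exonerated; if it disappears, the fault lies in one of them or in the monitor code that builds the Trackio callback. One plausible but unconfirmed culprit, since the monitor module is not part of this diff, would be an attribute such as self.monitor.create_monitoring_callback holding a boolean flag instead of a method, so that calling it raises exactly this TypeError. For comparison with the disabled code, a minimal console callback written against the transformers TrainerCallback API would look roughly like the sketch below; the class name and log format are illustrative, not taken from the repo's SimpleConsoleCallback.

from transformers import TrainerCallback


class MinimalConsoleCallback(TrainerCallback):
    """Bare-bones console logger; an illustrative stand-in for SimpleConsoleCallback."""

    def on_train_begin(self, args, state, control, **kwargs):
        print(f"Training started ({state.max_steps} optimization steps planned)")

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` carries the metrics the Trainer reports at each logging step (loss, lr, ...)
        if logs:
            print(f"step {state.global_step}: {logs}")

Trainer accepts either instances (callbacks=[MinimalConsoleCallback()]) or bare TrainerCallback subclasses, which it instantiates itself, so the disabled callbacks.append(SimpleConsoleCallback()) call is a valid registration pattern on its own.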