Spaces:
Running
Running
add error handling for monitoring
Browse files
- model.py +1 -9
- monitoring.py +31 -13
model.py
CHANGED
|
@@ -172,15 +172,7 @@ class SmolLM3Model:
|
|
| 172 |
# Override with kwargs
|
| 173 |
training_args.update(kwargs)
|
| 174 |
|
| 175 |
-
|
| 176 |
-
logger.info(f"Training arguments keys: {list(training_args.keys())}")
|
| 177 |
-
|
| 178 |
-
try:
|
| 179 |
-
return TrainingArguments(**training_args)
|
| 180 |
-
except Exception as e:
|
| 181 |
-
logger.error(f"Failed to create TrainingArguments: {e}")
|
| 182 |
-
logger.error(f"Training arguments: {training_args}")
|
| 183 |
-
raise
|
| 184 |
|
| 185 |
def save_pretrained(self, path: str):
|
| 186 |
"""Save model and tokenizer"""
|
|
|
|
| 172 |
# Override with kwargs
|
| 173 |
training_args.update(kwargs)
|
| 174 |
|
| 175 |
+
return TrainingArguments(**training_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
def save_pretrained(self, path: str):
|
| 178 |
"""Save model and tokenizer"""
|
monitoring.py
CHANGED
|
@@ -263,34 +263,52 @@ class SmolLM3Monitor:
|
|
| 263 |
|
| 264 |
def on_init_end(self, args, state, control, **kwargs):
|
| 265 |
"""Called when training initialization is complete"""
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
def on_log(self, args, state, control, logs=None, **kwargs):
|
| 269 |
"""Called when logs are created"""
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def on_save(self, args, state, control, **kwargs):
|
| 275 |
"""Called when a checkpoint is saved"""
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
def on_evaluate(self, args, state, control, metrics=None, **kwargs):
|
| 281 |
"""Called when evaluation is performed"""
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
def on_train_begin(self, args, state, control, **kwargs):
|
| 286 |
"""Called when training begins"""
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
def on_train_end(self, args, state, control, **kwargs):
|
| 290 |
"""Called when training ends"""
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
self.monitor
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
return TrackioCallback(self)
|
| 296 |
|
|
|
|
| 263 |
|
| 264 |
def on_init_end(self, args, state, control, **kwargs):
|
| 265 |
"""Called when training initialization is complete"""
|
| 266 |
+
try:
|
| 267 |
+
logger.info("Training initialization completed")
|
| 268 |
+
except Exception as e:
|
| 269 |
+
logger.error(f"Error in on_init_end: {e}")
|
| 270 |
|
| 271 |
def on_log(self, args, state, control, logs=None, **kwargs):
|
| 272 |
"""Called when logs are created"""
|
| 273 |
+
try:
|
| 274 |
+
if logs and isinstance(logs, dict):
|
| 275 |
+
self.monitor.log_metrics(logs, state.global_step)
|
| 276 |
+
self.monitor.log_system_metrics(state.global_step)
|
| 277 |
+
except Exception as e:
|
| 278 |
+
logger.error(f"Error in on_log: {e}")
|
| 279 |
|
| 280 |
def on_save(self, args, state, control, **kwargs):
|
| 281 |
"""Called when a checkpoint is saved"""
|
| 282 |
+
try:
|
| 283 |
+
checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
|
| 284 |
+
if os.path.exists(checkpoint_path):
|
| 285 |
+
self.monitor.log_model_checkpoint(checkpoint_path, state.global_step)
|
| 286 |
+
except Exception as e:
|
| 287 |
+
logger.error(f"Error in on_save: {e}")
|
| 288 |
|
| 289 |
def on_evaluate(self, args, state, control, metrics=None, **kwargs):
|
| 290 |
"""Called when evaluation is performed"""
|
| 291 |
+
try:
|
| 292 |
+
if metrics and isinstance(metrics, dict):
|
| 293 |
+
self.monitor.log_evaluation_results(metrics, state.global_step)
|
| 294 |
+
except Exception as e:
|
| 295 |
+
logger.error(f"Error in on_evaluate: {e}")
|
| 296 |
|
| 297 |
def on_train_begin(self, args, state, control, **kwargs):
|
| 298 |
"""Called when training begins"""
|
| 299 |
+
try:
|
| 300 |
+
logger.info("Training started")
|
| 301 |
+
except Exception as e:
|
| 302 |
+
logger.error(f"Error in on_train_begin: {e}")
|
| 303 |
|
| 304 |
def on_train_end(self, args, state, control, **kwargs):
|
| 305 |
"""Called when training ends"""
|
| 306 |
+
try:
|
| 307 |
+
logger.info("Training completed")
|
| 308 |
+
if self.monitor:
|
| 309 |
+
self.monitor.close()
|
| 310 |
+
except Exception as e:
|
| 311 |
+
logger.error(f"Error in on_train_end: {e}")
|
| 312 |
|
| 313 |
return TrackioCallback(self)
|
| 314 |
|