Tonic committed on
Commit
93eabe8
·
verified ·
1 Parent(s): d1f29b8

add error handling for monitoring

Browse files
Files changed (2) hide show
  1. model.py +1 -9
  2. monitoring.py +31 -13
model.py CHANGED
@@ -172,15 +172,7 @@ class SmolLM3Model:
172
  # Override with kwargs
173
  training_args.update(kwargs)
174
 
175
- # Debug: Print training args before creating TrainingArguments
176
- logger.info(f"Training arguments keys: {list(training_args.keys())}")
177
-
178
- try:
179
- return TrainingArguments(**training_args)
180
- except Exception as e:
181
- logger.error(f"Failed to create TrainingArguments: {e}")
182
- logger.error(f"Training arguments: {training_args}")
183
- raise
184
 
185
  def save_pretrained(self, path: str):
186
  """Save model and tokenizer"""
 
172
  # Override with kwargs
173
  training_args.update(kwargs)
174
 
175
+ return TrainingArguments(**training_args)
 
 
 
 
 
 
 
 
176
 
177
  def save_pretrained(self, path: str):
178
  """Save model and tokenizer"""
monitoring.py CHANGED
@@ -263,34 +263,52 @@ class SmolLM3Monitor:
263
 
264
  def on_init_end(self, args, state, control, **kwargs):
265
  """Called when training initialization is complete"""
266
- logger.info("Training initialization completed")
 
 
 
267
 
268
  def on_log(self, args, state, control, logs=None, **kwargs):
269
  """Called when logs are created"""
270
- if logs:
271
- self.monitor.log_metrics(logs, state.global_step)
272
- self.monitor.log_system_metrics(state.global_step)
 
 
 
273
 
274
  def on_save(self, args, state, control, **kwargs):
275
  """Called when a checkpoint is saved"""
276
- checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
277
- if os.path.exists(checkpoint_path):
278
- self.monitor.log_model_checkpoint(checkpoint_path, state.global_step)
 
 
 
279
 
280
  def on_evaluate(self, args, state, control, metrics=None, **kwargs):
281
  """Called when evaluation is performed"""
282
- if metrics:
283
- self.monitor.log_evaluation_results(metrics, state.global_step)
 
 
 
284
 
285
  def on_train_begin(self, args, state, control, **kwargs):
286
  """Called when training begins"""
287
- logger.info("Training started")
 
 
 
288
 
289
  def on_train_end(self, args, state, control, **kwargs):
290
  """Called when training ends"""
291
- logger.info("Training completed")
292
- if self.monitor:
293
- self.monitor.close()
 
 
 
294
 
295
  return TrackioCallback(self)
296
 
 
263
 
264
  def on_init_end(self, args, state, control, **kwargs):
265
  """Called when training initialization is complete"""
266
+ try:
267
+ logger.info("Training initialization completed")
268
+ except Exception as e:
269
+ logger.error(f"Error in on_init_end: {e}")
270
 
271
  def on_log(self, args, state, control, logs=None, **kwargs):
272
  """Called when logs are created"""
273
+ try:
274
+ if logs and isinstance(logs, dict):
275
+ self.monitor.log_metrics(logs, state.global_step)
276
+ self.monitor.log_system_metrics(state.global_step)
277
+ except Exception as e:
278
+ logger.error(f"Error in on_log: {e}")
279
 
280
  def on_save(self, args, state, control, **kwargs):
281
  """Called when a checkpoint is saved"""
282
+ try:
283
+ checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
284
+ if os.path.exists(checkpoint_path):
285
+ self.monitor.log_model_checkpoint(checkpoint_path, state.global_step)
286
+ except Exception as e:
287
+ logger.error(f"Error in on_save: {e}")
288
 
289
  def on_evaluate(self, args, state, control, metrics=None, **kwargs):
290
  """Called when evaluation is performed"""
291
+ try:
292
+ if metrics and isinstance(metrics, dict):
293
+ self.monitor.log_evaluation_results(metrics, state.global_step)
294
+ except Exception as e:
295
+ logger.error(f"Error in on_evaluate: {e}")
296
 
297
  def on_train_begin(self, args, state, control, **kwargs):
298
  """Called when training begins"""
299
+ try:
300
+ logger.info("Training started")
301
+ except Exception as e:
302
+ logger.error(f"Error in on_train_begin: {e}")
303
 
304
  def on_train_end(self, args, state, control, **kwargs):
305
  """Called when training ends"""
306
+ try:
307
+ logger.info("Training completed")
308
+ if self.monitor:
309
+ self.monitor.close()
310
+ except Exception as e:
311
+ logger.error(f"Error in on_train_end: {e}")
312
 
313
  return TrackioCallback(self)
314