Spaces:
Running
Running
Try to resolve the issue with SFTTrainer or Trackio
Browse files — trainer.py +25 -21
trainer.py
CHANGED
|
@@ -98,40 +98,44 @@ class SmolLM3Trainer:
|
|
| 98 |
callbacks.append(SimpleConsoleCallback())
|
| 99 |
logger.info("Added simple console monitoring callback")
|
| 100 |
|
| 101 |
-
# Try to add Trackio callback if available
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
trainer =
|
| 117 |
model=self.model.model,
|
|
|
|
|
|
|
| 118 |
train_dataset=train_dataset,
|
| 119 |
eval_dataset=eval_dataset,
|
| 120 |
-
args=training_args,
|
| 121 |
data_collator=data_collator,
|
| 122 |
callbacks=callbacks,
|
| 123 |
)
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
model=self.model.model,
|
| 128 |
-
tokenizer=self.model.tokenizer,
|
| 129 |
-
args=training_args,
|
| 130 |
train_dataset=train_dataset,
|
| 131 |
eval_dataset=eval_dataset,
|
|
|
|
| 132 |
data_collator=data_collator,
|
| 133 |
callbacks=callbacks,
|
| 134 |
)
|
|
|
|
| 135 |
|
| 136 |
return trainer
|
| 137 |
|
|
|
|
| 98 |
callbacks.append(SimpleConsoleCallback())
|
| 99 |
logger.info("Added simple console monitoring callback")
|
| 100 |
|
| 101 |
+
# Try to add Trackio callback if available (temporarily disabled for debugging)
|
| 102 |
+
logger.info("Skipping Trackio callback to debug training issue")
|
| 103 |
+
# if self.monitor and self.monitor.enable_tracking:
|
| 104 |
+
# try:
|
| 105 |
+
# trackio_callback = self.monitor.create_monitoring_callback()
|
| 106 |
+
# if trackio_callback:
|
| 107 |
+
# callbacks.append(trackio_callback)
|
| 108 |
+
# logger.info("Added Trackio monitoring callback")
|
| 109 |
+
# else:
|
| 110 |
+
# logger.warning("Failed to create Trackio callback")
|
| 111 |
+
# except Exception as e:
|
| 112 |
+
# logger.error(f"Error creating Trackio callback: {e}")
|
| 113 |
+
# logger.info("Continuing with console monitoring only")
|
| 114 |
|
| 115 |
+
# Try standard Trainer first (more stable with callbacks)
|
| 116 |
+
try:
|
| 117 |
+
trainer = Trainer(
|
| 118 |
model=self.model.model,
|
| 119 |
+
tokenizer=self.model.tokenizer,
|
| 120 |
+
args=training_args,
|
| 121 |
train_dataset=train_dataset,
|
| 122 |
eval_dataset=eval_dataset,
|
|
|
|
| 123 |
data_collator=data_collator,
|
| 124 |
callbacks=callbacks,
|
| 125 |
)
|
| 126 |
+
logger.info("Using standard Hugging Face Trainer")
|
| 127 |
+
except Exception as e:
|
| 128 |
+
logger.warning(f"Standard Trainer failed: {e}")
|
| 129 |
+
# Fallback to SFTTrainer
|
| 130 |
+
trainer = SFTTrainer(
|
| 131 |
model=self.model.model,
|
|
|
|
|
|
|
| 132 |
train_dataset=train_dataset,
|
| 133 |
eval_dataset=eval_dataset,
|
| 134 |
+
args=training_args,
|
| 135 |
data_collator=data_collator,
|
| 136 |
callbacks=callbacks,
|
| 137 |
)
|
| 138 |
+
logger.info("Using SFTTrainer")
|
| 139 |
|
| 140 |
return trainer
|
| 141 |
|