Spaces:
Running
Running
Attempt to identify whether the bug is in the callbacks or in the trainer
Browse files
- trainer.py +20 -15
trainer.py
CHANGED
|
@@ -94,22 +94,26 @@ class SmolLM3Trainer:
|
|
| 94 |
eval_loss = metrics.get('eval_loss', 'N/A')
|
| 95 |
print(f"📊 Evaluation at step {step}: eval_loss={eval_loss}")
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
callbacks
|
| 99 |
-
logger.info("
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
# Try standard Trainer first (more stable with callbacks)
|
| 115 |
try:
|
|
@@ -181,6 +185,7 @@ class SmolLM3Trainer:
|
|
| 181 |
|
| 182 |
# Start training
|
| 183 |
try:
|
|
|
|
| 184 |
train_result = self.trainer.train()
|
| 185 |
|
| 186 |
# Save the final model
|
|
|
|
| 94 |
eval_loss = metrics.get('eval_loss', 'N/A')
|
| 95 |
print(f"📊 Evaluation at step {step}: eval_loss={eval_loss}")
|
| 96 |
|
| 97 |
+
# Temporarily disable callbacks to debug the issue
|
| 98 |
+
callbacks = []
|
| 99 |
+
logger.info("Callbacks disabled for debugging")
|
| 100 |
|
| 101 |
+
# # Add simple console callback
|
| 102 |
+
# callbacks.append(SimpleConsoleCallback())
|
| 103 |
+
# logger.info("Added simple console monitoring callback")
|
| 104 |
+
#
|
| 105 |
+
# # Try to add Trackio callback if available
|
| 106 |
+
# if self.monitor and self.monitor.enable_tracking:
|
| 107 |
+
# try:
|
| 108 |
+
# trackio_callback = self.monitor.create_monitoring_callback()
|
| 109 |
+
# if trackio_callback:
|
| 110 |
+
# callbacks.append(trackio_callback)
|
| 111 |
+
# logger.info("Added Trackio monitoring callback")
|
| 112 |
+
# else:
|
| 113 |
+
# logger.warning("Failed to create Trackio callback")
|
| 114 |
+
# except Exception as e:
|
| 115 |
+
# logger.error(f"Error creating Trackio callback: {e}")
|
| 116 |
+
# logger.info("Continuing with console monitoring only")
|
| 117 |
|
| 118 |
# Try standard Trainer first (more stable with callbacks)
|
| 119 |
try:
|
|
|
|
| 185 |
|
| 186 |
# Start training
|
| 187 |
try:
|
| 188 |
+
logger.info("About to start trainer.train()")
|
| 189 |
train_result = self.trainer.train()
|
| 190 |
|
| 191 |
# Save the final model
|