hot-fix: memory
agents/ensemble_team.py
CHANGED (+16 -2)
@@ -4,6 +4,7 @@ import torch
 import psutil  # Ensure psutil is imported here as well
 import GPUtil
 from datetime import datetime, timedelta
+import gc  # Import garbage collector

 logger = logging.getLogger(__name__)

@@ -40,7 +41,7 @@ class EnsembleMonitorAgent:
             self.alerts.append(alert_msg)
             logger.warning(alert_msg)

-        logger.
+        logger.info(f"Updated metrics for '{model_id}': {metrics}")

     def get_performance_summary(self):
         logger.info("Generating performance summary for all models.")
@@ -74,7 +75,7 @@ class WeightOptimizationAgent:

         # Keep history windowed
         self.prediction_history = [p for p in self.prediction_history if timestamp - p["timestamp"] < self.performance_window]
-        logger.
+        logger.info(f"Prediction history length: {len(self.prediction_history)}")

         # In a real scenario, this would involve a more complex optimization logic
         # For now, it just logs the history length.
@@ -98,6 +99,19 @@ class SystemHealthAgent:
             "percent": mem.percent
         }

+        # Holy moly, been at 99% for hours whoops
+        if mem.percent > 90:
+            logger.warning(f"CRITICAL: System memory usage is at {mem.percent}%. Attempting to clear memory cache...")
+            gc.collect()
+            logger.info("Garbage collection triggered. Re-checking memory usage...")
+            mem_after_gc = psutil.virtual_memory()
+            self.health_metrics["memory_usage_after_gc"] = {
+                "total": mem_after_gc.total,
+                "available": mem_after_gc.available,
+                "percent": mem_after_gc.percent
+            }
+            logger.info(f"Memory usage after GC: {mem_after_gc.percent}%")
+
         gpu_info = []
         try:
             gpus = GPUtil.getGPUs()
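
For context, a minimal standalone sketch of the memory guard this hot-fix adds to SystemHealthAgent. The helper name check_system_memory and the threshold parameter are illustrative only; the Space performs the check inline rather than in a separate helper.

import gc
import logging

import psutil

logger = logging.getLogger(__name__)


def check_system_memory(threshold: float = 90.0) -> dict:
    """Sample RAM usage and force a garbage-collection pass when it is critical.

    Illustrative sketch of the hot-fix; not the Space's actual API.
    """
    mem = psutil.virtual_memory()
    metrics = {"total": mem.total, "available": mem.available, "percent": mem.percent}

    if mem.percent > threshold:
        logger.warning("System memory usage is at %.1f%%, forcing garbage collection", mem.percent)
        # gc.collect() only frees objects Python can no longer reach; it cannot
        # release memory held by live tensors or external allocators.
        gc.collect()
        mem_after = psutil.virtual_memory()
        metrics["percent_after_gc"] = mem_after.percent
        logger.info("Memory usage after GC: %.1f%%", mem_after.percent)

    return metrics

If usage stays high after collection, the remaining memory is most likely held by objects that are still referenced (cached predictions, loaded models), which garbage collection cannot reclaim.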
|