perf: Optimize LLM for HuggingFace 2 vCPU environment
Critical optimizations for game responsiveness:
- Reduce Llama n_threads from 4 to 1 (HF has only 2 vCPUs)
- Add n_batch=128 for faster response time
- Reduce default max_tokens: 512→256 (model_manager), 256→128 (NL translator), 300→200 (AI analysis)
- Reduce timeouts: 30s→15s (model_manager), 25s→15s (AI analysis), add 10s for NL translator
- Increase AI analysis interval: 30s→60s to reduce CPU load
This prevents LLM inference from freezing the game loop on the Space's limited CPU resources.
- ai_analysis.py +4 -4
- app.py +1 -1
- model_manager.py +5 -4
- nl_translator.py +3 -2
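
Taken together, the settings map onto llama-cpp-python roughly as follows; a minimal sketch for context, where the model path is a placeholder rather than the Space's actual file:

```python
from llama_cpp import Llama

# Sketch of the tuned configuration; "models/model.gguf" is a placeholder.
llm = Llama(
    model_path="models/model.gguf",
    n_ctx=4096,
    n_threads=1,    # single inference thread on the 2-vCPU Space
    n_batch=128,    # smaller prompt-eval batches -> shorter blocking calls
    verbose=False,
    chat_format="qwen",
)

reply = llm.create_chat_completion(
    messages=[{"role": "user", "content": "One-line status report, please."}],
    max_tokens=256,  # new model_manager default
    temperature=0.7,
)
print(reply["choices"][0]["message"]["content"])
```

Pinning inference to one thread rather than two trades some raw generation speed for a vCPU that stays free for the game loop and web server.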
ai_analysis.py
CHANGED

@@ -435,9 +435,9 @@ class AIAnalyzer:
         self,
         prompt: Optional[str] = None,
         messages: Optional[List[Dict]] = None,
-        max_tokens: int = 300,
+        max_tokens: int = 200,  # Reduced for faster analysis
         temperature: float = 0.7,
-        timeout: float = 25.0
+        timeout: float = 15.0  # Shorter timeout to avoid blocking game
     ) -> Dict[str, Any]:
         """
         Generate LLM response (uses shared model if available, falls back to separate process).

@@ -664,9 +664,9 @@ class AIAnalyzer:

         result = self.generate_response(
             prompt=prompt,
-            max_tokens=300,
+            max_tokens=200,  # Reduced for faster response
             temperature=0.7,
-            timeout=25.0
+            timeout=15.0  # Shorter timeout
         )

         if result.get('status') != 'ok':
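
The hunk's `result.get('status') != 'ok'` check is what makes the tighter budget safe: a timeout is handled like any other failed generation. A sketch of that calling pattern (the `analyzer` instance, the `error` field, and `handle_analysis` are assumptions for illustration):

```python
# Hedged sketch: if generation exceeds 15 s, the analysis tick is skipped
# instead of stalling the game loop.
result = analyzer.generate_response(
    prompt="Summarize the current battle state in two sentences.",
    max_tokens=200,
    temperature=0.7,
    timeout=15.0,
)
if result.get('status') != 'ok':
    print(f"AI analysis skipped: {result.get('error', 'timeout')}")  # assumed field
else:
    handle_analysis(result)  # hypothetical consumer of the result dict
```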
app.py
CHANGED

@@ -379,7 +379,7 @@ class ConnectionManager:
         self.game_loop_task: Optional[asyncio.Task] = None
         self.ai_analyzer = get_ai_analyzer()
         self.last_ai_analysis: Dict[str, Any] = {}
-        self.ai_analysis_interval = 30.0
+        self.ai_analysis_interval = 60.0  # Analyze every 60 seconds (reduced frequency to avoid blocking)
         self.last_ai_analysis_time = 0.0

         # RED ALERT: Enemy AI state
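
The diff only changes the interval value; a likely shape for the gating code that consumes `ai_analysis_interval` and `last_ai_analysis_time` inside the game loop is sketched below (the loop body and the `analyze`/`tick` names are assumptions):

```python
import time

# Hypothetical ConnectionManager game-loop excerpt: analysis fires at most
# once per ai_analysis_interval, so inference cannot stack up across ticks.
async def game_loop(self):
    while True:
        now = time.time()
        if now - self.last_ai_analysis_time >= self.ai_analysis_interval:
            self.last_ai_analysis = self.ai_analyzer.analyze(self.game_state)
            self.last_ai_analysis_time = now
        await self.tick()  # the regular frame update runs every iteration
```

Because `last_ai_analysis_time` starts at 0.0, the first pass through the loop still triggers an immediate analysis.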
model_manager.py
CHANGED

@@ -78,9 +78,10 @@ class SharedModelManager:
             self.model = Llama(
                 model_path=str(full_path),
                 n_ctx=4096,
-                n_threads=4,
+                n_threads=1,  # Use only 1 thread on HuggingFace (2 vCPUs available)
+                n_batch=128,  # Smaller batch size for faster response
                 verbose=False,
-                chat_format='qwen'
+                chat_format='qwen'
             )

             self.model_path = model_path

@@ -162,8 +163,8 @@ class SharedModelManager:
                 print(f"Worker thread error: {e}")
                 time.sleep(0.1)

-    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 512,
-                 temperature: float = 0.7, timeout: float = 30.0) -> tuple[bool, Optional[str], Optional[str]]:
+    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 256,
+                 temperature: float = 0.7, timeout: float = 15.0) -> tuple[bool, Optional[str], Optional[str]]:
         """
         Generate response from model (thread-safe, queued)
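The docstring's "thread-safe, queued" note, the worker-thread error handling visible above, and the `(success, response, error)` tuple suggest a request queue drained by a single worker; a condensed sketch of that pattern under those assumptions (every name except `generate` is invented for illustration):

```python
import queue
import threading
from typing import Dict, List, Optional, Tuple

class QueuedGenerator:
    """Sketch of a queued, thread-safe generate() with a caller-side timeout."""

    def __init__(self, model):
        self.model = model  # a loaded llama_cpp.Llama instance
        self._requests: queue.Queue = queue.Queue()
        threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self) -> None:
        # One worker thread serializes all requests, so the model is never
        # called concurrently (llama.cpp contexts are not thread-safe).
        while True:
            messages, max_tokens, temperature, reply = self._requests.get()
            try:
                out = self.model.create_chat_completion(
                    messages=messages, max_tokens=max_tokens, temperature=temperature)
                reply.put((True, out["choices"][0]["message"]["content"], None))
            except Exception as e:
                reply.put((False, None, str(e)))

    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 256,
                 temperature: float = 0.7, timeout: float = 15.0
                 ) -> Tuple[bool, Optional[str], Optional[str]]:
        reply: queue.Queue = queue.Queue()
        self._requests.put((messages, max_tokens, temperature, reply))
        try:
            # The caller waits at most `timeout` seconds; a slow generation
            # keeps running in the worker, but the game loop moves on.
            return reply.get(timeout=timeout)
        except queue.Empty:
            return (False, None, f"generation timed out after {timeout}s")
```
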
nl_translator.py
CHANGED

@@ -154,8 +154,9 @@ Exemple: {"tool": "move_units", "params": {"unit_ids": ["unit_1"], "target_x": 2
         # Generate response using shared model
         success, raw_response, error = self.model_manager.generate(
             messages=messages,
-            max_tokens=256,
-            temperature=0.1
+            max_tokens=128,  # Reduced for faster response
+            temperature=0.1,
+            timeout=10.0  # Shorter timeout for game responsiveness
         )

         if not success or not raw_response:
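
Given the strict JSON tool-call format the prompt requests (see the `Exemple` line in the hunk header), the translator presumably parses `raw_response` next; a sketch of that step under that assumption:

```python
import json

# Hedged sketch: with temperature=0.1 the reply should stay close to
# {"tool": "...", "params": {...}}; anything else is treated as a failure.
success, raw_response, error = model_manager.generate(
    messages=messages, max_tokens=128, temperature=0.1, timeout=10.0)

command = None
if success and raw_response:
    try:
        command = json.loads(raw_response.strip())
    except json.JSONDecodeError:
        error = "model returned non-JSON output"
if command is None:
    print(f"translation failed: {error}")
```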