Luigi committed on
Commit 03eb9aa · 1 Parent(s): 6d41c63

perf: Optimize LLM for HuggingFace 2 vCPU environment


Critical optimizations for game responsiveness:
- Reduce Llama n_threads from 4 to 1 (HF has only 2 vCPUs)
- Add n_batch=128 for faster response time
- Reduce default max_tokens: 512→256 (model_manager), 256→128 (NL translator), 300→200 (AI analysis)
- Reduce timeouts: 30s→15s (model_manager), 25s→15s (AI analysis), add 10s for NL translator
- Increase AI analysis interval: 30s→60s to reduce CPU load

This prevents LLM inference from freezing the game loop on limited CPU resources
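For reference, a minimal sketch of the llama-cpp-python setup these numbers produce (the surrounding SharedModelManager code is omitted and the model path is a placeholder):

from llama_cpp import Llama

# Post-commit configuration, simplified from model_manager.py.
# A single inference thread leaves the second vCPU free for the game loop.
model = Llama(
    model_path="models/qwen.gguf",  # placeholder; the real code uses str(full_path)
    n_ctx=4096,       # context window unchanged
    n_threads=1,      # one thread on the 2 vCPU HuggingFace Space
    n_batch=128,      # smaller prompt batch for quicker first tokens
    verbose=False,
    chat_format='qwen',
)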

Files changed (4)
  1. ai_analysis.py +4 -4
  2. app.py +1 -1
  3. model_manager.py +5 -4
  4. nl_translator.py +3 -2
ai_analysis.py CHANGED
@@ -435,9 +435,9 @@ class AIAnalyzer:
         self,
         prompt: Optional[str] = None,
         messages: Optional[List[Dict]] = None,
-        max_tokens: int = 300,
+        max_tokens: int = 200,  # Reduced for faster analysis
         temperature: float = 0.7,
-        timeout: float = 30.0
+        timeout: float = 15.0  # Shorter timeout to avoid blocking game
     ) -> Dict[str, Any]:
         """
         Generate LLM response (uses shared model if available, falls back to separate process).
@@ -664,9 +664,9 @@ class AIAnalyzer:
 
         result = self.generate_response(
             prompt=prompt,
-            max_tokens=300,
+            max_tokens=200,  # Reduced for faster response
             temperature=0.7,
-            timeout=25.0
+            timeout=15.0  # Shorter timeout
         )
 
         if result.get('status') != 'ok':
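Seen from a caller, only the defaults change; an illustrative sketch of invoking generate_response with them (the import path and the 'response'/'error' keys are assumptions, only 'status' is visible in this diff):

# Illustrative call using the new defaults (max_tokens=200, timeout=15.0).
from ai_analysis import get_ai_analyzer  # import path assumed

analyzer = get_ai_analyzer()
result = analyzer.generate_response(prompt="Summarize the current battle situation.")
if result.get('status') != 'ok':
    # Timeouts are now expected more often; degrade gracefully instead of waiting.
    print("AI analysis skipped:", result.get('error', 'unknown'))
else:
    print(result.get('response', ''))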
app.py CHANGED
@@ -379,7 +379,7 @@ class ConnectionManager:
         self.game_loop_task: Optional[asyncio.Task] = None
         self.ai_analyzer = get_ai_analyzer()
         self.last_ai_analysis: Dict[str, Any] = {}
-        self.ai_analysis_interval = 30.0  # Analyze every 30 seconds
+        self.ai_analysis_interval = 60.0  # Analyze every 60 seconds (reduced frequency to avoid blocking)
         self.last_ai_analysis_time = 0.0
 
         # RED ALERT: Enemy AI state
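The interval is presumably checked in the game loop before each analysis; a hypothetical, self-contained sketch of that throttle (ConnectionManager's actual loop code is not part of this diff):

import time

class AnalysisThrottle:
    """Hypothetical stand-in for the ConnectionManager fields touched above."""

    def __init__(self) -> None:
        self.ai_analysis_interval = 60.0   # analyze at most once per minute
        self.last_ai_analysis_time = 0.0

    def should_analyze(self) -> bool:
        # Only trigger a new LLM analysis once the interval has elapsed,
        # so inference requests cannot pile up behind the game loop.
        now = time.time()
        if now - self.last_ai_analysis_time >= self.ai_analysis_interval:
            self.last_ai_analysis_time = now
            return True
        return False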
model_manager.py CHANGED
@@ -78,9 +78,10 @@ class SharedModelManager:
             self.model = Llama(
                 model_path=str(full_path),
                 n_ctx=4096,
-                n_threads=4,
+                n_threads=1,  # Use only 1 thread on HuggingFace (2 vCPUs available)
+                n_batch=128,  # Smaller batch size for faster response
                 verbose=False,
-                chat_format='qwen'  # Changed from 'qwen2' to 'qwen'
+                chat_format='qwen'
             )
 
             self.model_path = model_path
@@ -162,8 +163,8 @@ class SharedModelManager:
                 print(f"Worker thread error: {e}")
                 time.sleep(0.1)
 
-    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 512,
-                 temperature: float = 0.7, timeout: float = 30.0) -> tuple[bool, Optional[str], Optional[str]]:
+    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 256,
+                 temperature: float = 0.7, timeout: float = 15.0) -> tuple[bool, Optional[str], Optional[str]]:
         """
         Generate response from model (thread-safe, queued)
 
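The docstring calls generate() thread-safe and queued; below is a minimal, self-contained sketch of that pattern under the new 15-second default. The queue wiring and names are assumptions, not the repository's exact implementation:

import queue
import threading

# Requests flow through a queue to a single worker thread; callers wait
# on a per-request result queue with a hard timeout.
_requests: queue.Queue = queue.Queue()

def _worker() -> None:
    # Stand-in for the model worker; the real code calls the Llama model here.
    while True:
        messages, max_tokens, temperature, result_q = _requests.get()
        result_q.put(f"(reply to {len(messages)} messages, max_tokens={max_tokens})")

threading.Thread(target=_worker, daemon=True).start()

def generate(messages, max_tokens=256, temperature=0.7, timeout=15.0):
    """Returns (success, response, error) in the spirit of SharedModelManager.generate."""
    result_q: queue.Queue = queue.Queue(maxsize=1)
    _requests.put((messages, max_tokens, temperature, result_q))
    try:
        # The caller waits at most `timeout` seconds, so a slow inference
        # can never stall the game loop beyond that bound.
        return True, result_q.get(timeout=timeout), None
    except queue.Empty:
        return False, None, f"LLM timed out after {timeout:.0f}s"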
nl_translator.py CHANGED
@@ -154,8 +154,9 @@ Exemple: {"tool": "move_units", "params": {"unit_ids": ["unit_1"], "target_x": 2
             # Generate response using shared model
             success, raw_response, error = self.model_manager.generate(
                 messages=messages,
-                max_tokens=256,
-                temperature=0.1
+                max_tokens=128,  # Reduced for faster response
+                temperature=0.1,
+                timeout=10.0  # Shorter timeout for game responsiveness
             )
 
             if not success or not raw_response:
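On the caller side, the tighter budget only means handling the (success, raw_response, error) tuple promptly; an illustrative sketch (the helper name and fallback behavior are made up):

from typing import Optional

def translate_command(model_manager, text: str) -> Optional[str]:
    # `model_manager` is whichever SharedModelManager instance the translator holds.
    messages = [{"role": "user", "content": text}]
    success, raw_response, error = model_manager.generate(
        messages=messages,
        max_tokens=128,
        temperature=0.1,
        timeout=10.0,
    )
    if not success or not raw_response:
        # On timeout, fail fast instead of stalling the game loop.
        print(f"Translation unavailable: {error}")
        return None
    return raw_response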