Luigi committed on
Commit 03eb9aa · 1 Parent(s): 6d41c63

perf: Optimize LLM for HuggingFace 2 vCPU environment


Critical optimizations for game responsiveness:
- Reduce Llama n_threads from 4 to 1 (HF has only 2 vCPUs)
- Add n_batch=128 for faster response time
- Reduce default max_tokens: 512→256 (model_manager), 256→128 (NL translator), 300→200 (AI analysis)
- Reduce timeouts: 30s→15s (model_manager), 25s→15s (AI analysis), add 10s for NL translator
- Increase AI analysis interval: 30s→60s to reduce CPU load

This prevents LLM inference from freezing the game loop on limited CPU resources
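For reference, a minimal sketch of the llama-cpp-python setup these numbers produce (the surrounding SharedModelManager code is omitted and the model path is a placeholder):

from llama_cpp import Llama

# Post-commit configuration, simplified from model_manager.py.
# A single inference thread leaves the second vCPU free for the game loop.
model = Llama(
    model_path="models/qwen.gguf",  # placeholder; the real code uses str(full_path)
    n_ctx=4096,       # context window unchanged
    n_threads=1,      # one thread on the 2 vCPU HuggingFace Space
    n_batch=128,      # smaller prompt batch for quicker first tokens
    verbose=False,
    chat_format='qwen',
)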

Files changed (4)
  1. ai_analysis.py +4 -4
  2. app.py +1 -1
  3. model_manager.py +5 -4
  4. nl_translator.py +3 -2
ai_analysis.py CHANGED
@@ -435,9 +435,9 @@ class AIAnalyzer:
         self,
         prompt: Optional[str] = None,
         messages: Optional[List[Dict]] = None,
-        max_tokens: int = 300,
+        max_tokens: int = 200,  # Reduced for faster analysis
         temperature: float = 0.7,
-        timeout: float = 30.0
+        timeout: float = 15.0  # Shorter timeout to avoid blocking game
     ) -> Dict[str, Any]:
         """
         Generate LLM response (uses shared model if available, falls back to separate process).
@@ -664,9 +664,9 @@ class AIAnalyzer:
 
         result = self.generate_response(
             prompt=prompt,
-            max_tokens=300,
+            max_tokens=200,  # Reduced for faster response
             temperature=0.7,
-            timeout=25.0
+            timeout=15.0  # Shorter timeout
         )
 
         if result.get('status') != 'ok':
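Seen from a caller, only the defaults change; an illustrative sketch of invoking generate_response with them (the import path and the 'response'/'error' keys are assumptions, only 'status' is visible in this diff):

# Illustrative call using the new defaults (max_tokens=200, timeout=15.0).
from ai_analysis import get_ai_analyzer  # import path assumed

analyzer = get_ai_analyzer()
result = analyzer.generate_response(prompt="Summarize the current battle situation.")
if result.get('status') != 'ok':
    # Timeouts are now expected more often; degrade gracefully instead of waiting.
    print("AI analysis skipped:", result.get('error', 'unknown'))
else:
    print(result.get('response', ''))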
app.py CHANGED
@@ -379,7 +379,7 @@ class ConnectionManager:
         self.game_loop_task: Optional[asyncio.Task] = None
         self.ai_analyzer = get_ai_analyzer()
         self.last_ai_analysis: Dict[str, Any] = {}
-        self.ai_analysis_interval = 30.0  # Analyze every 30 seconds
+        self.ai_analysis_interval = 60.0  # Analyze every 60 seconds (reduced frequency to avoid blocking)
         self.last_ai_analysis_time = 0.0
 
         # RED ALERT: Enemy AI state
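The interval is presumably checked in the game loop before each analysis; a hypothetical, self-contained sketch of that throttle (ConnectionManager's actual loop code is not part of this diff):

import time

class AnalysisThrottle:
    """Hypothetical stand-in for the ConnectionManager fields touched above."""

    def __init__(self) -> None:
        self.ai_analysis_interval = 60.0   # analyze at most once per minute
        self.last_ai_analysis_time = 0.0

    def should_analyze(self) -> bool:
        # Only trigger a new LLM analysis once the interval has elapsed,
        # so inference requests cannot pile up behind the game loop.
        now = time.time()
        if now - self.last_ai_analysis_time >= self.ai_analysis_interval:
            self.last_ai_analysis_time = now
            return True
        return False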
model_manager.py CHANGED
@@ -78,9 +78,10 @@ class SharedModelManager:
             self.model = Llama(
                 model_path=str(full_path),
                 n_ctx=4096,
-                n_threads=4,
+                n_threads=1,  # Use only 1 thread on HuggingFace (2 vCPUs available)
+                n_batch=128,  # Smaller batch size for faster response
                 verbose=False,
-                chat_format='qwen'  # Changed from 'qwen2' to 'qwen'
+                chat_format='qwen'
             )
 
             self.model_path = model_path
@@ -162,8 +163,8 @@ class SharedModelManager:
                 print(f"Worker thread error: {e}")
                 time.sleep(0.1)
 
-    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 512,
-                 temperature: float = 0.7, timeout: float = 30.0) -> tuple[bool, Optional[str], Optional[str]]:
+    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 256,
+                 temperature: float = 0.7, timeout: float = 15.0) -> tuple[bool, Optional[str], Optional[str]]:
         """
         Generate response from model (thread-safe, queued)
 
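The docstring calls generate() thread-safe and queued; below is a minimal, self-contained sketch of that pattern under the new 15-second default. The queue wiring and names are assumptions, not the repository's exact implementation:

import queue
import threading

# Requests flow through a queue to a single worker thread; callers wait
# on a per-request result queue with a hard timeout.
_requests: queue.Queue = queue.Queue()

def _worker() -> None:
    # Stand-in for the model worker; the real code calls the Llama model here.
    while True:
        messages, max_tokens, temperature, result_q = _requests.get()
        result_q.put(f"(reply to {len(messages)} messages, max_tokens={max_tokens})")

threading.Thread(target=_worker, daemon=True).start()

def generate(messages, max_tokens=256, temperature=0.7, timeout=15.0):
    """Returns (success, response, error) in the spirit of SharedModelManager.generate."""
    result_q: queue.Queue = queue.Queue(maxsize=1)
    _requests.put((messages, max_tokens, temperature, result_q))
    try:
        # The caller waits at most `timeout` seconds, so a slow inference
        # can never stall the game loop beyond that bound.
        return True, result_q.get(timeout=timeout), None
    except queue.Empty:
        return False, None, f"LLM timed out after {timeout:.0f}s"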
nl_translator.py CHANGED
@@ -154,8 +154,9 @@ Exemple: {"tool": "move_units", "params": {"unit_ids": ["unit_1"], "target_x": 2
             # Generate response using shared model
             success, raw_response, error = self.model_manager.generate(
                 messages=messages,
-                max_tokens=256,
-                temperature=0.1
+                max_tokens=128,  # Reduced for faster response
+                temperature=0.1,
+                timeout=10.0  # Shorter timeout for game responsiveness
             )
 
             if not success or not raw_response:
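On the caller side, the tighter budget only means handling the (success, raw_response, error) tuple promptly; an illustrative sketch (the helper name and fallback behavior are made up):

from typing import Optional

def translate_command(model_manager, text: str) -> Optional[str]:
    # `model_manager` is whichever SharedModelManager instance the translator holds.
    messages = [{"role": "user", "content": text}]
    success, raw_response, error = model_manager.generate(
        messages=messages,
        max_tokens=128,
        temperature=0.1,
        timeout=10.0,
    )
    if not success or not raw_response:
        # On timeout, fail fast instead of stalling the game loop.
        print(f"Translation unavailable: {error}")
        return None
    return raw_response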