rdune71 committed
Commit 084503a · 1 Parent(s): c1cbefd

Implement HF endpoint monitoring and integration with wake-up functionality

src/llm/factory.py CHANGED
@@ -1,9 +1,10 @@
 import logging
-from typing import Optional, List
+from typing import Optional
 from src.llm.base_provider import LLMProvider
 from src.llm.hf_provider import HuggingFaceProvider
 from src.llm.ollama_provider import OllamaProvider
 from utils.config import config
+from src.services.hf_monitor import hf_monitor
 
 logger = logging.getLogger(__name__)
 
@@ -24,87 +25,54 @@ class LLMFactory:
     def get_provider(self, preferred_provider: Optional[str] = None) -> LLMProvider:
         """
         Get an LLM provider based on preference and availability.
-
-        Args:
-            preferred_provider: Preferred provider name ('huggingface', 'ollama')
-
-        Returns:
-            LLMProvider instance
-
-        Raises:
-            ProviderNotAvailableError: When no providers are available
         """
-        # Build provider chain based on configuration and preference
-        provider_chain = self._build_provider_chain(preferred_provider)
+        # Try preferred provider first
+        if preferred_provider:
+            provider = self._try_provider(preferred_provider)
+            if provider:
+                return provider
+
+        # Try HF provider if configured
+        if config.hf_token:
+            provider = self._try_provider("huggingface")
+            if provider:
+                return provider
 
-        # Try providers in order
-        for provider_name, provider_class, model_name in provider_chain:
-            try:
-                logger.info(f"Attempting to initialize {provider_name} provider...")
-                provider = provider_class(model_name=model_name)
-                # Test that provider is working
-                if self._test_provider(provider):
-                    logger.info(f"Successfully initialized {provider_name} provider")
-                    return provider
-                else:
-                    logger.warning(f"{provider_name} provider failed validation test")
-            except Exception as e:
-                logger.warning(f"Failed to initialize {provider_name} provider: {e}")
-                continue
+        # Try Ollama as fallback
+        if config.ollama_host:
+            provider = self._try_provider("ollama")
+            if provider:
+                return provider
 
         raise ProviderNotAvailableError("No LLM providers are available or configured")
 
-    def _build_provider_chain(self, preferred_provider: Optional[str]) -> List[tuple]:
-        """Build provider chain based on preference and configuration"""
-        chain = []
-
-        # Add preferred provider first if specified
-        if preferred_provider:
-            provider_info = self._get_provider_info(preferred_provider)
-            if provider_info:
-                chain.append(provider_info)
-
-        # Add fallback providers based on configuration
-        if config.use_fallback:
-            # Add HF if configured
-            if config.hf_token:
-                chain.append((
-                    "huggingface",
-                    HuggingFaceProvider,
-                    "DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf"
-                ))
-
-            # Add Ollama if configured
-            if config.ollama_host:
-                chain.append((
-                    "ollama",
-                    OllamaProvider,
-                    config.local_model_name
-                ))
-
-        return chain
-
-    def _get_provider_info(self, provider_name: str) -> Optional[tuple]:
-        """Get provider class and model info"""
-        provider_map = {
-            "huggingface": (
-                "huggingface",
-                HuggingFaceProvider,
-                "DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf"
-            ),
-            "ollama": (
-                "ollama",
-                OllamaProvider,
-                config.local_model_name
-            )
-        }
-        return provider_map.get(provider_name)
-
-    def _test_provider(self, provider: LLMProvider) -> bool:
-        """Test if provider is working (stub implementation)"""
-        # In a real implementation, you might want to do a lightweight test
-        # For now, we'll assume initialization success means it's working
-        return True
+    def _try_provider(self, provider_name: str) -> Optional[LLMProvider]:
+        """Try to initialize a specific provider"""
+        try:
+            if provider_name == "huggingface" and config.hf_token:
+                # Check if HF endpoint is available
+                status = hf_monitor.get_endpoint_status()
+                if status["available"] or status["initializing"]:
+                    return HuggingFaceProvider(
+                        model_name="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf"
+                    )
+                elif status["status"] == "scaled_to_zero":
+                    # Attempt to wake up the endpoint
+                    logger.info("Attempting to wake up HF endpoint...")
+                    if hf_monitor.attempt_wake_up():
+                        return HuggingFaceProvider(
+                            model_name="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf"
+                        )
+
+            elif provider_name == "ollama" and config.ollama_host:
+                return OllamaProvider(
+                    model_name=config.local_model_name
+                )
+
+        except Exception as e:
+            logger.warning(f"Failed to initialize {provider_name} provider: {e}")
 
+        return None
 
 # Global factory instance
 llm_factory = LLMFactory()
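
Usage note: a minimal, hypothetical sketch of how the reworked factory could be called from application code. It assumes ProviderNotAvailableError is importable from src.llm.factory alongside the global llm_factory instance; the prompt and history values are illustrative only and are not part of this commit.

# Hypothetical usage sketch, not part of this commit.
from src.llm.factory import llm_factory, ProviderNotAvailableError  # assumed import path

try:
    # Prefer the HF endpoint; the factory falls back to Ollama if HF is unavailable.
    provider = llm_factory.get_provider(preferred_provider="huggingface")
    history = [{"role": "user", "content": "Hello"}]
    reply = provider.generate("Hello", conversation_history=history)
    print(reply)
except ProviderNotAvailableError:
    print("No LLM providers are available or configured")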
src/llm/hf_provider.py CHANGED
@@ -1,8 +1,9 @@
 import time
 import logging
 from typing import List, Dict, Optional, Union
-from src.llm.base_provider import LLMProvider
+from src.llm.enhanced_provider import EnhancedLLMProvider
 from utils.config import config
+from src.services.context_enrichment import context_service
 
 logger = logging.getLogger(__name__)
 
@@ -13,10 +14,10 @@ except ImportError:
     HF_SDK_AVAILABLE = False
     OpenAI = None
 
-class HuggingFaceProvider(LLMProvider):
+class HuggingFaceProvider(EnhancedLLMProvider):
     """Hugging Face LLM provider for your custom endpoint"""
 
-    def __init__(self, model_name: str, timeout: int = 60, max_retries: int = 3):
+    def __init__(self, model_name: str, timeout: int = 120, max_retries: int = 2):
         super().__init__(model_name, timeout, max_retries)
 
         if not HF_SDK_AVAILABLE:
@@ -34,24 +35,20 @@ class HuggingFaceProvider(LLMProvider):
 
     def generate(self, prompt: str, conversation_history: List[Dict]) -> Optional[str]:
         """Generate a response synchronously"""
-        return self._retry_with_backoff(self._generate_impl, prompt, conversation_history)
-
-    def stream_generate(self, prompt: str, conversation_history: List[Dict]) -> Optional[Union[str, List[str]]]:
-        """Generate a response with streaming support"""
-        return self._retry_with_backoff(self._stream_generate_impl, prompt, conversation_history)
-
-    def _generate_impl(self, prompt: str, conversation_history: List[Dict]) -> str:
-        """Implementation of synchronous generation"""
         try:
+            # Enrich context
+            enriched_history = self._enrich_context(conversation_history)
+
             response = self.client.chat.completions.create(
                 model=self.model_name,
-                messages=conversation_history,
+                messages=enriched_history,
                 max_tokens=8192,
                 temperature=0.7,
                 stream=False
             )
             return response.choices[0].message.content
         except Exception as e:
+            logger.error(f"HF generation failed: {e}")
             # Handle scale-to-zero behavior
             if self._is_scale_to_zero_error(e):
                 logger.info("HF endpoint is scaling up, waiting...")
@@ -67,12 +64,15 @@ class HuggingFaceProvider(LLMProvider):
                 return response.choices[0].message.content
             raise
 
-    def _stream_generate_impl(self, prompt: str, conversation_history: List[Dict]) -> List[str]:
-        """Implementation of streaming generation"""
+    def stream_generate(self, prompt: str, conversation_history: List[Dict]) -> Optional[Union[str, List[str]]]:
+        """Generate a response with streaming support"""
         try:
+            # Enrich context
+            enriched_history = self._enrich_context(conversation_history)
+
             response = self.client.chat.completions.create(
                 model=self.model_name,
-                messages=conversation_history,
+                messages=enriched_history,
                 max_tokens=8192,
                 temperature=0.7,
                 stream=True
@@ -85,6 +85,7 @@ class HuggingFaceProvider(LLMProvider):
                     chunks.append(content)
             return chunks
         except Exception as e:
+            logger.error(f"HF stream generation failed: {e}")
             # Handle scale-to-zero behavior
             if self._is_scale_to_zero_error(e):
                 logger.info("HF endpoint is scaling up, waiting...")
@@ -106,6 +107,28 @@ class HuggingFaceProvider(LLMProvider):
                 return chunks
             raise
 
+    def _enrich_context(self, conversation_history: List[Dict]) -> List[Dict]:
+        """Add current context to conversation"""
+        # Get the last user message to determine context needs
+        last_user_message = ""
+        for msg in reversed(conversation_history):
+            if msg["role"] == "user":
+                last_user_message = msg["content"]
+                break
+
+        # Get current context
+        context = context_service.get_current_context(last_user_message)
+
+        # Add context as system message at the beginning
+        context_message = {
+            "role": "system",
+            "content": f"[Current Context: {context['current_time']} | Weather: {context['weather']}]"
+        }
+
+        # Insert context at the beginning
+        enriched_history = [context_message] + conversation_history
+        return enriched_history
+
     def _is_scale_to_zero_error(self, error: Exception) -> bool:
         """Check if the error is related to scale-to-zero initialization"""
         error_str = str(error).lower()
@@ -113,6 +136,7 @@ class HuggingFaceProvider(LLMProvider):
             "503",
             "service unavailable",
             "initializing",
-            "cold start"
+            "cold start",
+            "timeout"
         ]
         return any(indicator in error_str for indicator in scale_to_zero_indicators)
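
For clarity, a small sketch of the message list that _enrich_context produces: a synthetic system message is prepended to the caller's history. The context dict shape ('current_time' and 'weather' keys) is inferred from how the new code indexes the result of context_service.get_current_context(); the literal values below are made up.

# Illustrative only; mirrors the enrichment performed by _enrich_context.
conversation_history = [{"role": "user", "content": "Do I need an umbrella today?"}]
context = {"current_time": "2025-01-01 09:00", "weather": "light rain"}  # assumed shape
context_message = {
    "role": "system",
    "content": f"[Current Context: {context['current_time']} | Weather: {context['weather']}]",
}
enriched_history = [context_message] + conversation_history
# -> the system context message first, then the original user turn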
src/services/hf_monitor.py ADDED
@@ -0,0 +1,143 @@
+import requests
+import time
+import logging
+from typing import Dict
+from utils.config import config
+
+logger = logging.getLogger(__name__)
+
+class HFEndpointMonitor:
+    """Monitor Hugging Face endpoint status and health"""
+
+    def __init__(self):
+        self.endpoint_url = config.hf_api_url.rstrip('/') if config.hf_api_url else ""
+        self.hf_token = config.hf_token
+        self.last_check = 0
+        self.check_interval = 300  # 5 minutes
+        self._cached_status = None
+
+    def get_endpoint_status(self) -> Dict:
+        """Get current HF endpoint status"""
+        current_time = time.time()
+
+        # Return cached status if checked recently
+        if (self._cached_status and
+                current_time - self.last_check < 60):
+            return self._cached_status
+
+        self.last_check = current_time
+
+        # Check if configured
+        if not self.endpoint_url or not self.hf_token:
+            status = {
+                "status": "not_configured",
+                "message": "HF endpoint not configured",
+                "available": False,
+                "initializing": False
+            }
+            self._cached_status = status
+            return status
+
+        try:
+            # Check endpoint status
+            headers = {"Authorization": f"Bearer {self.hf_token}"}
+            models_url = f"{self.endpoint_url}/models"
+
+            response = requests.get(
+                models_url,
+                headers=headers,
+                timeout=15
+            )
+
+            if response.status_code in [200, 201]:
+                status = {
+                    "status": "available",
+                    "message": "HF endpoint is ready",
+                    "available": True,
+                    "initializing": False,
+                    "status_code": response.status_code
+                }
+            elif response.status_code == 503:
+                status = {
+                    "status": "scaled_to_zero",
+                    "message": "HF endpoint is scaled to zero",
+                    "available": False,
+                    "initializing": False,
+                    "status_code": 503
+                }
+            else:
+                status = {
+                    "status": "error",
+                    "message": f"HF endpoint error: {response.status_code}",
+                    "available": False,
+                    "initializing": False,
+                    "status_code": response.status_code
+                }
+
+        except requests.exceptions.Timeout:
+            status = {
+                "status": "timeout",
+                "message": "HF endpoint timeout (may be initializing)",
+                "available": False,
+                "initializing": True
+            }
+        except Exception as e:
+            status = {
+                "status": "error",
+                "message": f"HF endpoint error: {str(e)}",
+                "available": False,
+                "initializing": False
+            }
+
+        self._cached_status = status
+        return status
+
+    def get_human_readable_status(self) -> str:
+        """Get human-readable status message"""
+        status = self.get_endpoint_status()
+
+        status_messages = {
+            "not_configured": "🟡 HF Endpoint: Not configured",
+            "available": "🟢 HF Endpoint: Available and ready",
+            "scaled_to_zero": "🔴 HF Endpoint: Scaled to zero (send message to wake up)",
+            "timeout": "⏳ HF Endpoint: Initializing (may take 4 minutes)",
+            "error": f"❌ HF Endpoint: Error - {status.get('message', 'Unknown error')}"
+        }
+
+        return status_messages.get(status["status"], "⚪ HF Endpoint: Unknown status")
+
+    def attempt_wake_up(self) -> bool:
+        """Attempt to wake up the HF endpoint"""
+        if not self.endpoint_url or not self.hf_token:
+            return False
+
+        try:
+            headers = {
+                "Authorization": f"Bearer {self.hf_token}",
+                "Content-Type": "application/json"
+            }
+
+            # Send a minimal request to wake up the endpoint
+            payload = {
+                "model": "DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
+                "messages": [{"role": "user", "content": "Hello"}],
+                "max_tokens": 10,
+                "stream": False
+            }
+
+            chat_url = f"{self.endpoint_url}/chat/completions"
+            response = requests.post(
+                chat_url,
+                headers=headers,
+                json=payload,
+                timeout=45
+            )
+
+            return response.status_code in [200, 201]
+
+        except Exception as e:
+            logger.warning(f"Failed to wake up HF endpoint: {e}")
+            return False
+
+# Global instance
+hf_monitor = HFEndpointMonitor()
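
Usage note: a hypothetical sketch of how the new monitor might be driven from calling code. The methods and status keys ("status", "scaled_to_zero") come from the class added above; the surrounding flow and printed messages are illustrative assumptions, not part of this commit.

# Hypothetical usage sketch, not part of this commit.
from src.services.hf_monitor import hf_monitor

# One-line status for display in a UI or log.
print(hf_monitor.get_human_readable_status())

status = hf_monitor.get_endpoint_status()
if status["status"] == "scaled_to_zero":
    # Send a minimal chat request to trigger scale-up of the endpoint.
    if hf_monitor.attempt_wake_up():
        print("Wake-up request accepted; endpoint should be initializing")
    else:
        print("Wake-up failed; check hf_token and hf_api_url in utils.config")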