kshitijthakkar committed
Commit 02a42b8 · 1 Parent(s): 60fc856

run via Ollama GGUF quants for faster inference speed
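This commit swaps the in-process transformers backend for an Ollama server running a Q8_0 GGUF quant of the model, pulled straight from its Hugging Face repo. As a rough sketch, the same flow done by hand looks like this (assuming the `ollama` CLI is installed; the model tag is the one used in `entrypoint.sh` below):

```bash
# Start the Ollama server (entrypoint.sh runs this in the background)
ollama serve &

# Pull the Q8_0 GGUF quant directly from the Hugging Face repo
ollama pull "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"

# Quick interactive smoke test
ollama run "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"
```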

Dockerfile CHANGED
@@ -1,30 +1,44 @@
-# Dockerfile for a Python application with user permissions
+# Dockerfile - Hugging Face Space with Ollama (small model)
 FROM python:3.11-slim
 
-# Install system dependencies as root
-RUN apt-get update && apt-get install -y build-essential && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
-
-# Create user and set up directory structure as root
-RUN useradd -m -u 1000 user && \
+# Set Ollama environment
+ENV OLLAMA_HOST=0.0.0.0:11434
+ENV OLLAMA_ORIGINS=http://*,https://*
+# Optional: change model storage to /data for better caching
+# ENV OLLAMA_MODELS=/data/ollama
+
+# Install dependencies
+RUN apt-get update && \
+    apt-get install -y curl ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create non-root user and app directory
+RUN useradd -m -u 1000 appuser && \
     mkdir -p /app && \
-    chown -R user:user /app
+    chown -R appuser:appuser /app
 
-# Set working directory
+USER appuser
 WORKDIR /app
 
-# Switch to user AFTER setting up permissions
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
+# Install Ollama CLI
+RUN mkdir -p ~/.local/bin && \
+    curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | tar -xvz -C ~/.local/bin ollama && \
+    chmod +x ~/.local/bin/ollama
+
+ENV PATH="/home/appuser/.local/bin:$PATH"
 
-# Copy files with proper ownership
-COPY --chown=user:user . /app
+# Copy app
+COPY --chown=appuser:appuser . /app
 
 # Install Python dependencies
-COPY --chown=user:user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir --user -r requirements.txt
+    pip install --no-cache-dir -r requirements.txt
+
+# Expose Gradio port (required)
+EXPOSE 7860
+
+# Entrypoint
+COPY --chown=appuser:appuser entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
 
-# Make start.sh executable
-EXPOSE 8000 7860
-# Run the startup script
-CMD bash -c "python /app/enhanced_app.py"
+CMD ["/app/entrypoint.sh"]
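For local testing outside the Space, the image can be built and run directly; the container serves the Gradio UI on 7860 while Ollama listens on 11434 inside. A minimal sketch (the image tag is illustrative):

```bash
# Build the image from the repo root
docker build -t loggenix-ollama-space .

# Run it; add -p 11434:11434 if you also want to reach the Ollama API from the host
docker run --rm -p 7860:7860 loggenix-ollama-space
```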
enhanced_app.py CHANGED
@@ -7,8 +7,8 @@ import json
 import random
 import os
 #from model_handler import generate_response, get_inference_configs
-from enhanced_model_handler import generate_response, get_inference_configs
-
+#from enhanced_model_handler import generate_response, get_inference_configs
+from model_handler_ollama import generate_response, get_inference_configs
 import torch
 
 # Configuration for datasets
@@ -775,5 +775,6 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        debug=True
+        debug=True,
+        mcp_server=True
     )
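A note on the new `mcp_server=True` flag: it asks Gradio to expose the app's functions over the Model Context Protocol alongside the web UI. In recent Gradio releases this needs the MCP extra installed, which `requirements.txt` presumably covers; if not, the likely fix is:

```bash
pip install "gradio[mcp]"
```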
enhanced_model_handler.py CHANGED
@@ -202,7 +202,7 @@ def load_model() -> Tuple[Optional[Any], Optional[Any]]:
     try:
         model_kwargs = {
             "device_map": "auto",
-            "dtype": torch.float16,
+            #"dtype": torch.float16,
             "use_cache": False,
             "trust_remote_code": True,
             #"cache_dir": "./model_cache"
@@ -223,7 +223,9 @@ def load_model() -> Tuple[Optional[Any], Optional[Any]]:
         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
         model = model.eval()
         logger.info("Model loaded successfully")
-
+        print(next(model.parameters()).device)
+        from accelerate import infer_auto_device_map
+        print(infer_auto_device_map(model))  # Should show "cuda" for all layers
     except torch.cuda.OutOfMemoryError:
         logger.error("CUDA out of memory. Try reducing batch size or using CPU")
         return None, None
entrypoint.sh ADDED
@@ -0,0 +1,28 @@
+#!/bin/bash
+# entrypoint.sh
+
+set -e
+
+echo "🔹 Starting Ollama server in background..."
+OLLAMA_HOST=0.0.0.0:11434 ollama serve &
+OLLAMA_PID=$!
+
+# Wait until Ollama API is responsive
+echo "🔹 Waiting for Ollama API..."
+until curl -f http://localhost:11434/ > /dev/null 2>&1; do
+    echo "🟡 Ollama not ready... retrying in 3s"
+    sleep 3
+done
+echo "🟢 Ollama is live!"
+
+# Pull your lightweight model
+MODEL_NAME="hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"
+echo "🔽 Pulling model: $MODEL_NAME"
+ollama pull "$MODEL_NAME" || {
+    echo "❌ Failed to pull model. Check name and internet."
+    exit 1
+}
+
+# Start your app
+echo "🚀 Launching enhanced_app.py"
+exec python /app/enhanced_app.py
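Once the container is up, the readiness check and the model can be exercised by hand; `/api/tags` and `/api/generate` are the same endpoints `model_handler_ollama.py` calls. A short smoke test:

```bash
# Liveness probe (what the until-loop above polls)
curl -f http://localhost:11434/

# List the models Ollama has pulled
curl http://localhost:11434/api/tags

# One-shot generation against the pulled quant
curl http://localhost:11434/api/generate -d '{
  "model": "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0",
  "prompt": "User: What is 125 + 675?\n\nAssistant: ",
  "stream": false
}'
```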
model_handler_ollama.py ADDED
@@ -0,0 +1,464 @@
+import requests
+import json
+import re
+import time
+from typing import Dict, Any, Optional, List
+
+# Ollama configuration
+OLLAMA_BASE_URL = "http://localhost:11434"  # Default Ollama URL
+MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"  # Replace with your actual model name in Ollama
+
+# Inference configurations
+INFERENCE_CONFIGS = {
+    "Optimized for Speed": {
+        "num_predict": 512,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Fast responses with limited output length"
+    },
+    "Middle-ground": {
+        "num_predict": 2048,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Balanced performance and output quality"
+    },
+    "Full Capacity": {
+        "num_predict": 4096,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Maximum output length with dynamic allocation"
+    }
+}
+
+
+def get_inference_configs():
+    """Get available inference configurations"""
+    return INFERENCE_CONFIGS
+
+
+def check_ollama_connection():
+    """Check if Ollama is running and accessible"""
+    try:
+        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def list_ollama_models():
+    """List available models in Ollama"""
+    try:
+        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
+        if response.status_code == 200:
+            models = response.json().get("models", [])
+            return [model["name"] for model in models]
+        return []
+    except requests.RequestException:
+        return []
+
+
+def load_model():
+    """Check Ollama connection and model availability"""
+    if not check_ollama_connection():
+        raise ConnectionError(
+            "Cannot connect to Ollama. Please make sure Ollama is running.\n"
+            "Start Ollama with: ollama serve"
+        )
+
+    available_models = list_ollama_models()
+    if MODEL_NAME not in available_models:
+        print(f"Warning: Model '{MODEL_NAME}' not found in Ollama.")
+        print(f"Available models: {available_models}")
+        print(f"Pull your model with: ollama pull {MODEL_NAME}")
+        return False
+
+    print(f"Using Ollama model: {MODEL_NAME}")
+    return True
+
+
+# ===== TOOL DEFINITIONS =====
+
+def calculate_numbers(operation: str, num1: float, num2: float) -> Dict[str, Any]:
+    """
+    Sample tool to perform basic mathematical operations on two numbers.
+
+    Args:
+        operation: The operation to perform ('add', 'subtract', 'multiply', 'divide')
+        num1: First number
+        num2: Second number
+
+    Returns:
+        Dictionary with result and operation details
+    """
+    try:
+        num1, num2 = float(num1), float(num2)
+
+        if operation.lower() == 'add':
+            result = num1 + num2
+        elif operation.lower() == 'subtract':
+            result = num1 - num2
+        elif operation.lower() == 'multiply':
+            result = num1 * num2
+        elif operation.lower() == 'divide':
+            if num2 == 0:
+                return {"error": "Division by zero is not allowed"}
+            result = num1 / num2
+        else:
+            return {"error": f"Unknown operation: {operation}"}
+
+        return {
+            "result": result,
+            "operation": operation,
+            "operands": [num1, num2],
+            "formatted": f"{num1} {operation} {num2} = {result}"
+        }
+    except ValueError as e:
+        return {"error": f"Invalid number format: {str(e)}"}
+    except Exception as e:
+        return {"error": f"Calculation error: {str(e)}"}
+
+
+# Tool registry
+AVAILABLE_TOOLS = {
+    "calculate_numbers": {
+        "function": calculate_numbers,
+        "description": "Perform basic mathematical operations (add, subtract, multiply, divide) on two numbers",
+        "parameters": {
+            "operation": "The mathematical operation to perform",
+            "num1": "First number",
+            "num2": "Second number"
+        }
+    }
+}
+
+
+def execute_tool_call(tool_name: str, **kwargs) -> Dict[str, Any]:
+    """Execute a tool call with given parameters"""
+    print(f"Executing tool: {tool_name} with parameters: {kwargs}")
+    if tool_name not in AVAILABLE_TOOLS:
+        return {"error": f"Unknown tool: {tool_name}"}
+
+    try:
+        tool_function = AVAILABLE_TOOLS[tool_name]["function"]
+        result = tool_function(**kwargs)
+        return {
+            "tool_name": tool_name,
+            "parameters": kwargs,
+            "result": result
+        }
+    except Exception as e:
+        print(f"Tool execution failed: {str(e)}")
+        return {
+            "tool_name": tool_name,
+            "parameters": kwargs,
+            "error": f"Tool execution error: {str(e)}"
+        }
+
+
+def parse_tool_calls(text: str) -> list:
+    """
+    Parse tool calls from model output.
+    Supports both formats:
+    - [TOOL_CALL:tool_name(param1=value1, param2=value2)]
+    - <tool_call>{"name": "tool_name", "parameters": {"param1": "value1", "param2": "value2"}}</tool_call>
+    """
+    tool_calls = []
+
+    # Pattern for both formats
+    pattern = r'(\[TOOL_CALL:(\w+)\((.*?)\)\]|<tool_call>\s*{"name":\s*"(\w+)",\s*"parameters":\s*{([^}]*)}\s*}\s*</tool_call>)'
+    matches = re.findall(pattern, text)
+    print("Raw matches:", matches)
+
+    for match in matches:
+        full_match, old_tool_name, old_params, json_tool_name, json_params = match
+
+        # Determine which format was matched
+        if old_tool_name:  # Old format: [TOOL_CALL:tool_name(params)]
+            tool_name = old_tool_name
+            params_str = old_params
+            original_call = f"[TOOL_CALL:{tool_name}({params_str})]"
+
+            try:
+                params = {}
+                if params_str.strip():
+                    param_pairs = params_str.split(',')
+                    for pair in param_pairs:
+                        if '=' in pair:
+                            key, value = pair.split('=', 1)
+                            key = key.strip()
+                            value = value.strip().strip('"\'')  # Remove quotes
+                            params[key] = value
+
+                tool_calls.append({
+                    "tool_name": tool_name,
+                    "parameters": params,
+                    "original_call": original_call
+                })
+
+            except Exception as e:
+                print(f"Error parsing old format tool call '{tool_name}({params_str})': {e}")
+                continue
+
+        elif json_tool_name:  # JSON format: <tool_call>...</tool_call>
+            tool_name = json_tool_name
+            params_str = json_params
+            original_call = full_match
+
+            try:
+                params = {}
+                if params_str.strip():
+                    # Parse JSON-like parameters
+                    param_pairs = params_str.split(',')
+                    for pair in param_pairs:
+                        if ':' in pair:
+                            key, value = pair.split(':', 1)
+                            key = key.strip().strip('"\'')
+                            value = value.strip().strip('"\'')
+                            params[key] = value
+
+                tool_calls.append({
+                    "tool_name": tool_name,
+                    "parameters": params,
+                    "original_call": original_call
+                })
+
+            except Exception as e:
+                print(f"Error parsing JSON format tool call '{tool_name}': {e}")
+                continue
+
+    return tool_calls
+
+
+def process_tool_calls(text: str) -> str:
+    """Process tool calls in the generated text and replace with results"""
+    tool_calls = parse_tool_calls(text)
+
+    if not tool_calls:
+        return text
+
+    processed_text = text
+
+    for tool_call in tool_calls:
+        tool_name = tool_call["tool_name"]
+        parameters = tool_call["parameters"]
+        original_call = tool_call["original_call"]
+
+        try:
+            # Validate parameters before execution
+            if not isinstance(parameters, dict):
+                raise ValueError(f"Invalid parameters for tool {tool_name}: {parameters}")
+
+            # Execute tool
+            result = execute_tool_call(tool_name, **parameters)
+
+            # Create replacement text
+            if "error" in result:
+                replacement = f"[TOOL_ERROR: {result['error']}]"
+            else:
+                if "result" in result["result"]:
+                    replacement = f"[TOOL_RESULT: {result['result']['formatted']}]"
+                else:
+                    replacement = f"[TOOL_RESULT: {result['result']}]"
+
+            # Replace tool call with result
+            processed_text = processed_text.replace(original_call, replacement)
+
+        except Exception as e:
+            print(f"Error processing tool call '{tool_name}': {e}")
+            replacement = f"[TOOL_ERROR: Failed to process tool call: {str(e)}]"
+            processed_text = processed_text.replace(original_call, replacement)
+
+    return processed_text
+
+
+def call_ollama_api(messages: List[Dict], config: Dict, stream: bool = False) -> str:
+    """
+    Make a request to Ollama API
+
+    Args:
+        messages: List of message dictionaries with 'role' and 'content'
+        config: Configuration dictionary with inference parameters
+        stream: Whether to stream the response
+
+    Returns:
+        Generated response text
+    """
+    # Convert messages to prompt format expected by your model
+    # This might need adjustment based on your model's expected format
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "system":
+            prompt += f"System: {msg['content']}\n\n"
+        elif msg["role"] == "user":
+            prompt += f"User: {msg['content']}\n\n"
+        elif msg["role"] == "assistant":
+            prompt += f"Assistant: {msg['content']}\n\n"
+
+    prompt += "Assistant: "
+
+    payload = {
+        "model": MODEL_NAME,
+        "prompt": prompt,
+        "stream": stream,
+        "options": {
+            "num_predict": config.get("num_predict", 2048),
+            "temperature": config.get("temperature", 0.7),
+            "top_p": config.get("top_p", 0.9),
+            "top_k": config.get("top_k", 40),
+            "repeat_penalty": config.get("repeat_penalty", 1.1),
+        }
+    }
+
+    try:
+        if stream:
+            return stream_ollama_response(payload)
+        else:
+            response = requests.post(
+                f"{OLLAMA_BASE_URL}/api/generate",
+                json=payload,
+                timeout=300  # 5 minutes timeout
+            )
+            response.raise_for_status()
+
+            result = response.json()
+            return result.get("response", "")
+
+    except requests.RequestException as e:
+        raise ConnectionError(f"Failed to connect to Ollama: {str(e)}")
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid response from Ollama: {str(e)}")
+
+
+def stream_ollama_response(payload: Dict) -> str:
+    """Stream response from Ollama and return complete text"""
+    full_response = ""
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_BASE_URL}/api/generate",
+            json=payload,
+            stream=True,
+            timeout=300
+        )
+        response.raise_for_status()
+
+        for line in response.iter_lines():
+            if line:
+                try:
+                    chunk = json.loads(line.decode('utf-8'))
+                    if 'response' in chunk:
+                        token = chunk['response']
+                        full_response += token
+                        print(token, end='', flush=True)  # Print tokens as they come
+
+                    if chunk.get('done', False):
+                        break
+
+                except json.JSONDecodeError:
+                    continue
+
+    except requests.RequestException as e:
+        raise ConnectionError(f"Streaming failed: {str(e)}")
+
+    print()  # New line after streaming
+    return full_response
+
+
+def generate_response(system_prompt: str, user_input: str, config_name: str = "Middle-ground",
+                      stream: bool = False) -> str:
+    """
+    Generate response using Ollama API with the given system prompt and user input.
+
+    Args:
+        system_prompt: System instruction for the model
+        user_input: User's input message
+        config_name: Configuration preset to use
+        stream: Whether to stream the response
+
+    Returns:
+        Generated response text
+    """
+    # Load/check model
+    if not load_model():
+        return "Error: Model not available in Ollama"
+
+    config = INFERENCE_CONFIGS[config_name]
+
+    # Prepare messages
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_input}
+    ]
+
+    start_time = time.time()
+
+    try:
+        # Generate response using Ollama
+        generated_response = call_ollama_api(messages, config, stream=stream)
+
+        inference_time = time.time() - start_time
+        print(f"Inference time: {inference_time:.2f} seconds")
+
+        # Process any tool calls in the generated response
+        processed_response = process_tool_calls(generated_response)
+
+        return processed_response
+
+    except Exception as e:
+        print(f"Error generating response: {str(e)}")
+        return f"Error: {str(e)}"
+
+
+# Example usage and testing functions
+def test_connection():
+    """Test Ollama connection and model availability"""
+    print("Testing Ollama connection...")
+
+    if not check_ollama_connection():
+        print("❌ Cannot connect to Ollama")
+        print("Make sure Ollama is running: ollama serve")
+        return False
+
+    print("✅ Ollama is running")
+
+    models = list_ollama_models()
+    print(f"Available models: {models}")
+
+    if MODEL_NAME not in models:
+        print(f"❌ Model '{MODEL_NAME}' not found")
+        print(f"Pull the model with: ollama pull {MODEL_NAME}")
+        return False
+
+    print(f"✅ Model '{MODEL_NAME}' is available")
+    return True
+
+
+def example_usage():
+    """Example of how to use the system"""
+    if not test_connection():
+        return
+
+    system_prompt = """You are a helpful AI assistant with access to tools. When you need to perform mathematical calculations, use the available tools by calling them in this format: [TOOL_CALL:calculate_numbers(operation="add", num1="10", num2="5")]
+
+Available tools:
+- calculate_numbers: Perform basic math operations (add, subtract, multiply, divide)
+"""
+
+    user_input = "What is 125 + 675? Please calculate this for me."
+
+    print("Generating response...")
+    response = generate_response(system_prompt, user_input, "Middle-ground", stream=True)
+    print(f"\nFinal response: {response}")
+
+
+if __name__ == "__main__":
+    # Update MODEL_NAME to match your model in Ollama
+    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"  # Change this!
+
+    example_usage()
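Since the module guards its demo behind `__main__`, it doubles as a self-test: with the Ollama server running and the model pulled, executing it directly runs `test_connection()` followed by the tool-calling example above.

```bash
# Assumes an Ollama server on localhost:11434 with the model already pulled
python model_handler_ollama.py
```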