kshitijthakkar committed
Commit 02a42b8 · 1 Parent(s): 60fc856

run via Ollama GGUF quants for faster inference speed
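This commit swaps the in-process transformers backend for an Ollama server running a Q8_0 GGUF quant of the model, pulled straight from its Hugging Face repo. As a rough sketch, the same flow done by hand looks like this (assuming the `ollama` CLI is installed; the model tag is the one used in `entrypoint.sh` below):

```bash
# Start the Ollama server (entrypoint.sh runs this in the background)
ollama serve &

# Pull the Q8_0 GGUF quant directly from the Hugging Face repo
ollama pull "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"

# Quick interactive smoke test
ollama run "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"
```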

Dockerfile CHANGED
@@ -1,30 +1,44 @@
-# Dockerfile for a Python application with user permissions
+# Dockerfile - Hugging Face Space with Ollama (small model)
 FROM python:3.11-slim
 
-# Install system dependencies as root
-RUN apt-get update && apt-get install -y build-essential && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
-
-# Create user and set up directory structure as root
-RUN useradd -m -u 1000 user && \
+# Set Ollama environment
+ENV OLLAMA_HOST=0.0.0.0:11434
+ENV OLLAMA_ORIGINS=http://*,https://*
+# Optional: change model storage to /data for better caching
+# ENV OLLAMA_MODELS=/data/ollama
+
+# Install dependencies
+RUN apt-get update && \
+    apt-get install -y curl ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create non-root user and app directory
+RUN useradd -m -u 1000 appuser && \
     mkdir -p /app && \
-    chown -R user:user /app
+    chown -R appuser:appuser /app
 
-# Set working directory
+USER appuser
 WORKDIR /app
 
-# Switch to user AFTER setting up permissions
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
+# Install Ollama CLI
+RUN mkdir -p ~/.local/bin && \
+    curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | tar -xvz -C ~/.local/bin ollama && \
+    chmod +x ~/.local/bin/ollama
+
+ENV PATH="/home/appuser/.local/bin:$PATH"
 
-# Copy files with proper ownership
-COPY --chown=user:user . /app
+# Copy app
+COPY --chown=appuser:appuser . /app
 
 # Install Python dependencies
-COPY --chown=user:user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir --user -r requirements.txt
+    pip install --no-cache-dir -r requirements.txt
+
+# Expose Gradio port (required)
+EXPOSE 7860
+
+# Entrypoint
+COPY --chown=appuser:appuser entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
 
-# Make start.sh executable
-EXPOSE 8000 7860
-# Run the startup script
-CMD bash -c "python /app/enhanced_app.py"
+CMD ["/app/entrypoint.sh"]
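For local testing outside the Space, the image can be built and run directly; the container serves the Gradio UI on 7860 while Ollama listens on 11434 inside. A minimal sketch (the image tag is illustrative):

```bash
# Build the image from the repo root
docker build -t loggenix-ollama-space .

# Run it; add -p 11434:11434 if you also want to reach the Ollama API from the host
docker run --rm -p 7860:7860 loggenix-ollama-space
```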
enhanced_app.py CHANGED
@@ -7,8 +7,8 @@ import json
 import random
 import os
 #from model_handler import generate_response, get_inference_configs
-from enhanced_model_handler import generate_response, get_inference_configs
-
+#from enhanced_model_handler import generate_response, get_inference_configs
+from model_handler_ollama import generate_response, get_inference_configs
 import torch
 
 # Configuration for datasets
@@ -775,5 +775,6 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        debug=True
+        debug=True,
+        mcp_server=True
     )
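A note on the new `mcp_server=True` flag: it asks Gradio to expose the app's functions over the Model Context Protocol alongside the web UI. In recent Gradio releases this needs the MCP extra installed, which `requirements.txt` presumably covers; if not, the likely fix is:

```bash
pip install "gradio[mcp]"
```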
enhanced_model_handler.py CHANGED
@@ -202,7 +202,7 @@ def load_model() -> Tuple[Optional[Any], Optional[Any]]:
     try:
         model_kwargs = {
             "device_map": "auto",
-            "dtype": torch.float16,
+            #"dtype": torch.float16,
             "use_cache": False,
             "trust_remote_code": True,
             #"cache_dir": "./model_cache"
@@ -223,7 +223,9 @@ def load_model() -> Tuple[Optional[Any], Optional[Any]]:
         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
         model = model.eval()
         logger.info("Model loaded successfully")
-
+        print(next(model.parameters()).device)
+        from accelerate import infer_auto_device_map
+        print(infer_auto_device_map(model))  # Should show "cuda" for all layers
     except torch.cuda.OutOfMemoryError:
         logger.error("CUDA out of memory. Try reducing batch size or using CPU")
         return None, None
entrypoint.sh ADDED
@@ -0,0 +1,28 @@
+#!/bin/bash
+# entrypoint.sh
+
+set -e
+
+echo "🔹 Starting Ollama server in background..."
+OLLAMA_HOST=0.0.0.0:11434 ollama serve &
+OLLAMA_PID=$!
+
+# Wait until Ollama API is responsive
+echo "🔹 Waiting for Ollama API..."
+until curl -f http://localhost:11434/ > /dev/null 2>&1; do
+    echo "🟡 Ollama not ready... retrying in 3s"
+    sleep 3
+done
+echo "🟢 Ollama is live!"
+
+# Pull your lightweight model
+MODEL_NAME="hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"
+echo "🔽 Pulling model: $MODEL_NAME"
+ollama pull "$MODEL_NAME" || {
+    echo "❌ Failed to pull model. Check name and internet."
+    exit 1
+}
+
+# Start your app
+echo "🚀 Launching enhanced_app.py"
+exec python /app/enhanced_app.py
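Once the container is up, the readiness check and the model can be exercised by hand; `/api/tags` and `/api/generate` are the same endpoints `model_handler_ollama.py` calls. A short smoke test:

```bash
# Liveness probe (what the until-loop above polls)
curl -f http://localhost:11434/

# List the models Ollama has pulled
curl http://localhost:11434/api/tags

# One-shot generation against the pulled quant
curl http://localhost:11434/api/generate -d '{
  "model": "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0",
  "prompt": "User: What is 125 + 675?\n\nAssistant: ",
  "stream": false
}'
```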
model_handler_ollama.py ADDED
@@ -0,0 +1,464 @@
+import requests
+import json
+import re
+import time
+from typing import Dict, Any, Optional, List
+
+# Ollama configuration
+OLLAMA_BASE_URL = "http://localhost:11434"  # Default Ollama URL
+MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"  # Replace with your actual model name in Ollama
+
+# Inference configurations
+INFERENCE_CONFIGS = {
+    "Optimized for Speed": {
+        "num_predict": 512,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Fast responses with limited output length"
+    },
+    "Middle-ground": {
+        "num_predict": 2048,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Balanced performance and output quality"
+    },
+    "Full Capacity": {
+        "num_predict": 4096,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "top_k": 40,
+        "repeat_penalty": 1.1,
+        "description": "Maximum output length with dynamic allocation"
+    }
+}
+
+
+def get_inference_configs():
+    """Get available inference configurations"""
+    return INFERENCE_CONFIGS
+
+
+def check_ollama_connection():
+    """Check if Ollama is running and accessible"""
+    try:
+        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def list_ollama_models():
+    """List available models in Ollama"""
+    try:
+        response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
+        if response.status_code == 200:
+            models = response.json().get("models", [])
+            return [model["name"] for model in models]
+        return []
+    except requests.RequestException:
+        return []
+
+
+def load_model():
+    """Check Ollama connection and model availability"""
+    if not check_ollama_connection():
+        raise ConnectionError(
+            "Cannot connect to Ollama. Please make sure Ollama is running.\n"
+            "Start Ollama with: ollama serve"
+        )
+
+    available_models = list_ollama_models()
+    if MODEL_NAME not in available_models:
+        print(f"Warning: Model '{MODEL_NAME}' not found in Ollama.")
+        print(f"Available models: {available_models}")
+        print(f"Pull your model with: ollama pull {MODEL_NAME}")
+        return False
+
+    print(f"Using Ollama model: {MODEL_NAME}")
+    return True
+
+
+# ===== TOOL DEFINITIONS =====
+
+def calculate_numbers(operation: str, num1: float, num2: float) -> Dict[str, Any]:
+    """
+    Sample tool to perform basic mathematical operations on two numbers.
+
+    Args:
+        operation: The operation to perform ('add', 'subtract', 'multiply', 'divide')
+        num1: First number
+        num2: Second number
+
+    Returns:
+        Dictionary with result and operation details
+    """
+    try:
+        num1, num2 = float(num1), float(num2)
+
+        if operation.lower() == 'add':
+            result = num1 + num2
+        elif operation.lower() == 'subtract':
+            result = num1 - num2
+        elif operation.lower() == 'multiply':
+            result = num1 * num2
+        elif operation.lower() == 'divide':
+            if num2 == 0:
+                return {"error": "Division by zero is not allowed"}
+            result = num1 / num2
+        else:
+            return {"error": f"Unknown operation: {operation}"}
+
+        return {
+            "result": result,
+            "operation": operation,
+            "operands": [num1, num2],
+            "formatted": f"{num1} {operation} {num2} = {result}"
+        }
+    except ValueError as e:
+        return {"error": f"Invalid number format: {str(e)}"}
+    except Exception as e:
+        return {"error": f"Calculation error: {str(e)}"}
+
+
+# Tool registry
+AVAILABLE_TOOLS = {
+    "calculate_numbers": {
+        "function": calculate_numbers,
+        "description": "Perform basic mathematical operations (add, subtract, multiply, divide) on two numbers",
+        "parameters": {
+            "operation": "The mathematical operation to perform",
+            "num1": "First number",
+            "num2": "Second number"
+        }
+    }
+}
+
+
+def execute_tool_call(tool_name: str, **kwargs) -> Dict[str, Any]:
+    """Execute a tool call with given parameters"""
+    print(f"Executing tool: {tool_name} with parameters: {kwargs}")
+    if tool_name not in AVAILABLE_TOOLS:
+        return {"error": f"Unknown tool: {tool_name}"}
+
+    try:
+        tool_function = AVAILABLE_TOOLS[tool_name]["function"]
+        result = tool_function(**kwargs)
+        return {
+            "tool_name": tool_name,
+            "parameters": kwargs,
+            "result": result
+        }
+    except Exception as e:
+        print(f"Tool execution failed: {str(e)}")
+        return {
+            "tool_name": tool_name,
+            "parameters": kwargs,
+            "error": f"Tool execution error: {str(e)}"
+        }
+
+
+def parse_tool_calls(text: str) -> list:
+    """
+    Parse tool calls from model output.
+    Supports both formats:
+    - [TOOL_CALL:tool_name(param1=value1, param2=value2)]
+    - <tool_call>{"name": "tool_name", "parameters": {"param1": "value1", "param2": "value2"}}</tool_call>
+    """
+    tool_calls = []
+
+    # Pattern for both formats
+    pattern = r'(\[TOOL_CALL:(\w+)\((.*?)\)\]|<tool_call>\s*{"name":\s*"(\w+)",\s*"parameters":\s*{([^}]*)}\s*}\s*</tool_call>)'
+    matches = re.findall(pattern, text)
+    print("Raw matches:", matches)
+
+    for match in matches:
+        full_match, old_tool_name, old_params, json_tool_name, json_params = match
+
+        # Determine which format was matched
+        if old_tool_name:  # Old format: [TOOL_CALL:tool_name(params)]
+            tool_name = old_tool_name
+            params_str = old_params
+            original_call = f"[TOOL_CALL:{tool_name}({params_str})]"
+
+            try:
+                params = {}
+                if params_str.strip():
+                    param_pairs = params_str.split(',')
+                    for pair in param_pairs:
+                        if '=' in pair:
+                            key, value = pair.split('=', 1)
+                            key = key.strip()
+                            value = value.strip().strip('"\'')  # Remove quotes
+                            params[key] = value
+
+                tool_calls.append({
+                    "tool_name": tool_name,
+                    "parameters": params,
+                    "original_call": original_call
+                })
+
+            except Exception as e:
+                print(f"Error parsing old format tool call '{tool_name}({params_str})': {e}")
+                continue
+
+        elif json_tool_name:  # JSON format: <tool_call>...</tool_call>
+            tool_name = json_tool_name
+            params_str = json_params
+            original_call = full_match
+
+            try:
+                params = {}
+                if params_str.strip():
+                    # Parse JSON-like parameters
+                    param_pairs = params_str.split(',')
+                    for pair in param_pairs:
+                        if ':' in pair:
+                            key, value = pair.split(':', 1)
+                            key = key.strip().strip('"\'')
+                            value = value.strip().strip('"\'')
+                            params[key] = value
+
+                tool_calls.append({
+                    "tool_name": tool_name,
+                    "parameters": params,
+                    "original_call": original_call
+                })
+
+            except Exception as e:
+                print(f"Error parsing JSON format tool call '{tool_name}': {e}")
+                continue
+
+    return tool_calls
+
+
+def process_tool_calls(text: str) -> str:
+    """Process tool calls in the generated text and replace with results"""
+    tool_calls = parse_tool_calls(text)
+
+    if not tool_calls:
+        return text
+
+    processed_text = text
+
+    for tool_call in tool_calls:
+        tool_name = tool_call["tool_name"]
+        parameters = tool_call["parameters"]
+        original_call = tool_call["original_call"]
+
+        try:
+            # Validate parameters before execution
+            if not isinstance(parameters, dict):
+                raise ValueError(f"Invalid parameters for tool {tool_name}: {parameters}")
+
+            # Execute tool
+            result = execute_tool_call(tool_name, **parameters)
+
+            # Create replacement text
+            if "error" in result:
+                replacement = f"[TOOL_ERROR: {result['error']}]"
+            else:
+                if "result" in result["result"]:
+                    replacement = f"[TOOL_RESULT: {result['result']['formatted']}]"
+                else:
+                    replacement = f"[TOOL_RESULT: {result['result']}]"
+
+            # Replace tool call with result
+            processed_text = processed_text.replace(original_call, replacement)
+
+        except Exception as e:
+            print(f"Error processing tool call '{tool_name}': {e}")
+            replacement = f"[TOOL_ERROR: Failed to process tool call: {str(e)}]"
+            processed_text = processed_text.replace(original_call, replacement)
+
+    return processed_text
+
+
+def call_ollama_api(messages: List[Dict], config: Dict, stream: bool = False) -> str:
+    """
+    Make a request to Ollama API
+
+    Args:
+        messages: List of message dictionaries with 'role' and 'content'
+        config: Configuration dictionary with inference parameters
+        stream: Whether to stream the response
+
+    Returns:
+        Generated response text
+    """
+    # Convert messages to prompt format expected by your model
+    # This might need adjustment based on your model's expected format
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "system":
+            prompt += f"System: {msg['content']}\n\n"
+        elif msg["role"] == "user":
+            prompt += f"User: {msg['content']}\n\n"
+        elif msg["role"] == "assistant":
+            prompt += f"Assistant: {msg['content']}\n\n"
+
+    prompt += "Assistant: "
+
+    payload = {
+        "model": MODEL_NAME,
+        "prompt": prompt,
+        "stream": stream,
+        "options": {
+            "num_predict": config.get("num_predict", 2048),
+            "temperature": config.get("temperature", 0.7),
+            "top_p": config.get("top_p", 0.9),
+            "top_k": config.get("top_k", 40),
+            "repeat_penalty": config.get("repeat_penalty", 1.1),
+        }
+    }
+
+    try:
+        if stream:
+            return stream_ollama_response(payload)
+        else:
+            response = requests.post(
+                f"{OLLAMA_BASE_URL}/api/generate",
+                json=payload,
+                timeout=300  # 5 minutes timeout
+            )
+            response.raise_for_status()
+
+            result = response.json()
+            return result.get("response", "")
+
+    except requests.RequestException as e:
+        raise ConnectionError(f"Failed to connect to Ollama: {str(e)}")
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid response from Ollama: {str(e)}")
+
+
+def stream_ollama_response(payload: Dict) -> str:
+    """Stream response from Ollama and return complete text"""
+    full_response = ""
+
+    try:
+        response = requests.post(
+            f"{OLLAMA_BASE_URL}/api/generate",
+            json=payload,
+            stream=True,
+            timeout=300
+        )
+        response.raise_for_status()
+
+        for line in response.iter_lines():
+            if line:
+                try:
+                    chunk = json.loads(line.decode('utf-8'))
+                    if 'response' in chunk:
+                        token = chunk['response']
+                        full_response += token
+                        print(token, end='', flush=True)  # Print tokens as they come
+
+                    if chunk.get('done', False):
+                        break
+
+                except json.JSONDecodeError:
+                    continue
+
+    except requests.RequestException as e:
+        raise ConnectionError(f"Streaming failed: {str(e)}")
+
+    print()  # New line after streaming
+    return full_response
+
+
+def generate_response(system_prompt: str, user_input: str, config_name: str = "Middle-ground",
+                      stream: bool = False) -> str:
+    """
+    Generate response using Ollama API with the given system prompt and user input.
+
+    Args:
+        system_prompt: System instruction for the model
+        user_input: User's input message
+        config_name: Configuration preset to use
+        stream: Whether to stream the response
+
+    Returns:
+        Generated response text
+    """
+    # Load/check model
+    if not load_model():
+        return "Error: Model not available in Ollama"
+
+    config = INFERENCE_CONFIGS[config_name]
+
+    # Prepare messages
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_input}
+    ]
+
+    start_time = time.time()
+
+    try:
+        # Generate response using Ollama
+        generated_response = call_ollama_api(messages, config, stream=stream)
+
+        inference_time = time.time() - start_time
+        print(f"Inference time: {inference_time:.2f} seconds")
+
+        # Process any tool calls in the generated response
+        processed_response = process_tool_calls(generated_response)
+
+        return processed_response
+
+    except Exception as e:
+        print(f"Error generating response: {str(e)}")
+        return f"Error: {str(e)}"
+
+
+# Example usage and testing functions
+def test_connection():
+    """Test Ollama connection and model availability"""
+    print("Testing Ollama connection...")
+
+    if not check_ollama_connection():
+        print("❌ Cannot connect to Ollama")
+        print("Make sure Ollama is running: ollama serve")
+        return False
+
+    print("✅ Ollama is running")
+
+    models = list_ollama_models()
+    print(f"Available models: {models}")
+
+    if MODEL_NAME not in models:
+        print(f"❌ Model '{MODEL_NAME}' not found")
+        print(f"Pull the model with: ollama pull {MODEL_NAME}")
+        return False
+
+    print(f"✅ Model '{MODEL_NAME}' is available")
+    return True
+
+
+def example_usage():
+    """Example of how to use the system"""
+    if not test_connection():
+        return
+
+    system_prompt = """You are a helpful AI assistant with access to tools. When you need to perform mathematical calculations, use the available tools by calling them in this format: [TOOL_CALL:calculate_numbers(operation="add", num1="10", num2="5")]
+
+Available tools:
+- calculate_numbers: Perform basic math operations (add, subtract, multiply, divide)
+"""
+
+    user_input = "What is 125 + 675? Please calculate this for me."
+
+    print("Generating response...")
+    response = generate_response(system_prompt, user_input, "Middle-ground", stream=True)
+    print(f"\nFinal response: {response}")
+
+
+if __name__ == "__main__":
+    # Update MODEL_NAME to match your model in Ollama
+    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"  # Change this!
+
+    example_usage()
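Since the module guards its demo behind `__main__`, it doubles as a self-test: with the Ollama server running and the model pulled, executing it directly runs `test_connection()` followed by the tool-calling example above.

```bash
# Assumes an Ollama server on localhost:11434 with the model already pulled
python model_handler_ollama.py
```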