kshitijthakkar committed
Commit 2bd7a23 · 1 Parent(s): 5c1925e

updated new model

Browse files:
- enhanced_app.py +22 -14
- enhanced_model_handler.py +5 -5
- entrypoint.sh +1 -1
- model_handler.py +7 -6
- model_handler_ollama.py +2 -2
enhanced_app.py
CHANGED
@@ -24,6 +24,10 @@ DATASET_CONFIGS = {
     'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large': {
         'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
         'split': 'train'
+    },
+    'Loggenix Synthetic AI Tasks Eval (with outputs) v7-large': {
+        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs-v7-sft-v1',
+        'split': 'train'
     }
 }

@@ -390,7 +394,7 @@ def create_interface():
     3. Analyze patterns for model improvement

     ### 💡 **Tips:**
-    - Start with "
+    - Start with "Optimized for Speed" for general testing
     - Use specific task types to focus your evaluation
     - Flag responses immediately when issues are noticed
     - Regularly review flagged responses for patterns

@@ -406,8 +410,8 @@ def create_interface():
     1. **Select Task Type**: Choose from available task types (loaded from the inference dataset)
     2. **Configure Inference**: Select optimization level:
        - `Optimized for Speed`: Fast responses (max 512 tokens)
-       - `
-       - `Full Capacity`: Maximum context utilization
+       - `Middle-ground`: Comprehensive answers (max 2048 tokens)
+       - `Full Capacity`: Maximum context utilization (max 8192 tokens)
     3. **Review System Prompt**: The system prompt auto-loads based on task type (editable)
     4. **Chat Interface**:
        - Enter messages in the input field

@@ -433,8 +437,9 @@ def create_interface():

     ### 📚 **Available Datasets:**
     - **Loggenix Synthetic AI Tasks Eval (small)**: Compact evaluation set
-    - **Loggenix Synthetic AI Tasks Eval v5 (large)**: Extended evaluation dataset
-    - **Loggenix Synthetic AI Tasks Eval v6 (large)**:
+    - **Loggenix Synthetic AI Tasks Eval v5 (large)**: Extended evaluation dataset-large-models
+    - **Loggenix Synthetic AI Tasks Eval v6 (large)**: Extended evaluation dataset-small-models
+    - **Loggenix Synthetic AI Tasks Eval v7 (large)**: Latest evaluation dataset-large-models

     ### 📋 **How to Use:**
     1. **Select Dataset**: Choose from the dropdown (displays dataset info automatically)

@@ -497,12 +502,12 @@ def create_interface():
     with gr.Accordion("🛠️ Technical Specifications", open=False, elem_classes="panel"):
         gr.Markdown("""
     ### 🤖 **Model Details:**
-    **Primary Model**: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+    **Primary Model**: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1`

     - **Architecture**: Mixture of Experts (MOE)
     - **Total Parameters**: 330M (16 experts, 2 active)
     - **Active Parameters**: 185M
-    - **Context Length**:
+    - **Context Length**: 8192 tokens
     - **Precision**: FP16
     - **Flash Attention**: Supported
     - **Tool Calling**: Enabled

@@ -513,17 +518,20 @@ def create_interface():
         "Optimized for Speed": {
             "max_new_tokens": 512,
             "temperature": 0.7,
-            "do_sample": True
+            "do_sample": True,
+            "use_cache": False
         },
-        "
+        "Middle-ground": {
             "max_new_tokens": 2048,
             "temperature": 0.8,
-            "do_sample": True
+            "do_sample": True,
+            "use_cache": False
         },
         "Full Capacity": {
-            "max_new_tokens":
+            "max_new_tokens": 8192,
             "temperature": 0.9,
-            "do_sample": True
+            "do_sample": True,
+            "use_cache": False
         }
     }
     ```

@@ -555,7 +563,7 @@ def create_interface():
     ollama serve

     # Pull the quantized model (Q8_0 format)
-    ollama pull hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+    ollama pull hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0
     ```

     ### ✅ **Advantages:**

@@ -568,7 +576,7 @@ def create_interface():
     ### ⚙️ **Configuration:**
     ```python
     OLLAMA_BASE_URL = "http://localhost:11434"
-    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"
     ```

     ### 🛠️ **Tool Calling Support:**
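For context, the new `DATASET_CONFIGS` entry added above can be exercised end-to-end with the standard `datasets` API. A minimal sketch, assuming the repo is public; `load_eval_dataset` is an illustrative helper, not a function in `enhanced_app.py`:

```python
# Minimal sketch: loading the newly added v7 eval split with the standard
# `datasets` API. DATASET_CONFIGS mirrors the entry added in this commit;
# load_eval_dataset is a hypothetical helper for illustration only.
from datasets import load_dataset

DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs) v7-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs-v7-sft-v1',
        'split': 'train',
    },
}

def load_eval_dataset(name: str):
    cfg = DATASET_CONFIGS[name]
    return load_dataset(cfg['repo_id'], split=cfg['split'])

ds = load_eval_dataset('Loggenix Synthetic AI Tasks Eval (with outputs) v7-large')
print(len(ds), ds.column_names)
```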
enhanced_model_handler.py
CHANGED
@@ -60,7 +60,7 @@ except Exception as e:
 # Global model and tokenizer variables
 model = None
 tokenizer = None
-MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1"

 # Inference configurations
 INFERENCE_CONFIGS = {

@@ -75,8 +75,8 @@ INFERENCE_CONFIGS = {
         "description": "Fast responses with limited output length"
     },
     "Middle-ground": {
-        "max_new_tokens_base":
-        "max_new_tokens_cap":
+        "max_new_tokens_base": 4096,
+        "max_new_tokens_cap": 4096,
         "min_tokens": 50,
         "temperature": 0.7,
         "top_p": 0.9,

@@ -85,8 +85,8 @@ INFERENCE_CONFIGS = {
         "description": "Balanced performance and output quality"
     },
     "Full Capacity": {
-        "max_new_tokens_base":
-        "max_new_tokens_cap":
+        "max_new_tokens_base": 8192,
+        "max_new_tokens_cap": 8192,
         "min_tokens": 1,
         "temperature": 0.7,
         "top_p": 0.9,
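A sketch of how the raised `Middle-ground` and `Full Capacity` budgets might be applied at generation time, assuming the handler clamps a requested budget between `min_tokens` and `max_new_tokens_cap`; the clamping rule and `build_generate_kwargs` are illustrative assumptions, not code from `enhanced_model_handler.py`:

```python
# Illustrative only: clamp a requested token budget against the new
# base/cap values from this commit before building generate() kwargs.
def build_generate_kwargs(config: dict, requested_tokens: int) -> dict:
    max_new = min(max(requested_tokens, config["min_tokens"]),
                  config["max_new_tokens_cap"])
    return {
        "max_new_tokens": max_new,
        "temperature": config["temperature"],
        "top_p": config["top_p"],
        "do_sample": True,
    }

full_capacity = {
    "max_new_tokens_base": 8192,
    "max_new_tokens_cap": 8192,
    "min_tokens": 1,
    "temperature": 0.7,
    "top_p": 0.9,
}
# A 10k request is capped at the new 8192 ceiling.
print(build_generate_kwargs(full_capacity, requested_tokens=10_000))
```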
entrypoint.sh
CHANGED
@@ -16,7 +16,7 @@ done
 echo "🟢 Ollama is live!"

 # Pull your lightweight model
-MODEL_NAME="hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+MODEL_NAME="hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"
 echo "🔽 Pulling model: $MODEL_NAME"
 /app/ollama pull "$MODEL_NAME" || {
     echo "❌ Failed to pull model. Check name and internet."
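Once `entrypoint.sh` has pulled the tag, the model can be smoke-tested against Ollama's documented `/api/generate` endpoint. A minimal sketch, assuming Ollama is reachable on its default port:

```python
# Smoke test for the model pulled in entrypoint.sh, via Ollama's
# /api/generate REST endpoint with streaming disabled.
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0",
        "prompt": "Say hello in one sentence.",
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["response"])
```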
model_handler.py
CHANGED
@@ -14,7 +14,7 @@ torch.backends.cudnn.allow_tf32 = True
 # Global model and tokenizer variables
 model = None
 tokenizer = None
-MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1"

 # Inference configurations
 INFERENCE_CONFIGS = {

@@ -29,8 +29,8 @@ INFERENCE_CONFIGS = {
         "description": "Fast responses with limited output length"
     },
     "Middle-ground": {
-        "max_new_tokens_base":
-        "max_new_tokens_cap":
+        "max_new_tokens_base": 4096,
+        "max_new_tokens_cap": 4096,
         "min_tokens": 50,
         "temperature": 0.7,
         "top_p": 0.9,

@@ -39,8 +39,8 @@ INFERENCE_CONFIGS = {
         "description": "Balanced performance and output quality"
     },
     "Full Capacity": {
-        "max_new_tokens_base":
-        "max_new_tokens_cap":
+        "max_new_tokens_base": 8192,
+        "max_new_tokens_cap": 8192,
         "min_tokens": 1,
         "temperature": 0.7,
         "top_p": 0.9,

@@ -56,6 +56,7 @@ def get_inference_configs():
     return INFERENCE_CONFIGS


+
 def load_model():
     """Load model and tokenizer with optimizations"""
     global model, tokenizer

@@ -87,7 +88,7 @@ def load_model():
         dtype=torch.float16,  # Use half precision for speed
         attn_implementation="flash_attention_2" if hasattr(torch.nn, 'scaled_dot_product_attention') else None,
         use_cache=True,
-        quantization_config=quantization_config,
+        #quantization_config=quantization_config,
     ).eval()

     # Enable gradient checkpointing if available
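A minimal sketch of what the updated `model_handler.py` load path amounts to now that `quantization_config` is commented out: fp16 weights with the KV cache on. Assumes the checkpoint loads with stock `AutoModelForCausalLM`; it uses the long-standing `torch_dtype` argument (the handler passes `dtype`, which recent transformers releases also accept), and the prompt is illustrative:

```python
# Sketch, not the handler itself: load the v7 checkpoint in half precision
# with no quantization, mirroring this commit, then run one generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # half precision, matching the handler
    use_cache=True,             # KV cache stays enabled at load time
    # quantization_config omitted: this commit comments it out
).eval()

prompt = "Summarize: service restarted after OOM."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.7)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```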
model_handler_ollama.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Dict, Any, Optional, List

 # Ollama configuration
 OLLAMA_BASE_URL = "http://localhost:11434"  # Default Ollama URL
-MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"  # Replace with your actual model name in Ollama

 # Inference configurations
 INFERENCE_CONFIGS = {

@@ -459,6 +459,6 @@ Available tools:

 if __name__ == "__main__":
     # Update MODEL_NAME to match your model in Ollama
-    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-
+    MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"  # Change this!

     example_usage()
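The renamed Ollama tag can be driven the same way through the documented `/api/chat` endpoint. A short sketch; error handling is reduced to `raise_for_status`, which may differ from what `model_handler_ollama.py` actually does:

```python
# Exercise the renamed MODEL_NAME through Ollama's /api/chat endpoint.
# Payload shape follows the public Ollama API; the message is illustrative.
import requests

OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v7-sft-v1-Q8_0-GGUF:Q8_0"

resp = requests.post(
    f"{OLLAMA_BASE_URL}/api/chat",
    json={
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": "Summarize this log line: OOMKilled"}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["message"]["content"])
```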