# Source: Hugging Face Space "kshitijthakkar/loggenix-moe-0.3B-A0.1B-demo"
# (captured from the Spaces file viewer; Space status: Sleeping; file size: 31,901 bytes)

import gradio as gr
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go
import datetime
import json
import random
import os
#from model_handler import generate_response, get_inference_configs
#from enhanced_model_handler import generate_response, get_inference_configs
from model_handler_ollama import generate_response, get_inference_configs
import torch

# Evaluation datasets offered in the "Eval Samples" tab.
# Maps the display name shown in the UI to the repo id and split that are
# passed to load_dataset. Insertion order is the order shown in the dropdown.
DATASET_CONFIGS = {
    display_name: {'repo_id': repo_id, 'split': 'train'}
    for display_name, repo_id in (
        (
            'Loggenix Synthetic AI Tasks Eval (with outputs)-small',
            'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        ),
        (
            'Loggenix Synthetic AI Tasks Eval (with outputs) v5-large',
            'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        ),
        (
            'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large',
            'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
        ),
    )
}


# Load main dataset for inference tab
def load_inference_dataset():
    """Fetch the dataset that backs the inference tab.

    Returns:
        The ``train`` split of ``kshitijthakkar/synthetic-ai-tasks-eval-v5``
        as a DataFrame. If loading or conversion raises, a one-row DataFrame
        with a single ``Error`` column describing the failure is returned
        instead, so callers never see the exception.
    """
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        frame = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        ).to_pandas()
        print(f"✓ Successfully loaded: {len(frame)} rows, {len(frame.columns)} columns")
    except Exception as exc:
        reason = str(exc)
        print(f"✗ Error loading dataset: {reason}")
        return pd.DataFrame({'Error': [f'Failed to load: {reason}']})
    else:
        return frame


# Load dataset for eval samples tab
def load_eval_datasets():
    """Load every dataset listed in ``DATASET_CONFIGS`` for the eval-samples tab.

    Returns:
        dict mapping each display name to a DataFrame. A dataset whose load
        raises is represented by a one-row placeholder frame with ``Error``
        and ``Dataset`` columns, so one failure does not stop the others
        from loading.
    """
    loaded = {}
    for label, spec in DATASET_CONFIGS.items():
        try:
            print(f"Loading {label}...")
            frame = load_dataset(
                spec['repo_id'],
                split=spec['split'],
                trust_remote_code=True
            ).to_pandas()
            loaded[label] = frame
            print(f"✓ Successfully loaded {label}: {len(frame)} rows")
        except Exception as exc:
            reason = str(exc)
            print(f"✗ Error loading {label}: {reason}")
            placeholder = {
                'Error': [f'Failed to load: {reason}'],
                'Dataset': [spec['repo_id']]
            }
            loaded[label] = pd.DataFrame(placeholder)
    return loaded


# Load datasets
# Both loaders run once, at import time, so importing this module triggers the
# load_dataset calls. The loaders catch load errors and return placeholder
# frames with an 'Error' column, so these globals are always DataFrames
# (INFERENCE_DATASET) or a dict of DataFrames keyed by display name
# (EVAL_DATASETS).
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


# ===== TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING =====

def get_task_types():
    """List the distinct, non-null task types in the inference dataset.

    Returns:
        The unique ``task_type`` values as strings, in order of first
        appearance. When the dataset has no ``task_type`` column, the
        single-item list ``["No task types available"]`` is returned.
    """
    if 'task_type' not in INFERENCE_DATASET.columns:
        return ["No task types available"]

    distinct_values = INFERENCE_DATASET['task_type'].unique().tolist()
    return [str(value) for value in distinct_values if pd.notna(value)]


def get_task_by_type(task_type):
    """Return the task text of the first inference-dataset row of a given type.

    Args:
        task_type: Value to match against the ``task_type`` column.

    Returns:
        ``str()`` of the ``task`` cell of the first matching row, or the
        message ``"No task found for this type"`` when either column is
        missing or no row matches.
    """
    not_found = "No task found for this type"

    available = INFERENCE_DATASET.columns
    if 'task_type' not in available or 'task' not in available:
        return not_found

    matches = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
    if len(matches) == 0:
        return not_found

    return str(matches.iloc[0]['task'])


def chat_interface_with_inference(prompt, history, system_prompt, inference_config):
    """Run one chat turn against the model and record it in the history.

    Args:
        prompt: Text entered by the user. ``None``, empty, or whitespace-only
            input is ignored and leaves the history untouched.
        history: List of ``(speaker, text)`` entries. It is appended to in
            place; ``None`` is treated as an empty history.
        system_prompt: System prompt passed to the model. When it is ``None``
            or blank, no inference is run and a hint asking the user to pick
            a task type is recorded as the assistant reply instead.
        inference_config: Name of the inference configuration, forwarded to
            ``generate_response`` as ``config_name``.

    Returns:
        A ``(history, "")`` tuple; the empty string clears the input box.
        Any exception raised during inference is caught and recorded in the
        history as an assistant message rather than propagated.
    """
    if history is None:
        history = []

    # Inputs may arrive as None when the function is called through the API
    # rather than the textbox, so normalise before calling .strip().
    if not (prompt or "").strip():
        return history, ""

    # Add user message to history
    history.append(("You", prompt))

    try:
        if not (system_prompt or "").strip():
            response = "Please select a task type to load system prompt first."
        else:
            # The configuration is passed by name only. The previous local
            # lookup of the config dict was unused, and its hard-coded
            # fallback key could raise KeyError and abort a valid request.
            response = generate_response(
                system_prompt=system_prompt,
                user_input=prompt,
                config_name=inference_config
            )

        # Format and add AI response to history
        history.append(("AI Assistant", f"**AI Assistant:**\n{response}"))

    except Exception as e:
        # Surface the failure in the conversation instead of crashing the UI.
        error_msg = f"**AI Assistant:**\nError during inference: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""


def flag_response(history, flagged_message, flag_reason):
    """Append a flagged assistant response to ``logs/flagged_responses.log``.

    Args:
        history: Conversation as a sequence of ``(speaker, text)`` entries.
        flagged_message: Dropdown label of the form ``"Response <i>: ..."``;
            ``<i>`` is the index of the entry in ``history``.
        flag_reason: Free-text reason, stored via ``str()``.

    Returns:
        A status string: a success message, a validation message when the
        selection is empty or does not point at an assistant entry, or an
        error message if parsing or writing raised.
    """
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        # Label looks like "Response 3: ..." -> second token "3:" -> drop the colon.
        index_token = flagged_message.split()[1]
        position = int(index_token[:-1])

        points_at_assistant = (
            position < len(history) and history[position][0] == "AI Assistant"
        )
        if not points_at_assistant:
            return "You can only flag assistant responses."

        record = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(history[position][1]),
            "conversation_context": history,
        }

        # One JSON document per line; the viewer tab reads the file line by line.
        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as log_file:
            log_file.write(f"{json.dumps(record)}\n")
    except Exception as exc:
        return f"Error flagging response: {str(exc)}"
    else:
        return f"Response flagged successfully: {flag_reason}"


def get_assistant_responses(history):
    """Build the flagging dropdown's options from the assistant turns.

    Each assistant entry becomes ``"Response <i>: <first 50 chars>..."``,
    where ``<i>`` is its index in ``history``.

    Args:
        history: Conversation as a sequence of ``(speaker, text)`` entries.

    Returns:
        A ``gr.update`` that sets the choices and selects the first one. With
        no assistant entries, the only choice is ``"No responses available"``.
    """
    labels = []
    for position, entry in enumerate(history):
        if entry[0] != "AI Assistant":
            continue
        preview = str(entry[1])[:50]
        labels.append(f"Response {position}: {preview}...")

    if not labels:
        labels = ["No responses available"]

    # labels is never empty here, so the first entry is always a valid default.
    return gr.update(choices=labels, value=labels[0])


def display_selected_message(selected_index, history):
    """Return the text of the assistant response chosen in the flag dropdown.

    Args:
        selected_index: Dropdown label of the form ``"Response <i>: ..."``, or
            the placeholder ``"No responses available"``.
        history: Conversation as a sequence of ``(speaker, text)`` entries.

    Returns:
        The text of entry ``<i>`` when it is an assistant entry;
        ``"Invalid selection."`` when the index is out of range or points at
        a non-assistant entry; the placeholder unchanged; or an ``"Error: ..."``
        string if the label could not be parsed.
    """
    if selected_index == "No responses available":
        return "No responses available"

    try:
        # "Response 3: ..." -> second token "3:" -> strip the trailing colon.
        position = int(selected_index.split()[1][:-1])
        if position >= len(history) or history[position][0] != "AI Assistant":
            return "Invalid selection."
        return history[position][1]
    except Exception as exc:
        return f"Error: {str(exc)}"


def clear_inference_history():
    """Reset the inference tab's chat and flag dropdown.

    Returns:
        A pair: an empty history list, and a ``gr.update`` that resets the
        flag dropdown to the single placeholder choice.
    """
    placeholder = "No responses available"
    emptied_history = []
    dropdown_reset = gr.update(choices=[placeholder], value=placeholder)
    return emptied_history, dropdown_reset


# ===== TAB 2: EVAL SAMPLES =====

def update_eval_table(dataset_name):
    """Return a preview of the selected eval dataset.

    Args:
        dataset_name: Display name used as the key in ``EVAL_DATASETS``.

    Returns:
        The first 100 rows of the dataset, or an empty DataFrame when the
        name is not a known dataset.
    """
    if dataset_name not in EVAL_DATASETS:
        return pd.DataFrame()

    preview_rows = 100
    return EVAL_DATASETS[dataset_name].head(preview_rows)


def get_eval_dataset_info(dataset_name):
    """Describe the selected eval dataset as a Markdown snippet.

    Args:
        dataset_name: Display name used as the key in ``EVAL_DATASETS``.

    Returns:
        Markdown text with the dataset name, row count (with thousands
        separators), column count and comma-separated column names, or
        ``"No dataset selected"`` when the name is not a known dataset.
    """
    if dataset_name not in EVAL_DATASETS:
        return "No dataset selected"

    frame = EVAL_DATASETS[dataset_name]
    row_count = len(frame)
    column_names = frame.columns.tolist()
    joined_names = ', '.join(column_names)

    # The indentation inside the literal is part of the returned text.
    return f"""
        **Dataset**: {dataset_name}
        - **Rows**: {row_count:,}
        - **Columns**: {len(column_names)}
        - **Column Names**: {joined_names}
        """


# def get_task_types_for_eval(dataset_name):
#     """Get unique task types from selected eval dataset"""
#     if dataset_name in EVAL_DATASETS and 'task_type' in EVAL_DATASETS[dataset_name].columns:
#         task_types = EVAL_DATASETS[dataset_name]['task_type'].unique().tolist()
#         return [str(t) for t in task_types if pd.notna(t)]
#     return ["No task types available"]
def get_task_types_for_eval(dataset_name):
    """List the distinct, non-null task types of the selected eval dataset.

    Args:
        dataset_name: Display name used as the key in ``EVAL_DATASETS``.

    Returns:
        A list of the unique ``task_type`` values as strings, in order of
        first appearance. When the dataset is unknown or has no ``task_type``
        column, the single-item list ``["No task types available"]``.
    """
    if dataset_name not in EVAL_DATASETS:
        return ["No task types available"]

    frame = EVAL_DATASETS[dataset_name]
    if 'task_type' not in frame.columns:
        return ["No task types available"]

    distinct_values = frame['task_type'].unique().tolist()
    return [str(value) for value in distinct_values if pd.notna(value)]


def get_tasks_by_type_eval(dataset_name, task_type):
    """Build display labels for the eval rows of one task type.

    Args:
        dataset_name: Display name used as the key in ``EVAL_DATASETS``.
        task_type: Value to match against the ``task_type`` column.

    Returns:
        One label per matching row, formatted ``"Row <index>: <task>"``, with
        the task text cut to 100 characters plus ``"..."`` when it is longer.
        ``["No tasks found"]`` when the dataset is unknown, a required column
        is missing, or nothing matches.
    """
    if dataset_name not in EVAL_DATASETS:
        return ["No tasks found"]

    frame = EVAL_DATASETS[dataset_name]
    if 'task_type' not in frame.columns or 'task' not in frame.columns:
        return ["No tasks found"]

    matches = frame[frame['task_type'] == task_type]
    if len(matches) == 0:
        return ["No tasks found"]

    def _label(row_index, row):
        # Only add the ellipsis when the text was actually shortened.
        text = str(row['task'])
        preview = text[:100] + "..." if len(text) > 100 else text
        return f"Row {row_index}: {preview}"

    return [_label(row_index, row) for row_index, row in matches.iterrows()]


# def get_selected_row_data(dataset_name, task_type, selected_task):
#     """Get all data for the selected row"""
#     if not selected_task or selected_task == "No tasks found":
#         return "", "", "", "", "", "",""
#
#     try:
#         # Extract row index from selected_task
#         row_idx = int(selected_task.split("Row ")[1].split(":")[0])
#
#         if dataset_name in EVAL_DATASETS:
#             df = EVAL_DATASETS[dataset_name]
#             if row_idx in df.index:
#                 row = df.loc[row_idx]
#
#                 # Extract all fields with safe handling for missing columns
#                 task = str(row.get('task', 'N/A'))
#                 task_type_val = str(row.get('task_type', 'N/A'))
#                 input_model = str(row.get('input_model', 'N/A'))
#                 expected_response = str(row.get('expected_response', 'N/A'))
#                 loggenix_output = str(row.get('loggenix_output', 'N/A'))
#                 output_model = str(row.get('output_model', 'N/A'))
#                 input_text = str(row.get('input', 'N/A'))
#
#
#                 return task_type_val, input_model, output_model, task, input_text, expected_response, loggenix_output
#
#     except Exception as e:
#         return f"Error: {str(e)}", "", "", "", "", "", "", ""
#
#     return "", "", "", "", "", "", ""

def get_selected_row_data_by_type(dataset_name, task_type):
    """Return the detail fields of the first eval row with the given task type.

    Args:
        dataset_name: Display name used as the key in ``EVAL_DATASETS``.
        task_type: Value to match against the ``task_type`` column.

    Returns:
        A 6-tuple of strings in the order ``(input_model, output_model, task,
        input, expected_response, loggenix_output)``. A column missing from
        the row yields ``'N/A'``. Six empty strings are returned when the
        dataset is unknown, lacks ``task_type``/``task``, or nothing matches.
    """
    nothing = ("", "", "", "", "", "")

    if dataset_name not in EVAL_DATASETS:
        return nothing

    frame = EVAL_DATASETS[dataset_name]
    if 'task_type' not in frame.columns or 'task' not in frame.columns:
        return nothing

    matches = frame[frame['task_type'] == task_type]
    if len(matches) == 0:
        return nothing

    first_row = matches.iloc[0]
    # Order matters: it must line up with the output components in the UI.
    field_order = (
        'input_model',
        'output_model',
        'task',
        'input',
        'expected_response',
        'loggenix_output',
    )
    return tuple(str(first_row.get(column, 'N/A')) for column in field_order)

# ===== TAB 3: VIEW FLAGGED RESPONSES =====

def read_flagged_messages():
    """Summarise ``logs/flagged_responses.log`` as a table.

    Each line of the log is parsed as one JSON object. Row ``i`` of the result
    corresponds to line ``i`` of the file.

    Returns:
        A DataFrame with the columns ``Timestamp``, ``Flag Reason``,
        ``Flagged Message`` (first 100 characters followed by ``"..."``) and
        ``Conversation Context`` (``"<n> messages"``). An empty DataFrame when
        the file is missing or empty. A one-row DataFrame with an ``Error``
        column if reading or parsing raised.
    """
    log_path = "logs/flagged_responses.log"

    def _summarise(raw_line):
        record = json.loads(raw_line)
        context_size = len(record.get("conversation_context", []))
        return {
            "Timestamp": record.get("timestamp", "N/A"),
            "Flag Reason": record.get("flag_reason", "N/A"),
            "Flagged Message": record.get("flagged_message", "N/A")[:100] + "...",
            "Conversation Context": f"{context_size} messages"
        }

    try:
        if not os.path.exists(log_path):
            return pd.DataFrame()

        with open(log_path, "r") as log_file:
            raw_lines = log_file.readlines()

        if not raw_lines:
            return pd.DataFrame()

        return pd.DataFrame([_summarise(raw_line) for raw_line in raw_lines])
    except Exception as exc:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(exc)}"]})


def handle_row_select(evt: gr.SelectData):
    """Load the conversation that belongs to the clicked flagged-message row.

    The row number of the selection (``evt.index[0]``) is used as the line
    number in ``logs/flagged_responses.log``.

    Args:
        evt: Selection event from the flagged-messages table.

    Returns:
        The ``conversation_context`` stored on that log line (empty list if
        the key is absent). An empty list when the log file is missing or the
        row is beyond the end of the file. A single ``("System", ...)`` entry
        describing the error if reading or parsing raised.
    """
    log_path = "logs/flagged_responses.log"

    try:
        if not os.path.exists(log_path):
            return []

        with open(log_path, "r") as log_file:
            log_lines = log_file.readlines()

        row_number = evt.index[0]
        if row_number >= len(log_lines):
            return []

        entry = json.loads(log_lines[row_number])
        return entry.get("conversation_context", [])
    except Exception as exc:
        return [("System", f"Error loading conversation: {str(exc)}")]


# ===== MAIN INTERFACE =====

def create_interface():
    """Build the Gradio Blocks app for the evaluation suite.

    The app has five tabs:

    * "🚀 Inference Use Case" - chat against the model with a task-specific
      system prompt, and flag individual assistant responses.
    * "📊 Eval Samples" - inspect the first row of a chosen task type in one
      of the eval datasets, with expected and actual output side by side.
    * "👀 View Flagged Responses" - table of flagged responses; selecting a
      row shows the stored conversation.
    * "📈 Model Eval Results" - static placeholder content.
    * "ℹ️ About" - static model description.

    Returns:
        The assembled ``gr.Blocks`` instance. It is not launched here.
    """
    # Resolve the defaults once so each dropdown's choices, its initial value
    # and the widgets that depend on that value always agree.
    inference_task_types = get_task_types()
    initial_task_type = inference_task_types[0] if inference_task_types else None

    # The dropdown's change handler is not triggered for its initial value, so
    # the matching system prompt is loaded up front. The "not found" message is
    # not a usable prompt; in that case the box starts empty.
    initial_system_prompt = get_task_by_type(initial_task_type) if initial_task_type else ""
    if initial_system_prompt == "No task found for this type":
        initial_system_prompt = ""

    eval_dataset_names = list(EVAL_DATASETS.keys())
    initial_eval_dataset = eval_dataset_names[0] if eval_dataset_names else None

    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():
            # TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING
            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing with Response Flagging")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Task type dropdown
                        task_type_dropdown = gr.Dropdown(
                            choices=inference_task_types,
                            value=initial_task_type,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        # Inference configuration
                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        # System prompt (editable), pre-filled for the
                        # task type that is selected by default.
                        system_prompt = gr.Textbox(
                            value=initial_system_prompt,
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                # Chat interface section
                gr.Markdown("### 💬 Chat Interface")
                with gr.Row():
                    with gr.Column(scale=2):
                        chat_display = gr.Chatbot(label="Conversation History", height=400)
                        # Holds the same (speaker, text) entries that are shown
                        # in chat_display; the flagging handlers read from it.
                        chat_history_state = gr.State([])

                        # Chat input
                        with gr.Row():
                            chat_input = gr.Textbox(
                                placeholder="Enter your message here...",
                                label="Your Message",
                                scale=4
                            )
                            send_btn = gr.Button("Send", variant="primary", scale=1)

                        with gr.Row():
                            clear_chat_btn = gr.Button("🗑️ Clear History", variant="secondary")

                    # Flagging section
                    with gr.Column(scale=1):
                        gr.Markdown("### 🚩 Flag Response")

                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )

                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4,
                            max_lines=6
                        )

                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )

                        flag_btn = gr.Button("🚩 Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Status", visible=True, lines=2)

                # Event handlers for Tab 1
                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                # Chat functionality: the Send button and the Enter key run the
                # same three-step chain (infer -> sync state -> refresh the
                # flag dropdown), so it is wired once for both triggers.
                for trigger in (send_btn.click, chat_input.submit):
                    trigger(
                        chat_interface_with_inference,
                        inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                        outputs=[chat_display, chat_input]
                    ).then(
                        lambda x: x,  # Update state
                        inputs=[chat_display],
                        outputs=[chat_history_state]
                    ).then(
                        get_assistant_responses,
                        inputs=[chat_history_state],
                        outputs=[flagged_message_index]
                    )

                clear_chat_btn.click(
                    clear_inference_history,
                    outputs=[chat_display, flagged_message_index]
                ).then(
                    lambda: [],
                    outputs=[chat_history_state]
                )

                # Flagging functionality
                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )

            # TAB 2: EVAL SAMPLES
            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")
                gr.Markdown("Select dataset and task type to view detailed information")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=eval_dataset_names,
                            value=initial_eval_dataset,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        # Choices are filled in for the dataset selected by
                        # default; previously the list stayed empty until the
                        # user switched to a different dataset. No value is
                        # preselected, so picking a type still fires .change
                        # and populates the detail fields.
                        eval_task_type_dropdown = gr.Dropdown(
                            choices=get_task_types_for_eval(initial_eval_dataset),
                            value=None,
                            label="Select Task Type",
                            info="Choose task type from selected dataset",
                            allow_custom_value=True
                        )

                    with gr.Column(scale=1):
                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(initial_eval_dataset)
                        )

                # Task details section
                gr.Markdown("### Task Details")
                with gr.Row():
                    input_model_field = gr.Textbox(
                        label="input_model",
                        lines=1,
                        interactive=False
                    )

                    output_model_field = gr.Textbox(
                        label="output_model",
                        lines=1,
                        interactive=False
                    )
                with gr.Row():
                    task_field = gr.Textbox(
                        label="Task",
                        lines=2,
                        max_lines=5,
                        interactive=False
                    )

                with gr.Row():
                    input_field = gr.Textbox(
                        label="input",
                        lines=12,
                        max_lines=20,
                        interactive=False
                    )

                # Large text fields for outputs side by side
                gr.Markdown("### Expected vs Actual Response Comparison")

                # Variable names match the labels. They used to be swapped,
                # with a compensating swap in the outputs list below; the
                # on-screen layout and contents are unchanged.
                with gr.Row():
                    expected_response_field = gr.Textbox(
                        label="Expected Response",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )
                    loggenix_output_field = gr.Textbox(
                        label="Loggenix Output",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )

                # Event handlers for Tab 2
                def update_eval_components(dataset_name):
                    """Refresh the dataset info and task-type choices for a dataset."""
                    info = get_eval_dataset_info(dataset_name)
                    task_types = get_task_types_for_eval(dataset_name)
                    return info, gr.update(choices=task_types,
                                           value=task_types[0] if task_types else "No task types available")

                eval_dataset_dropdown.change(
                    fn=update_eval_components,
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_dataset_info, eval_task_type_dropdown]
                )
                # Output order mirrors the tuple returned by
                # get_selected_row_data_by_type: (input_model, output_model,
                # task, input, expected_response, loggenix_output).
                eval_task_type_dropdown.change(
                    fn=get_selected_row_data_by_type,
                    inputs=[eval_dataset_dropdown, eval_task_type_dropdown],
                    outputs=[input_model_field, output_model_field, task_field, input_field,
                             expected_response_field, loggenix_output_field]
                )

            # TAB 3: VIEW FLAGGED RESPONSES
            with gr.Tab("👀 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                # Event handlers for Tab 3
                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            # TAB 4: MODEL EVAL RESULTS
            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚧 Coming Soon")
                gr.Markdown(
                    "This section will display comprehensive model evaluation metrics, charts, and performance analysis.")

                # Placeholder content
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### Evaluation Metrics")
                        gr.Markdown("- Accuracy scores")
                        gr.Markdown("- Performance benchmarks")
                        gr.Markdown("- Comparative analysis")

                    with gr.Column():
                        gr.Markdown("#### Visualization")
                        gr.Markdown("- Performance charts")
                        gr.Markdown("- Score distributions")
                        gr.Markdown("- Trend analysis")

            # TAB 5: ABOUT
            with gr.Tab("ℹ️ About"):
                gr.Markdown("## About Loggenix MOE Model")

                gr.Markdown("""
                ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool`

                This is a fine-tuned Mixture of Experts (MOE) model designed for specialized AI tasks with tool calling capabilities.

                #### Key Features:
                - **Architecture**: MOE with 0.3B total parameters, 0.1B active parameters
                - **Training**: Fine-tuned with learning rate 7e-5, batch size 16
                - **Hardware**: Optimized for RTX 4090 GPU
                - **Capabilities**: Tool calling, instruction following, task-specific responses

                #### Model Specifications:
                - **Total Parameters**: 0.3B
                - **Active Parameters**: 0.1B  
                - **Context Length**: 4096 tokens
                - **Precision**: FP16 for optimal performance
                - **Flash Attention**: Supported for faster inference

                #### Sample Inference Code:
                ```python
                from transformers import AutoModelForCausalLM, AutoTokenizer
                import torch

                # Load model and tokenizer
                model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2"
                ).eval()

                # Prepare messages
                messages = [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": "Calculate 25 + 37"}
                ]

                # Format and generate
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=tokenizer.pad_token_id
                    )

                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(response)
                ```

                #### Tool Calling Support:
                The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks.

                #### Performance Optimizations:
                - **Speed Mode**: Max 512 new tokens for fast responses  
                - **Balanced Mode**: Max 2048 new tokens for comprehensive answers
                - **Full Capacity**: Dynamic token allocation up to context limit

                ---

                **Developed by**: Kshitij Thakkar  
                **Version**: v6.2  
                **License**: Please check model repository for licensing details
                """)

        # Load initial data: fill the flagged-responses table when the page opens
        demo.load(
            fn=read_flagged_messages,
            outputs=[flagged_messages_display]
        )

    return demo


# Entry point: build the UI and serve it when the file is run as a script.
if __name__ == "__main__":
    print("Starting AI Tasks Evaluation Suite...")

    # Keyword arguments for demo.launch, gathered in one place.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
        "mcp_server": True,
    }

    demo = create_interface()
    demo.launch(**launch_options)