import gradio as gr
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go
import datetime
import json
import random
import os
# from model_handler import generate_response, get_inference_configs
# from enhanced_model_handler import generate_response, get_inference_configs
from model_handler_ollama import generate_response, get_inference_configs
import torch

# Configuration for datasets
DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs)-small': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v5-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v6-large': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v6-with-outputs',
        'split': 'train'
    }
}


# Load main dataset for inference tab
def load_inference_dataset():
    """Load the main dataset for the inference use case"""
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        dataset = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        )
        df = dataset.to_pandas()
        print(f"✓ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"✗ Error loading dataset: {str(e)}")
        return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})


# Load datasets for eval samples tab
def load_eval_datasets():
    """Load all datasets for evaluation samples"""
    datasets = {}
    for display_name, config in DATASET_CONFIGS.items():
        try:
            print(f"Loading {display_name}...")
            dataset = load_dataset(
                config['repo_id'],
                split=config['split'],
                trust_remote_code=True
            )
            df = dataset.to_pandas()
            datasets[display_name] = df
            print(f"✓ Successfully loaded {display_name}: {len(df)} rows")
        except Exception as e:
            print(f"✗ Error loading {display_name}: {str(e)}")
            datasets[display_name] = pd.DataFrame({
                'Error': [f'Failed to load: {str(e)}'],
                'Dataset': [config['repo_id']]
            })
    return datasets


# Load datasets
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


# ===== TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING =====

def get_task_types():
    """Get unique task types from the inference dataset"""
    if 'task_type' in INFERENCE_DATASET.columns:
        task_types = INFERENCE_DATASET['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_task_by_type(task_type):
    """Get task content by task type"""
    if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
        filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
        if len(filtered) > 0:
            return str(filtered.iloc[0]['task'])
    return "No task found for this type"


def chat_interface_with_inference(prompt, history, system_prompt, inference_config):
    """Enhanced chat interface with model inference and history"""
    if not prompt.strip():
        return history, ""

    # Add user message to history
    history.append(("You", prompt))

    try:
        if not system_prompt.strip():
            response = "Please select a task type to load a system prompt first."
        else:
            # Get inference configuration
            configs = get_inference_configs()
            config = configs.get(inference_config, configs["Optimized for Speed"])

            # Run inference using the model
            response = generate_response(
                system_prompt=system_prompt,
                user_input=prompt,
                config_name=inference_config
            )

        # Format and add AI response to history
        formatted_response = f"**AI Assistant:**\n{response}"
        history.append(("AI Assistant", formatted_response))

    except Exception as e:
        error_msg = f"**AI Assistant:**\nError during inference: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""
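
# Flagged responses are appended to logs/flagged_responses.log as one JSON object per
# line. Illustrative entry only (example values, not real log data):
# {"timestamp": "2025-01-01T12:00:00", "flag_reason": "incorrect calculation",
#  "flagged_message": "**AI Assistant:**\n...",
#  "conversation_context": [["You", "..."], ["AI Assistant", "..."]]}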
def flag_response(history, flagged_message, flag_reason):
    """Flag a response"""
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        flagged_index = int(flagged_message.split()[1][:-1])
        if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
            return "You can only flag assistant responses."

        flagged_message_content = history[flagged_index][1]
        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(flagged_message_content),
            "conversation_context": history,
        }

        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as f:
            f.write(json.dumps(log_entry) + "\n")

        return f"Response flagged successfully: {flag_reason}"
    except Exception as e:
        return f"Error flagging response: {str(e)}"


def get_assistant_responses(history):
    """Get dropdown options for assistant responses"""
    responses = [
        f"Response {i}: {str(msg[1])[:50]}..."
        for i, msg in enumerate(history)
        if msg[0] == "AI Assistant"
    ]
    if not responses:
        responses = ["No responses available"]
    return gr.update(choices=responses, value=responses[0] if responses else "No responses available")


def display_selected_message(selected_index, history):
    """Display the selected flagged message"""
    if selected_index == "No responses available":
        return "No responses available"

    try:
        flagged_index = int(selected_index.split()[1][:-1])
        if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
            return history[flagged_index][1]
        else:
            return "Invalid selection."
    except Exception as e:
        return f"Error: {str(e)}"
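
# Note: the flagging dropdown labels embed each assistant turn's position in the chat
# history (e.g. "Response 3: **AI Assistant:** ..."); flag_response() and
# display_selected_message() recover that index via int(label.split()[1][:-1]).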

def clear_inference_history():
    """Clear chat history for inference tab"""
    return [], gr.update(choices=["No responses available"], value="No responses available")


# ===== TAB 2: EVAL SAMPLES =====

def update_eval_table(dataset_name):
    """Update eval table based on selected dataset"""
    if dataset_name in EVAL_DATASETS:
        return EVAL_DATASETS[dataset_name].head(100)
    return pd.DataFrame()


def get_eval_dataset_info(dataset_name):
    """Get info about selected eval dataset"""
    if dataset_name in EVAL_DATASETS:
        df = EVAL_DATASETS[dataset_name]
        return f"""
**Dataset**: {dataset_name}
- **Rows**: {len(df):,}
- **Columns**: {len(df.columns)}
- **Column Names**: {', '.join(df.columns.tolist())}
"""
    return "No dataset selected"


def get_task_types_for_eval(dataset_name):
    """Get unique task types from the selected eval dataset"""
    if dataset_name in EVAL_DATASETS and 'task_type' in EVAL_DATASETS[dataset_name].columns:
        task_types = EVAL_DATASETS[dataset_name]['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_tasks_by_type_eval(dataset_name, task_type):
    """Get tasks filtered by dataset and task type (currently not wired to the UI)"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            # Create display options with row index and truncated task content
            tasks = []
            for idx, row in filtered.iterrows():
                task_preview = str(row['task'])[:100] + "..." if len(str(row['task'])) > 100 else str(row['task'])
                tasks.append(f"Row {idx}: {task_preview}")
            return tasks
    return ["No tasks found"]

# def get_selected_row_data(dataset_name, task_type, selected_task):
#     """Get all data for the selected row"""
#     if not selected_task or selected_task == "No tasks found":
#         return "", "", "", "", "", "", ""
#
#     try:
#         # Extract row index from selected_task
#         row_idx = int(selected_task.split("Row ")[1].split(":")[0])
#
#         if dataset_name in EVAL_DATASETS:
#             df = EVAL_DATASETS[dataset_name]
#             if row_idx in df.index:
#                 row = df.loc[row_idx]
#
#                 # Extract all fields with safe handling for missing columns
#                 task = str(row.get('task', 'N/A'))
#                 task_type_val = str(row.get('task_type', 'N/A'))
#                 input_model = str(row.get('input_model', 'N/A'))
#                 expected_response = str(row.get('expected_response', 'N/A'))
#                 loggenix_output = str(row.get('loggenix_output', 'N/A'))
#                 output_model = str(row.get('output_model', 'N/A'))
#                 input_text = str(row.get('input', 'N/A'))
#
#                 return task_type_val, input_model, output_model, task, input_text, expected_response, loggenix_output
#
#     except Exception as e:
#         return f"Error: {str(e)}", "", "", "", "", "", "", ""
#
#     return "", "", "", "", "", "", ""


def get_selected_row_data_by_type(dataset_name, task_type):
    """Get all data for the first row of a selected dataset and task type"""
    if (dataset_name in EVAL_DATASETS
            and 'task_type' in EVAL_DATASETS[dataset_name].columns
            and 'task' in EVAL_DATASETS[dataset_name].columns):
        filtered = EVAL_DATASETS[dataset_name][EVAL_DATASETS[dataset_name]['task_type'] == task_type]
        if len(filtered) > 0:
            row = filtered.iloc[0]  # Get the first row

            # Extract all fields with safe handling for missing columns
            task = str(row.get('task', 'N/A'))
            input_model = str(row.get('input_model', 'N/A'))
            expected_response = str(row.get('expected_response', 'N/A'))
            loggenix_output = str(row.get('loggenix_output', 'N/A'))
            output_model = str(row.get('output_model', 'N/A'))
            input_text = str(row.get('input', 'N/A'))

            return input_model, output_model, task, input_text, expected_response, loggenix_output
    return "", "", "", "", "", ""


# ===== TAB 3: VIEW FLAGGED RESPONSES =====

def read_flagged_messages():
    """Read flagged messages from log file"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return pd.DataFrame()

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages = f.readlines()

        if not flagged_messages:
            return pd.DataFrame()

        table_data = []
        for entry in flagged_messages:
            data = json.loads(entry)
            table_data.append({
                "Timestamp": data.get("timestamp", "N/A"),
                "Flag Reason": data.get("flag_reason", "N/A"),
                "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
                "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
            })
        return pd.DataFrame(table_data)
    except Exception as e:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})


def handle_row_select(evt: gr.SelectData):
    """Handle row selection in flagged messages table"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return []

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages_log = f.readlines()

        if evt.index[0] < len(flagged_messages_log):
            selected_entry = json.loads(flagged_messages_log[evt.index[0]])
            conversation_context = selected_entry.get("conversation_context", [])
            return conversation_context
        return []
    except Exception as e:
        return [("System", f"Error loading conversation: {str(e)}")]

# ===== MAIN INTERFACE =====

def create_interface():
    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():
            # TAB 1: INFERENCE USE CASE WITH INTEGRATED FLAGGING
            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing with Response Flagging")

                with gr.Row():
                    with gr.Column(scale=1):
                        # Task type dropdown
                        task_type_dropdown = gr.Dropdown(
                            choices=get_task_types(),
                            value=get_task_types()[0] if get_task_types() else None,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        # Inference configuration
                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        # System prompt (editable)
                        system_prompt = gr.Textbox(
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                # Chat interface section
                gr.Markdown("### 💬 Chat Interface")

                with gr.Row():
                    with gr.Column(scale=2):
                        # Chat display (replacing the old textbox)
                        chat_display = gr.Chatbot(label="Conversation History", height=400)
                        chat_history_state = gr.State([])

                        # Chat input
                        with gr.Row():
                            chat_input = gr.Textbox(
                                placeholder="Enter your message here...",
                                label="Your Message",
                                scale=4
                            )
                            send_btn = gr.Button("Send", variant="primary", scale=1)

                        with gr.Row():
                            clear_chat_btn = gr.Button("🗑️ Clear History", variant="secondary")

                    # Flagging section
                    with gr.Column(scale=1):
                        gr.Markdown("### 🚩 Flag Response")
                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )
                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4,
                            max_lines=6
                        )
                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )
                        flag_btn = gr.Button("🚩 Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Status", visible=True, lines=2)

                # Event handlers for Tab 1
                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                # Chat functionality
                send_btn.click(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                # Enter key support for chat input
                chat_input.submit(
                    chat_interface_with_inference,
                    inputs=[chat_input, chat_history_state, system_prompt, inference_config],
                    outputs=[chat_display, chat_input]
                ).then(
                    lambda x: x,  # Update state
                    inputs=[chat_display],
                    outputs=[chat_history_state]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                clear_chat_btn.click(
                    clear_inference_history,
                    outputs=[chat_display, flagged_message_index]
                ).then(
                    lambda: [],
                    outputs=[chat_history_state]
                )

                # Flagging functionality
                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )
            # TAB 2: EVAL SAMPLES (previous table-based version, kept for reference)
            # with gr.Tab("📊 Eval Samples"):
            #     gr.Markdown("## Dataset Evaluation Samples")
            #
            #     with gr.Row():
            #         with gr.Column(scale=1):
            #             eval_dataset_dropdown = gr.Dropdown(
            #                 choices=list(EVAL_DATASETS.keys()),
            #                 value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
            #                 label="Select Dataset",
            #                 info="Choose evaluation dataset to view"
            #             )
            #
            #             eval_dataset_info = gr.Markdown(
            #                 get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
            #             )
            #
            #     with gr.Row():
            #         eval_table = gr.Dataframe(
            #             value=update_eval_table(list(EVAL_DATASETS.keys())[0]) if EVAL_DATASETS else pd.DataFrame(),
            #             label="Dataset Table",
            #             max_height=800,
            #             min_width=800,
            #             interactive=True,
            #             wrap=True,
            #             show_fullscreen_button=True,
            #             show_copy_button=True,
            #             show_row_numbers=True,
            #             show_search="search",
            #             column_widths=["80px", "80px", "80px", "150px", "250px", "250px", "250px"]
            #         )
            #
            #     # Event handlers for Tab 2
            #     eval_dataset_dropdown.change(
            #         fn=lambda x: (update_eval_table(x), get_eval_dataset_info(x)),
            #         inputs=[eval_dataset_dropdown],
            #         outputs=[eval_table, eval_dataset_info]
            #     )

            # TAB 2: EVAL SAMPLES
            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")
                gr.Markdown("Select dataset and task type to view detailed information")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=list(EVAL_DATASETS.keys()),
                            value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        eval_task_type_dropdown = gr.Dropdown(
                            choices=[],
                            label="Select Task Type",
                            info="Choose task type from selected dataset",
                            allow_custom_value=True
                        )

                    with gr.Column(scale=1):
                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
                        )

                # Task details section
                gr.Markdown("### Task Details")

                with gr.Row():
                    input_model_field = gr.Textbox(
                        label="input_model",
                        lines=1,
                        interactive=False
                    )
                    output_model_field = gr.Textbox(
                        label="output_model",
                        lines=1,
                        interactive=False
                    )

                with gr.Row():
                    task_field = gr.Textbox(
                        label="Task",
                        lines=2,
                        max_lines=5,
                        interactive=False
                    )

                with gr.Row():
                    input_field = gr.Textbox(
                        label="input",
                        lines=12,
                        max_lines=20,
                        interactive=False
                    )

                # Large text fields for outputs side by side
                gr.Markdown("### Expected vs Actual Response Comparison")
                with gr.Row():
                    expected_response_field = gr.Textbox(
                        label="Expected Response",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )
                    loggenix_output_field = gr.Textbox(
                        label="Loggenix Output",
                        lines=30,
                        max_lines=40,
                        interactive=False
                    )

                # Event handlers for Tab 2
                def update_eval_components(dataset_name):
                    """Refresh dataset info and task-type choices when the dataset changes."""
                    info = get_eval_dataset_info(dataset_name)
                    task_types = get_task_types_for_eval(dataset_name)
                    return info, gr.update(
                        choices=task_types,
                        value=task_types[0] if task_types else "No task types available"
                    )

                eval_dataset_dropdown.change(
                    fn=update_eval_components,
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_dataset_info, eval_task_type_dropdown]
                )

                eval_task_type_dropdown.change(
                    fn=get_selected_row_data_by_type,
                    inputs=[eval_dataset_dropdown, eval_task_type_dropdown],
                    outputs=[input_model_field, output_model_field, task_field, input_field,
                             expected_response_field, loggenix_output_field]
                )
            # TAB 3: VIEW FLAGGED RESPONSES
            with gr.Tab("👀 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                # Event handlers for Tab 3
                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            # TAB 4: MODEL EVAL RESULTS
            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚧 Coming Soon")
                gr.Markdown(
                    "This section will display comprehensive model evaluation metrics, charts, and performance analysis.")

                # Placeholder content
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### Evaluation Metrics")
                        gr.Markdown("- Accuracy scores")
                        gr.Markdown("- Performance benchmarks")
                        gr.Markdown("- Comparative analysis")

                    with gr.Column():
                        gr.Markdown("#### Visualization")
                        gr.Markdown("- Performance charts")
                        gr.Markdown("- Score distributions")
                        gr.Markdown("- Trend analysis")

            # TAB 5: ABOUT
            with gr.Tab("ℹ️ About"):
                gr.Markdown("## About Loggenix MOE Model")
                gr.Markdown("""
                ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool`

                This is a fine-tuned Mixture of Experts (MoE) model designed for specialized AI tasks with tool-calling capabilities.

                #### Key Features:
                - **Architecture**: MoE with 0.3B total parameters, 0.1B active parameters
                - **Training**: Fine-tuned with learning rate 7e-5, batch size 16
                - **Hardware**: Optimized for RTX 4090 GPU
                - **Capabilities**: Tool calling, instruction following, task-specific responses

                #### Model Specifications:
                - **Total Parameters**: 0.3B
                - **Active Parameters**: 0.1B
                - **Context Length**: 4096 tokens
                - **Precision**: FP16 for optimal performance
                - **Flash Attention**: Supported for faster inference

                #### Sample Inference Code:
                ```python
                from transformers import AutoModelForCausalLM, AutoTokenizer
                import torch

                # Load model and tokenizer
                model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2"
                ).eval()

                # Prepare messages
                messages = [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": "Calculate 25 + 37"}
                ]

                # Format and generate
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=tokenizer.pad_token_id
                    )

                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(response)
                ```

                #### Tool Calling Support:
                The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks.
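
                As an illustrative sketch only (not taken from the model card): recent `transformers` releases accept a `tools` argument in `apply_chat_template`, so a tool-call prompt for this model could be built roughly as follows, assuming its chat template supports the standard JSON-schema tool format and reusing the `tokenizer` from the snippet above.

                ```python
                # Hypothetical tool definition in JSON-schema form (names are examples only)
                tools = [{
                    "type": "function",
                    "function": {
                        "name": "add",
                        "description": "Add two numbers.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "a": {"type": "number", "description": "First operand"},
                                "b": {"type": "number", "description": "Second operand"}
                            },
                            "required": ["a", "b"]
                        }
                    }
                }]

                messages = [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": "Calculate 25 + 37"}
                ]

                # The chat template renders the tool definitions into the prompt; the model is
                # then expected to respond with a structured call such as add(a=25, b=37).
                prompt = tokenizer.apply_chat_template(
                    messages,
                    tools=tools,
                    tokenize=False,
                    add_generation_prompt=True
                )
                ```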

                #### Performance Optimizations:
                - **Speed Mode**: Max 512 new tokens for fast responses
                - **Balanced Mode**: Max 2048 new tokens for comprehensive answers
                - **Full Capacity**: Dynamic token allocation up to the context limit

                ---
                **Developed by**: Kshitij Thakkar
                **Version**: v6.2
                **License**: Please check the model repository for licensing details
                """)

        # Load initial data
        demo.load(
            fn=read_flagged_messages,
            outputs=[flagged_messages_display]
        )

    return demo


# Launch the application
if __name__ == "__main__":
    print("Starting AI Tasks Evaluation Suite...")
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        mcp_server=True
    )
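
# ---------------------------------------------------------------------------
# Reference note (assumption, not part of this app's runtime): the app expects
# model_handler_ollama to expose
#     get_inference_configs() -> dict mapping a config name (e.g. "Optimized
#         for Speed") to its generation settings, and
#     generate_response(system_prompt, user_input, config_name) -> str.
# A minimal sketch of such a module, assuming the `ollama` Python client and a
# hypothetical local model tag "loggenix-moe", could look like:
#
#     import ollama
#
#     def get_inference_configs():
#         return {
#             "Optimized for Speed": {"num_predict": 512},
#             "Balanced": {"num_predict": 2048},
#         }
#
#     def generate_response(system_prompt, user_input, config_name):
#         options = get_inference_configs().get(config_name, {"num_predict": 512})
#         result = ollama.chat(
#             model="loggenix-moe",
#             messages=[
#                 {"role": "system", "content": system_prompt},
#                 {"role": "user", "content": user_input},
#             ],
#             options=options,
#         )
#         return result["message"]["content"]
# ---------------------------------------------------------------------------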