import datetime
import json
import os
import random

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import torch
from datasets import load_dataset

from model_handler import generate_response, get_inference_configs
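
# NOTE (assumption): this module only relies on two entry points from model_handler:
#   get_inference_configs() -> dict of named configs, e.g. {"Optimized for Speed": {...}, ...}
#   generate_response(system_prompt, user_input, config_name) -> str
# The generation settings inside each config live in model_handler and are not assumed here.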

DATASET_CONFIGS = {
    'Loggenix Synthetic AI Tasks Eval (with outputs)': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval-with-outputs',
        'split': 'train'
    },
    'Loggenix Synthetic AI Tasks Eval (with outputs) v5': {
        'repo_id': 'kshitijthakkar/loggenix-synthetic-ai-tasks-eval_v5-with-outputs',
        'split': 'train'
    }
}
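# To surface another evaluation set in the UI, add an entry above in the same shape,
# e.g. (hypothetical repo id): 'My Eval Set': {'repo_id': 'user/my-eval-set', 'split': 'train'}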


def load_inference_dataset():
    """Load the main dataset for the inference use case"""
    try:
        print("Loading synthetic-ai-tasks-eval-v5 dataset...")
        dataset = load_dataset(
            'kshitijthakkar/synthetic-ai-tasks-eval-v5',
            split='train',
            trust_remote_code=True
        )
        df = dataset.to_pandas()
        print(f"✅ Successfully loaded: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"❌ Error loading dataset: {str(e)}")
        return pd.DataFrame({'Error': [f'Failed to load: {str(e)}']})


def load_eval_datasets():
    """Load all datasets for evaluation samples"""
    datasets = {}
    for display_name, config in DATASET_CONFIGS.items():
        try:
            print(f"Loading {display_name}...")
            dataset = load_dataset(
                config['repo_id'],
                split=config['split'],
                trust_remote_code=True
            )
            df = dataset.to_pandas()
            datasets[display_name] = df
            print(f"✅ Successfully loaded {display_name}: {len(df)} rows")
        except Exception as e:
            print(f"❌ Error loading {display_name}: {str(e)}")
            datasets[display_name] = pd.DataFrame({
                'Error': [f'Failed to load: {str(e)}'],
                'Dataset': [config['repo_id']]
            })
    return datasets


# Load data once at startup so the UI opens with everything ready.
INFERENCE_DATASET = load_inference_dataset()
EVAL_DATASETS = load_eval_datasets()


def get_task_types():
    """Get unique task types from inference dataset"""
    if 'task_type' in INFERENCE_DATASET.columns:
        task_types = INFERENCE_DATASET['task_type'].unique().tolist()
        return [str(t) for t in task_types if pd.notna(t)]
    return ["No task types available"]


def get_task_by_type(task_type):
    """Get task content by task type"""
    if 'task_type' in INFERENCE_DATASET.columns and 'task' in INFERENCE_DATASET.columns:
        filtered = INFERENCE_DATASET[INFERENCE_DATASET['task_type'] == task_type]
        if len(filtered) > 0:
            return str(filtered.iloc[0]['task'])
    return "No task found for this type"


def run_inference(task_type, system_prompt, user_input, inference_config):
    """Run model inference"""
    if not user_input.strip():
        return "Please enter a user input"

    if not system_prompt.strip():
        return "Please select a task type to load system prompt"

    try:
        # Fall back to the default configuration if an unknown name comes in.
        if inference_config not in get_inference_configs():
            inference_config = "Optimized for Speed"

        response = generate_response(
            system_prompt=system_prompt,
            user_input=user_input,
            config_name=inference_config
        )
        return response
    except Exception as e:
        return f"Error during inference: {str(e)}"


def update_eval_table(dataset_name):
    """Update eval table based on selected dataset"""
    if dataset_name in EVAL_DATASETS:
        return EVAL_DATASETS[dataset_name].head(100)
    return pd.DataFrame()


def get_eval_dataset_info(dataset_name):
    """Get info about selected eval dataset"""
    if dataset_name in EVAL_DATASETS:
        df = EVAL_DATASETS[dataset_name]
        return f"""
**Dataset**: {dataset_name}
- **Rows**: {len(df):,}
- **Columns**: {len(df.columns)}
- **Column Names**: {', '.join(df.columns.tolist())}
"""
    return "No dataset selected"


def generate_chart():
    """Generate a sample Plotly chart"""
    x = list(range(10))
    y = [random.randint(1, 100) for _ in x]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y, mode="lines+markers", name="Random Data"))
    fig.update_layout(title="Sample Chart", xaxis_title="X-axis", yaxis_title="Y-axis")
    return fig.to_html(full_html=False)


def chat_interface(prompt, history):
    """Handle chat interface with history.

    History is kept as a list of (speaker, message) tuples, where speaker is
    either "You" or "AI Assistant"; the flagging helpers below rely on this shape.
    """
    if not prompt.strip():
        return history, ""

    history.append(("You", prompt))

    try:
        if "chart" in prompt.lower() or "graph" in prompt.lower():
            response = generate_chart()
        else:
            response = f"This is a demo response to: {prompt}"

        if isinstance(response, str):
            formatted_response = f"**AI Assistant:**\n{response}"
            history.append(("AI Assistant", formatted_response))
        else:
            history.append(("AI Assistant", response))
    except Exception as e:
        error_msg = f"**AI Assistant:**\nSorry, an error occurred: {str(e)}"
        history.append(("AI Assistant", error_msg))

    return history, ""


def flag_response(history, flagged_message, flag_reason):
    """Flag a response"""
    if not flagged_message or flagged_message == "No responses available":
        return "Invalid message selection."

    try:
        # Dropdown labels look like "Response 3: ..."; recover the history index from the second token.
        flagged_index = int(flagged_message.split()[1][:-1])
        if flagged_index >= len(history) or history[flagged_index][0] != "AI Assistant":
            return "You can only flag assistant responses."

        flagged_message_content = history[flagged_index][1]

        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "flag_reason": str(flag_reason),
            "flagged_message": str(flagged_message_content),
            "conversation_context": history,
        }

        os.makedirs("logs", exist_ok=True)
        with open("logs/flagged_responses.log", "a") as f:
            f.write(json.dumps(log_entry) + "\n")

        return "Response flagged successfully"
    except Exception as e:
        return f"Error flagging response: {str(e)}"


def get_assistant_responses(history):
    """Get dropdown options for assistant responses"""
    responses = [
        f"Response {i}: {str(msg[1])[:50]}..."
        for i, msg in enumerate(history)
        if msg[0] == "AI Assistant"
    ]

    if not responses:
        responses = ["No responses available"]

    return gr.update(choices=responses, value=responses[0])


def display_selected_message(selected_index, history):
    """Display the selected flagged message"""
    if selected_index == "No responses available":
        return "No responses available"

    try:
        flagged_index = int(selected_index.split()[1][:-1])
        if flagged_index < len(history) and history[flagged_index][0] == "AI Assistant":
            return history[flagged_index][1]
        else:
            return "Invalid selection."
    except Exception as e:
        return f"Error: {str(e)}"


def read_flagged_messages():
    """Read flagged messages from log file"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return pd.DataFrame()

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages = f.readlines()

        if not flagged_messages:
            return pd.DataFrame()

        table_data = []
        for entry in flagged_messages:
            # Skip any stray blank lines in the log file.
            if not entry.strip():
                continue
            data = json.loads(entry)
            table_data.append({
                "Timestamp": data.get("timestamp", "N/A"),
                "Flag Reason": data.get("flag_reason", "N/A"),
                "Flagged Message": data.get("flagged_message", "N/A")[:100] + "...",
                "Conversation Context": str(len(data.get("conversation_context", []))) + " messages"
            })
        return pd.DataFrame(table_data)
    except Exception as e:
        return pd.DataFrame({"Error": [f"Error reading flagged messages: {str(e)}"]})


def handle_row_select(evt: gr.SelectData):
    """Handle row selection in flagged messages table"""
    try:
        if not os.path.exists("logs/flagged_responses.log"):
            return []

        with open("logs/flagged_responses.log", "r") as f:
            flagged_messages_log = f.readlines()

        if evt.index[0] < len(flagged_messages_log):
            selected_entry = json.loads(flagged_messages_log[evt.index[0]])
            conversation_context = selected_entry.get("conversation_context", [])
            return conversation_context
        return []
    except Exception as e:
        return [("System", f"Error loading conversation: {str(e)}")]


def clear_history():
    """Clear the chat display and history state, and reset the flag dropdown"""
    return [], [], gr.update(choices=["No responses available"], value="No responses available")


def create_interface():
    with gr.Blocks(title="AI Tasks Evaluation Suite", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 AI Tasks Evaluation Suite")
        gr.Markdown("Comprehensive platform for AI model evaluation and testing")

        with gr.Tabs():

            with gr.Tab("🚀 Inference Use Case"):
                gr.Markdown("## Model Inference Testing")

                with gr.Row():
                    with gr.Column(scale=1):
                        task_types = get_task_types()
                        task_type_dropdown = gr.Dropdown(
                            choices=task_types,
                            value=task_types[0] if task_types else None,
                            label="Task Type",
                            info="Select task type to load system prompt"
                        )

                        inference_config = gr.Dropdown(
                            choices=list(get_inference_configs().keys()),
                            value="Optimized for Speed",
                            label="Inference Configuration",
                            info="Select inference optimization level"
                        )

                    with gr.Column(scale=2):
                        system_prompt = gr.Textbox(
                            label="System Prompt (Editable)",
                            lines=6,
                            max_lines=10,
                            placeholder="Select a task type to load system prompt...",
                            interactive=True
                        )

                with gr.Row():
                    with gr.Column():
                        user_input = gr.Textbox(
                            label="User Input",
                            lines=4,
                            placeholder="Enter your input here...",
                            interactive=True
                        )

                    with gr.Column():
                        model_response = gr.Textbox(
                            label="Model Response",
                            lines=8,
                            interactive=False
                        )

                with gr.Row():
                    submit_btn = gr.Button("🔥 Run Inference", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

                task_type_dropdown.change(
                    fn=get_task_by_type,
                    inputs=[task_type_dropdown],
                    outputs=[system_prompt]
                )

                submit_btn.click(
                    fn=run_inference,
                    inputs=[task_type_dropdown, system_prompt, user_input, inference_config],
                    outputs=[model_response]
                )

                clear_btn.click(
                    fn=lambda: ("", "", ""),
                    outputs=[system_prompt, user_input, model_response]
                )

            with gr.Tab("📊 Eval Samples"):
                gr.Markdown("## Dataset Evaluation Samples")

                with gr.Row():
                    with gr.Column(scale=1):
                        eval_dataset_dropdown = gr.Dropdown(
                            choices=list(EVAL_DATASETS.keys()),
                            value=list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else None,
                            label="Select Dataset",
                            info="Choose evaluation dataset to view"
                        )

                        eval_dataset_info = gr.Markdown(
                            get_eval_dataset_info(list(EVAL_DATASETS.keys())[0] if EVAL_DATASETS else "")
                        )

                with gr.Row():
                    eval_table = gr.Dataframe(
                        value=update_eval_table(list(EVAL_DATASETS.keys())[0]) if EVAL_DATASETS else pd.DataFrame(),
                        label="Dataset Table",
                        max_height=800,
                        min_width=800,
                        interactive=False,
                        wrap=True,
                        show_fullscreen_button=True,
                        show_copy_button=True,
                        show_row_numbers=True,
                        show_search="filter",
                    )

                eval_dataset_dropdown.change(
                    fn=lambda x: (update_eval_table(x), get_eval_dataset_info(x)),
                    inputs=[eval_dataset_dropdown],
                    outputs=[eval_table, eval_dataset_info]
                )

            with gr.Tab("🚩 Flag Responses"):
                gr.Markdown("## Chat Interface with Response Flagging")

                with gr.Row():
                    with gr.Column():
                        chat_input = gr.Textbox(placeholder="Ask something...", label="Your Message")

                        with gr.Row():
                            chat_submit_btn = gr.Button("Send", variant="primary")
                            chat_clear_btn = gr.Button("Clear History", variant="secondary")

                    with gr.Column():
                        chat_display = gr.Chatbot(label="Chat History", height=400)
                        chat_history_state = gr.State([])

                gr.Markdown("### Flag Response")
                with gr.Row():
                    with gr.Column():
                        flagged_message_index = gr.Dropdown(
                            label="Select a response to flag",
                            choices=["No responses available"],
                            value="No responses available",
                            interactive=True
                        )

                        selected_message_display = gr.Textbox(
                            label="Selected Response",
                            interactive=False,
                            lines=4
                        )

                    with gr.Column():
                        flag_reason = gr.Textbox(
                            placeholder="Enter reason for flagging...",
                            label="Reason for Flagging"
                        )

                        flag_btn = gr.Button("Flag Response", variant="stop")
                        flag_output = gr.Textbox(label="Flagging Feedback", visible=True)

                chat_submit_btn.click(
                    chat_interface,
                    inputs=[chat_input, chat_history_state],
                    outputs=[chat_display, chat_input]
                ).then(
                    get_assistant_responses,
                    inputs=[chat_history_state],
                    outputs=[flagged_message_index]
                )

                # clear_history also resets the history state so old responses are not resurrected.
                chat_clear_btn.click(
                    clear_history,
                    outputs=[chat_display, chat_history_state, flagged_message_index]
                )

                flagged_message_index.change(
                    display_selected_message,
                    inputs=[flagged_message_index, chat_history_state],
                    outputs=[selected_message_display]
                )

                flag_btn.click(
                    flag_response,
                    inputs=[chat_history_state, flagged_message_index, flag_reason],
                    outputs=[flag_output]
                )

            with gr.Tab("📋 View Flagged Responses"):
                gr.Markdown("## Review Flagged Responses")

                with gr.Row():
                    with gr.Column():
                        flagged_messages_display = gr.Dataframe(
                            headers=["Timestamp", "Flag Reason", "Flagged Message", "Conversation Context"],
                            interactive=False,
                            max_height=400
                        )
                        refresh_btn = gr.Button("🔄 Refresh", variant="primary")

                    with gr.Column():
                        conversation_context_display = gr.Chatbot(
                            label="Conversation Context",
                            height=400
                        )

                flagged_messages_display.select(
                    handle_row_select,
                    outputs=[conversation_context_display]
                )

                refresh_btn.click(
                    read_flagged_messages,
                    outputs=[flagged_messages_display]
                )

            with gr.Tab("📈 Model Eval Results"):
                gr.Markdown("## Model Evaluation Results")
                gr.Markdown("### 🚧 Coming Soon")
                gr.Markdown(
                    "This section will display comprehensive model evaluation metrics, charts, and performance analysis.")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### Evaluation Metrics")
                        gr.Markdown("- Accuracy scores")
                        gr.Markdown("- Performance benchmarks")
                        gr.Markdown("- Comparative analysis")

                    with gr.Column():
                        gr.Markdown("#### Visualization")
                        gr.Markdown("- Performance charts")
                        gr.Markdown("- Score distributions")
                        gr.Markdown("- Trend analysis")

            with gr.Tab("ℹ️ About"):
                gr.Markdown("## About Loggenix MOE Model")

                gr.Markdown("""
                ### Model: `kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool`

                This is a fine-tuned Mixture of Experts (MOE) model designed for specialized AI tasks with tool calling capabilities.

                #### Key Features:
                - **Architecture**: MOE with 0.3B total parameters, 0.1B active parameters
                - **Training**: Fine-tuned with learning rate 7e-5, batch size 16
                - **Hardware**: Optimized for RTX 4090 GPU
                - **Capabilities**: Tool calling, instruction following, task-specific responses

                #### Model Specifications:
                - **Total Parameters**: 0.3B
                - **Active Parameters**: 0.1B
                - **Context Length**: 4096 tokens
                - **Precision**: FP16 for optimal performance
                - **Flash Attention**: Supported for faster inference

                #### Sample Inference Code:
                ```python
                from transformers import AutoModelForCausalLM, AutoTokenizer
                import torch

                # Load model and tokenizer
                model_id = "kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.2-finetuned-tool"
                tokenizer = AutoTokenizer.from_pretrained(model_id)
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    attn_implementation="flash_attention_2"
                ).eval()

                # Prepare messages
                messages = [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": "Calculate 25 + 37"}
                ]

                # Format and generate
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=tokenizer.pad_token_id
                    )

                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(response)
                ```

                #### Tool Calling Support:
                The model supports structured tool calling for mathematical operations, data analysis, and other specialized tasks.
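
                An illustrative, unverified sketch of how a tool-calling request could be assembled with the `transformers` chat template; the tool definition below is a generic JSON-schema example, not the model's documented tool format:

                ```python
                # Generic function-tool definition (example only; adjust to the schema the model expects)
                tools = [{
                    "type": "function",
                    "function": {
                        "name": "add",
                        "description": "Add two numbers",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "a": {"type": "number"},
                                "b": {"type": "number"}
                            },
                            "required": ["a", "b"]
                        }
                    }
                }]

                # Recent transformers releases accept a `tools` argument in apply_chat_template
                prompt = tokenizer.apply_chat_template(
                    [{"role": "user", "content": "Calculate 25 + 37"}],
                    tools=tools,
                    add_generation_prompt=True,
                    tokenize=False
                )
                ```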

                #### Performance Optimizations:
                - **Speed Mode**: Max 512 new tokens for fast responses
                - **Balanced Mode**: Max 2048 new tokens for comprehensive answers
                - **Full Capacity**: Dynamic token allocation up to context limit

                ---

                **Developed by**: Kshitij Thakkar
                **Version**: v6.2
                **License**: Please check model repository for licensing details
                """)

        demo.load(
            fn=read_flagged_messages,
            outputs=[flagged_messages_display]
        )

    return demo


if __name__ == "__main__":
    print("Starting AI Tasks Evaluation Suite...")
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )