import gc
import logging
import os
import re
from collections import Counter
from typing import Any, Dict, List

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def get_memory_usage():
    """Return (gpu_mem_used_MB, gpu_mem_total_MB, ram_used_MB, ram_total_MB)."""
    # System RAM
    vm = psutil.virtual_memory()
    ram_used_mb = vm.used / (1024**2)
    ram_total_mb = vm.total / (1024**2)
    # GPU memory
    if torch.cuda.is_available():
        gpu_idx = torch.cuda.current_device()
        torch.cuda.synchronize()
        gpu_mem_alloc = torch.cuda.memory_allocated(gpu_idx) / (1024**2)
        gpu_mem_reserved = torch.cuda.memory_reserved(gpu_idx) / (1024**2)
        gpu_mem_total = torch.cuda.get_device_properties(gpu_idx).total_memory / (
            1024**2
        )
        gpu_mem_used = max(gpu_mem_alloc, gpu_mem_reserved)  # safe estimate
    else:
        gpu_mem_used = 0
        gpu_mem_total = 0
    return gpu_mem_used, gpu_mem_total, ram_used_mb, ram_total_mb
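# get_memory_usage usage sketch (hypothetical values): the tuple is consumed below to decide
# whether caching another model is safe, e.g.
#   gpu_used, gpu_total, ram_used, ram_total = get_memory_usage()
#   if ram_used / ram_total > 0.8:
#       ...  # treat memory as tight and skip caching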


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configurations - HF model paths selectable in the UI
PREDEFINED_MODELS = [
    "meta-llama/Llama-3.2-1B",
    "google/gemma-2-2b",
    "Qwen/Qwen3-0.6B",
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "bigscience/bloom-560m",
    "CohereForAI/aya-expanse-8b",
    "common-pile/comma-v0.1-2t",
    "google/byt5-small",
    "gsaltintas/supertoken_models-llama_gpt2",
    "gsaltintas/supertoken_models-llama_google-gemma-2-2b",
]

# Global cache for loaded models
model_cache = {}


def parse_dataset(text):
    """Parse the input dataset text into structured questions."""
    if not text.strip():
        return [], "Please enter your dataset"

    lines = text.strip().split("\n")
    if len(lines) < 2:
        return [], "Dataset must have at least a header and one question"

    # Skip header and detect delimiter
    first_data_line = lines[1] if len(lines) > 1 else lines[0]
    delimiter = "\t" if "\t" in first_data_line else ","

    questions = []
    errors = []
    for i, line in enumerate(lines[1:], 2):  # Start from line 2 (after header)
        line = line.strip()
        if not line:
            continue
        parts = [part.strip().strip('"') for part in line.split(delimiter)]
        if len(parts) < 5:
            errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
            continue
        question = {
            "question": parts[0],
            "correct_answer": parts[1],
            "choices": [parts[2], parts[3], parts[4]],
        }
        # Ensure correct answer is in choices
        if question["correct_answer"] not in question["choices"]:
            question["choices"].append(question["correct_answer"])
        questions.append(question)

    error_msg = "\n".join(errors) if errors else ""
    return questions, error_msg
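# parse_dataset input/output sketch (hypothetical data):
#   questions, err = parse_dataset(
#       "Question,Correct Answer,Choice1,Choice2,Choice3\nWhat is 2+2?,4,3,2,5"
#   )
#   # questions == [{"question": "What is 2+2?", "correct_answer": "4",
#   #                "choices": ["3", "2", "5", "4"]}]   (correct answer appended)
#   # err == ""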


def setup_tokenizer(model_path):
    tokenizer_name = model_path
    if "supertoken" in model_path:
        import json

        from huggingface_hub import hf_hub_download, list_repo_files

        files = list_repo_files(model_path)
        if "tokenizer_config.json" in files:
            tokenizer_path = hf_hub_download(
                repo_id=model_path, filename="tokenizer_config.json"
            )
            with open(tokenizer_path) as f:
                tok_config = json.load(f)["data"]["tokenizer"]
            if tok_config["name"] == "huggingface":
                tokenizer_name = tok_config["path"]
    # todo: tiktoken
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=True, legacy=True
    )
    return tokenizer
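# Note on setup_tokenizer: for "supertoken" checkpoints the repo's tokenizer_config.json is
# assumed to hold a {"data": {"tokenizer": {"name": ..., "path": ...}}} record pointing at the
# actual HF tokenizer to load; every other model simply loads the tokenizer from its own path.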


def load_model_and_tokenizer(model_path, progress_callback=None):
    """Load model and tokenizer with caching."""
    global model_cache

    # Decide caching strategy based on memory usage
    gpu_used, gpu_total, ram_used, ram_total = get_memory_usage()
    logger.info(f"Current GPU memory: {gpu_used:.1f}/{gpu_total:.1f} MB")
    logger.info(f"Current RAM: {ram_used:.1f}/{ram_total:.1f} MB")
    use_cache = (
        not (
            (gpu_total > 0 and gpu_used / gpu_total > 0.8)
            or (ram_used / ram_total > 0.8)
        )
        or model_path in model_cache
    )
    if not use_cache:
        logger.warning("High memory usage detected — disabling model cache.")

    if use_cache and model_path in model_cache:
        logger.info(f"Using cached model: {model_path}")
        if progress_callback:
            progress_callback(1.0, f"✅ Using cached model: {model_path}")
        return model_cache[model_path]

    try:
        if progress_callback:
            progress_callback(0.1, f"🔄 Starting to load model: {model_path}")

        # Check if CUDA is available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Loading model: {model_path} using device: {device}")

        if progress_callback:
            progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")

        # Load tokenizer
        tokenizer = setup_tokenizer(model_path)

        # Add pad token if missing
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        if progress_callback:
            progress_callback(
                0.5,
                f"🧠 Loading model weights for {model_path}... (this may take a while)",
            )
        logger.info(os.getcwd())

        # Load model with appropriate settings
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        model_info = {"tokenizer": tokenizer, "model": model, "device": device}
        if use_cache:
            model_cache[model_path] = model_info

        if progress_callback:
            progress_callback(1.0, f"✅ Successfully loaded model: {model_path}")
        return model_info

    except Exception as e:
        error_msg = f"❌ Error loading model {model_path}: {str(e)}"
        logger.error(error_msg)
        if progress_callback:
            progress_callback(0.0, error_msg)
        return None


def calculate_choice_likelihood(model, tokenizer, question, choice):
    """Calculate the log-likelihood of the choice given the question prompt."""
    try:
        # The raw question text is used as the prompt; the choice is scored as its continuation.
        prompt = question
        full_text = f"{prompt} {choice}"

        # Tokenize full input (prompt + answer)
        input_ids = tokenizer.encode(
            full_text, return_tensors="pt", add_special_tokens=False
        ).to(model.device)
        prompt_ids = tokenizer.encode(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(model.device)

        if input_ids.size(1) <= prompt_ids.size(1):
            logger.warning("Answer tokens are empty after tokenization.")
            return float("-inf")

        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs.logits

        # Get logits for the answer tokens only
        answer_len = input_ids.size(1) - prompt_ids.size(1)
        target_ids = input_ids[:, -answer_len:]
        logits = logits[
            :, prompt_ids.size(1) - 1 : -1, :
        ]  # shifted for next-token prediction

        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
        total_log_prob = token_log_probs.sum().item()
        return total_log_prob

    except Exception as e:
        logger.error(f"Error calculating likelihood for choice '{choice}': {str(e)}")
        return float("-inf")
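# Scoring note for calculate_choice_likelihood: with answer tokens a_1..a_k appended to the
# prompt, the returned score is sum_i log P(a_i | prompt, a_1..a_{i-1}), i.e. the total
# log-likelihood of the choice as a continuation. Longer choices therefore tend to score lower
# (more negative); no length normalization is applied here.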


def evaluate_model_on_questions(model_path, questions, progress_callback=None):
    """Evaluate a single model on all questions using likelihood-based scoring."""
    model_info = load_model_and_tokenizer(
        model_path, progress_callback=progress_callback
    )
    if model_info is None:
        return [{"error": f"Failed to load model {model_path}"}] * len(questions)

    results = []
    model = model_info["model"]
    tokenizer = model_info["tokenizer"]

    for i, question in enumerate(questions):
        try:
            # Calculate likelihood for each choice
            choice_likelihoods = {}
            choice_probs = {}
            for choice in question["choices"]:
                likelihood = calculate_choice_likelihood(
                    model, tokenizer, question["question"], choice
                )
                choice_likelihoods[choice] = likelihood

            # Convert log probabilities to probabilities for confidence scoring
            max_log_prob = max(choice_likelihoods.values())
            choice_probs = {
                choice: torch.exp(torch.tensor(log_prob - max_log_prob)).item()
                for choice, log_prob in choice_likelihoods.items()
            }

            # Normalize probabilities
            total_prob = sum(choice_probs.values())
            if total_prob > 0:
                choice_probs = {
                    choice: prob / total_prob for choice, prob in choice_probs.items()
                }

            # Select the choice with the highest likelihood
            predicted_choice = max(
                choice_likelihoods.keys(), key=lambda x: choice_likelihoods[x]
            )
            is_correct = predicted_choice == question["correct_answer"]

            # Confidence is the probability of the selected choice
            confidence = choice_probs.get(predicted_choice, 0.0)

            results.append(
                {
                    "question_idx": i,
                    "predicted": predicted_choice,
                    "correct": is_correct,
                    "confidence": confidence,
                    "choice_likelihoods": choice_likelihoods,
                    "choice_probabilities": choice_probs,
                    "raw_response": f"Likelihoods: {choice_likelihoods}",
                }
            )

            if progress_callback:
                # Use the remaining 80% for evaluation progress
                evaluation_progress = 0.2 + (i + 1) / len(questions) * 0.8
                progress_callback(
                    evaluation_progress,
                    f"🔍 Evaluating {model_path}: {i + 1}/{len(questions)} questions (likelihood-based)",
                )

        except Exception as e:
            logger.error(f"Error evaluating question {i} with {model_path}: {str(e)}")
            results.append(
                {
                    "question_idx": i,
                    "predicted": question["choices"][0] if question["choices"] else "",
                    "correct": False,
                    "confidence": 0.0,
                    "choice_likelihoods": {},
                    "choice_probabilities": {},
                    "raw_response": f"Error: {str(e)}",
                }
            )

    return results
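# Each entry returned by evaluate_model_on_questions looks like (values hypothetical):
#   {"question_idx": 0, "predicted": "Paris", "correct": True, "confidence": 0.91,
#    "choice_likelihoods": {...}, "choice_probabilities": {...},
#    "raw_response": "Likelihoods: {...}"}
# Questions that raise an exception fall back to the first choice with confidence 0.0 and the
# error text in "raw_response".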


def run_evaluation(
    dataset_text, selected_predefined, custom_models_text="", progress=gr.Progress()
):
    """Main evaluation function."""
    if not dataset_text.strip():
        return (
            "Please enter your dataset",
            "<p>No data provided</p>",
            None,
            None,
            gr.update(visible=True),
            "",  # markdown_summary
            "",  # csv_summary
        )

    # Parse custom models
    custom_models = []
    if custom_models_text is None:
        custom_models_text = ""
    if custom_models_text.strip():
        custom_models = [
            model.strip()
            for model in custom_models_text.strip().split("\n")
            if model.strip()
        ]

    # Combine selected models
    all_models = []
    # Add predefined models
    all_models.extend(selected_predefined)
    all_models.extend(custom_models)

    if not all_models:
        return (
            "Please select at least one model or add custom models",
            "<p>No models selected</p>",
            None,
            None,
            gr.update(visible=False),
            "",
            "",
        )

    # Parse dataset
    questions, parse_error = parse_dataset(dataset_text)
    if parse_error:
        return (
            f"Dataset parsing error:\n{parse_error}",
            "<p>Failed to parse dataset</p>",
            None,
            None,
            gr.update(visible=True),
            "",
            "",
        )
    if not questions:
        return (
            "No valid questions found in dataset",
            "<p>No questions to evaluate</p>",
            None,
            None,
            gr.update(visible=True),
            "",
            "",
        )

    # Run evaluation
    progress(0, "Starting evaluation...")
    results = {}

    for model_idx, model_path in enumerate(all_models):
        display_name = model_path.split("/")[-1] if "/" in model_path else model_path
        try:

            def model_progress(p, msg, model_idx=model_idx):
                # Scale this model's progress into its slice of the overall bar
                overall_progress = (model_idx + p) / len(all_models)
                progress(overall_progress, msg)

            model_results = evaluate_model_on_questions(
                model_path, questions, model_progress
            )
            results[display_name] = model_results
        except Exception as e:
            logger.error(f"Failed to evaluate {display_name}: {str(e)}")
            results[display_name] = [{"error": str(e)}] * len(questions)

        # Clean up GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # Generate outputs
    summary_stats = generate_summary_stats(questions, results)
    summary_md = create_summary_markdown(summary_stats)
    detailed_html = create_detailed_results_html(questions, results)
    accuracy_chart = create_accuracy_chart(summary_stats)
    confidence_chart = create_confidence_chart(results)

    # Generate compact summaries
    markdown_summary = generate_compact_summary_markdown(
        questions, results, summary_stats
    )
    csv_summary = generate_csv_summary(questions, results, summary_stats)

    return (
        summary_md,
        detailed_html,
        accuracy_chart,
        confidence_chart,
        gr.update(visible=True),
        markdown_summary,
        csv_summary,
    )


def generate_summary_stats(questions, results):
    """Generate summary statistics for all models."""
    summary = {}
    for model, model_results in results.items():
        if not model_results or "error" in model_results[0]:
            summary[model] = {
                "accuracy": 0.0,
                "correct": 0,
                "total": len(questions),
                "avg_confidence": 0.0,
                "error": model_results[0].get("error", "Unknown error")
                if model_results
                else "No results",
            }
            continue

        correct_count = sum(1 for r in model_results if r.get("correct", False))
        total_count = len(model_results)
        accuracy = correct_count / total_count if total_count > 0 else 0

        # Calculate average confidence
        avg_confidence = (
            sum(r.get("confidence", 0) for r in model_results) / total_count
            if total_count > 0
            else 0
        )

        summary[model] = {
            "accuracy": accuracy,
            "correct": correct_count,
            "total": total_count,
            "avg_confidence": avg_confidence,
        }
    return summary
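# generate_summary_stats returns a dict keyed by model display name, e.g. (hypothetical values):
#   {"Llama-3.2-1B": {"accuracy": 0.75, "correct": 3, "total": 4, "avg_confidence": 0.62}}
# Models that failed to load carry an extra "error" key, which the renderers below report
# instead of an accuracy row.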


def create_summary_markdown(summary_stats):
    """Create a markdown summary of results."""
    if not summary_stats:
        return "No results available"

    # Sort by accuracy
    sorted_models = sorted(
        summary_stats.items(), key=lambda x: x[1]["accuracy"], reverse=True
    )

    lines = ["## 🏆 Model Performance Summary\n"]
    for i, (model, stats) in enumerate(sorted_models):
        if "error" in stats:
            lines.append(f"❌ **{model}**: Error - {stats['error']}")
            continue
        accuracy_pct = stats["accuracy"] * 100
        medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f"{i + 1}."
        lines.append(
            f"{medal} **{model}**: {accuracy_pct:.1f}% "
            f"({stats['correct']}/{stats['total']} correct, "
            f"avg confidence: {stats['avg_confidence']:.2f})"
        )
    return "\n".join(lines)


def create_detailed_results_html(questions, results):
    """Create detailed HTML results for each question."""
    if not questions or not results:
        return "<p>No detailed results available</p>"

    html_parts = [
        """
    <style>
    .question-card {
        background: white;
        border-radius: 12px;
        padding: 20px;
        margin-bottom: 20px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-left: 5px solid #667eea;
    }
    .question-header {
        display: flex;
        justify-content: space-between;
        align-items: center;
        margin-bottom: 15px;
    }
    .question-number {
        background: linear-gradient(135deg, #667eea, #764ba2);
        color: white;
        padding: 6px 12px;
        border-radius: 20px;
        font-weight: bold;
        font-size: 14px;
    }
    .question-text {
        font-weight: 600;
        font-size: 16px;
        margin: 15px 0;
        color: #2d3748;
    }
    .choices {
        background: #f8fafc;
        border-radius: 8px;
        padding: 15px;
        margin: 10px 0;
    }
    .choice {
        margin: 8px 0;
        color: #4a5568;
    }
    .correct-answer {
        background: linear-gradient(135deg, #c6f6d5, #9ae6b4);
        border-left: 4px solid #48bb78;
        border-radius: 6px;
        padding: 12px;
        margin: 10px 0;
        font-weight: 600;
        color: #22543d;
    }
    .model-results {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
        gap: 12px;
        margin-top: 15px;
    }
    .model-result {
        padding: 12px;
        border-radius: 8px;
        text-align: center;
        font-weight: 600;
        transition: transform 0.2s ease;
    }
    .model-result:hover {
        transform: scale(1.02);
    }
    .result-correct {
        background: linear-gradient(135deg, #c6f6d5, #9ae6b4);
        color: #22543d;
        border: 2px solid #48bb78;
    }
    .result-incorrect {
        background: linear-gradient(135deg, #fed7d7, #fca5a5);
        color: #742a2a;
        border: 2px solid #e53e3e;
    }
    .result-error {
        background: linear-gradient(135deg, #fbb6ce, #f687b3);
        color: #744210;
        border: 2px solid #d69e2e;
    }
    .raw-response {
        font-size: 10px;
        margin-top: 4px;
        opacity: 0.7;
        font-family: monospace;
    }
    </style>
    """
    ]

    for q_idx, question in enumerate(questions):
        html_parts.append(f"""
        <div class="question-card">
            <div class="question-header">
                <span class="question-number">Q{q_idx + 1}</span>
            </div>
            <div class="question-text">{question["question"]}</div>
            <div class="choices">
                <strong>Choices:</strong><br>
                {" | ".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(question["choices"]))}
            </div>
            <div class="correct-answer">
                <strong>✓ Correct Answer:</strong> {question["correct_answer"]}
            </div>
            <div class="model-results">
        """)

        # Add results for each model
        for model, model_results in results.items():
            if q_idx < len(model_results):
                result = model_results[q_idx]
                if "error" in result:
                    html_parts.append(f"""
                    <div class="model-result result-error">
                        <div>⚠️ {model}</div>
                        <div style="font-size: 12px; margin-top: 4px;">
                            Error occurred
                        </div>
                        <div class="raw-response">{result.get("raw_response", "Unknown error")}</div>
                    </div>
                    """)
                else:
                    result_class = (
                        "result-correct"
                        if result.get("correct", False)
                        else "result-incorrect"
                    )
                    icon = "✅" if result.get("correct", False) else "❌"
                    html_parts.append(f"""
                    <div class="model-result {result_class}">
                        <div>{icon} {model}</div>
                        <div style="font-size: 12px; margin-top: 4px;">
                            "{result.get("predicted", "No prediction")}"
                        </div>
                        <div class="raw-response">Raw: "{result.get("raw_response", "")}"</div>
                    </div>
                    """)

        html_parts.append("""
            </div>
        </div>
        """)

    return "".join(html_parts)


def create_accuracy_chart(summary_stats):
    """Create an accuracy comparison chart."""
    if not summary_stats:
        return None

    models = []
    accuracies = []
    for model, stats in summary_stats.items():
        if "error" not in stats:
            models.append(model)
            accuracies.append(stats["accuracy"] * 100)

    if not models:
        return None

    fig = go.Figure(
        data=[
            go.Bar(
                x=models,
                y=accuracies,
                marker_color="lightblue",
                text=[f"{acc:.1f}%" for acc in accuracies],
                textposition="auto",
            )
        ]
    )
    fig.update_layout(
        title="Model Accuracy Comparison",
        xaxis_title="Models",
        yaxis_title="Accuracy (%)",
        template="plotly_white",
        showlegend=False,
    )
    return fig


def create_confidence_chart(results):
    """Create a confidence distribution chart."""
    if not results:
        return None

    data = []
    for model, model_results in results.items():
        for result in model_results:
            if "error" not in result and "confidence" in result:
                data.append(
                    {
                        "Model": model,
                        "Confidence": result["confidence"],
                        "Correct": "Correct"
                        if result.get("correct", False)
                        else "Incorrect",
                    }
                )

    if not data:
        return None

    df = pd.DataFrame(data)
    fig = px.box(
        df,
        x="Model",
        y="Confidence",
        color="Correct",
        title="Confidence Distribution by Model and Correctness",
        template="plotly_white",
    )
    return fig


def generate_compact_summary_markdown(questions, results, summary_stats):
    """Generate a compact markdown summary table for copy-pasting."""
    logger.info("Generating compact markdown summary")
    if not summary_stats or not questions or not results:
        return "No data available for summary"

    lines = ["# Model Performance Summary\n"]

    # Accuracy summary table
    lines.append("## 📊 Accuracy Summary\n")
    lines.append("| Rank | Model | Accuracy | Correct | Total | Avg Confidence |")
    lines.append("|------|-------|----------|---------|-------|----------------|")

    # Sort by accuracy
    sorted_models = sorted(
        summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True
    )
    for i, (model, stats) in enumerate(sorted_models):
        if "error" in stats:
            lines.append(f"| {i + 1} | {model} | ERROR | - | - | - |")
        else:
            accuracy_pct = stats["accuracy"] * 100
            lines.append(
                f"| {i + 1} | {model} | {accuracy_pct:.1f}% | {stats['correct']} | {stats['total']} | {stats['avg_confidence']:.3f} |"
            )
    lines.append("\n")

    # Detailed results table
    lines.append("## 📋 Detailed Question Results\n")

    # Get all model names for the header
    model_names = list(results.keys())
    header = "| Q# | Question | Correct Answer |" + "".join(
        [f" {model} |" for model in model_names]
    )
    separator = "|" + "|".join(
        ["-" * (len(col.strip()) + 2) for col in header.split("|")[1:]]
    )
    lines.append(header)
    lines.append(separator)

    for q_idx, question in enumerate(questions):
        # Truncate long questions for table readability
        question_text = question["question"]
        if len(question_text) > 50:
            question_text = question_text[:47] + "..."

        row = f"| {q_idx + 1} | {question_text} | {question['correct_answer']} |"
        for model in model_names:
            if q_idx < len(results[model]) and "error" not in results[model][q_idx]:
                result = results[model][q_idx]
                predicted = result.get("predicted", "N/A")
                is_correct = result.get("correct", False)
                confidence = result.get("confidence", 0)
                # Add emoji for visual feedback
                status_emoji = "✅" if is_correct else "❌"
                row += f" {status_emoji} {predicted} ({confidence:.2f}) |"
            else:
                row += " ⚠️ ERROR |"
        lines.append(row)

    lines.append("\n")

    # Legend
    lines.append("### Legend")
    lines.append("- ✅ = Correct answer")
    lines.append("- ❌ = Incorrect answer")
    lines.append("- ⚠️ = Error occurred")
    lines.append("- Numbers in parentheses = Confidence score")

    logger.info("\n".join(lines))
    return "\n".join(lines)


def generate_csv_summary(questions, results, summary_stats):
    """Generate a CSV-format summary."""
    # TODO: add CSV file download if necessary
    if not summary_stats or not questions or not results:
        return "No data available"

    lines = []

    # Accuracy summary header
    lines.append("# ACCURACY SUMMARY")
    lines.append("Rank,Model,Accuracy_Percent,Correct,Total,Avg_Confidence")

    sorted_models = sorted(
        summary_stats.items(), key=lambda x: x[1].get("accuracy", 0), reverse=True
    )
    for i, (model, stats) in enumerate(sorted_models):
        if "error" in stats:
            lines.append(f"{i + 1},{model},ERROR,-,-,-")
        else:
            accuracy_pct = stats["accuracy"] * 100
            lines.append(
                f"{i + 1},{model},{accuracy_pct:.1f},{stats['correct']},{stats['total']},{stats['avg_confidence']:.3f}"
            )

    lines.append("")
    lines.append("# DETAILED RESULTS")

    # Header for detailed results
    model_names = list(results.keys())
    header = "Question_ID,Question,Correct_Answer," + ",".join(
        [
            f"{model}_Predicted,{model}_Correct,{model}_Confidence"
            for model in model_names
        ]
    )
    lines.append(header)

    # Detailed results
    for q_idx, question in enumerate(questions):
        row = f'{q_idx + 1},"{question["question"]}",{question["correct_answer"]}'
        for model in model_names:
            if q_idx < len(results[model]) and "error" not in results[model][q_idx]:
                result = results[model][q_idx]
                predicted = result.get("predicted", "N/A")
                is_correct = str(result.get("correct", False))
                confidence = result.get("confidence", 0)
                row += f",{predicted},{is_correct},{confidence:.3f}"
            else:
                row += ",ERROR,FALSE,0"
        lines.append(row)

    return "\n".join(lines)


# Sample datasets for quick testing
SAMPLE_DATASETS = {
    "Custom (enter below)": "",
    "LP": """Question,Correct Answer,Choice1,Choice2,Choice3
In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located?,Wales,Germany,France,Scotland
In which country is Llanfair pwllgwyngyll located?,Wales,Germany,France,Scotland
In which country is Llanfair PG located?,Wales,Germany,France,Scotland""",
    "Simple Math": """Question,Correct Answer,Choice1,Choice2,Choice3
What is 2+2?,4,3,2,5
What is 5*3?,15,12,16,18
What is 10-7?,3,7,4,2
What is 8/2?,4,3,2,5""",
    "World Capitals": """Question,Correct Answer,Choice1,Choice2,Choice3
What is the capital of France?,Paris,London,Berlin,Rome
What is the capital of Japan?,Tokyo,Seoul,Beijing,Bangkok
What is the capital of Brazil?,Brasília,Rio de Janeiro,São Paulo,Salvador
What is the capital of Australia?,Canberra,Sydney,Melbourne,Perth""",
    "Science Quiz": """Question,Correct Answer,Choice1,Choice2,Choice3
What is the chemical symbol for gold?,Au,Ag,Ca,K
Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
}


# Custom CSS
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.sample-text {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 12px;
}
"""


# Create Gradio interface
with gr.Blocks(
    title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🤖 Model Performance Comparison Tool

    Compare LLM performance on multiple-choice questions using Hugging Face models.

    **Format**: Each line should have: `Question,Correct Answer,Choice1,Choice2,Choice3`

    💡 **Features**:
    - Model evaluation using HuggingFace transformers
    - Support for custom models via HF model paths
    - Detailed question-by-question results
    - Performance charts and statistics
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Sample dataset selector
            sample_selector = gr.Dropdown(
                choices=list(SAMPLE_DATASETS.keys()),
                value="Custom (enter below)",
                label="Choose sample dataset or enter your own",
                interactive=True,
            )

            # Dataset input
            dataset_input = gr.Textbox(
                label="Dataset (CSV/TSV format)",
                placeholder="""Enter your dataset here...

Example format:
Question,Correct Answer,Choice1,Choice2,Choice3
What is 2+2?,4,3,2,5
What is the capital of France?,Paris,London,Berlin,Paris""",
                lines=8,
                max_lines=15,
            )

            gr.Markdown("""
            **Format Requirements**:
            - First line: header (always skipped during parsing), so include a header row even if it is only a placeholder
            - Each data line: Question, Correct Answer, Choice1, Choice2, Choice3
            - Use commas or tabs as separators
            """)

        with gr.Column(scale=1):
            # Model selection
            with gr.Tabs():
                with gr.TabItem("🤖 Predefined Models"):
                    predefined_selector = gr.CheckboxGroup(
                        choices=PREDEFINED_MODELS,
                        value=[PREDEFINED_MODELS[0]],
                        label="Select from popular models",
                        interactive=True,
                    )

                with gr.TabItem("➕ Custom Models"):
                    custom_models_input = gr.Textbox(
                        label="Custom HuggingFace Model Paths",
                        placeholder="""Enter HuggingFace model paths (one per line):
microsoft/DialoGPT-medium
bigscience/bloom-560m""",
                        lines=5,
                        info="Add any HuggingFace model path. One model per line.",
                    )
                    gr.Markdown("""
                    **Examples of valid model paths**:
                    - `microsoft/DialoGPT-medium`
                    - `bigscience/bloom-560m`
                    - `facebook/opt-350m`
                    - Your own fine-tuned models!
                    """)

            # Evaluate button
            evaluate_btn = gr.Button("⚡ Run Evaluation", variant="primary", scale=1)

            gr.Markdown("""
            **⚠️ Note**:
            - Larger models require more GPU memory; this Space currently runs on CPU only
            - The first run downloads model weights (this may take a while)
            - Models are cached for subsequent runs
            """)

    # Results section
    with gr.Column(visible=True) as results_section:
        gr.Markdown("## 📊 Results")

        summary_output = gr.Markdown(
            value="Results will appear here...", label="Performance Summary"
        )

        with gr.Row():
            accuracy_plot = gr.Plot(label="Accuracy Comparison")
            confidence_plot = gr.Plot(label="Confidence Analysis")

        # Export section
        gr.Markdown("## 📥 Export Results")
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📋 Markdown Table Format")
                markdown_summary_output = gr.Textbox(
                    label="Markdown Summary (Copy & Paste Ready)",
                    lines=15,
                    max_lines=25,
                    show_copy_button=True,
                    interactive=False,
                    value="",
                )
            with gr.Column():
                gr.Markdown("### 📊 CSV Format")
                csv_summary_output = gr.Textbox(
                    label="CSV Summary (Copy & Paste Ready)",
                    lines=15,
                    max_lines=25,
                    show_copy_button=True,
                    interactive=False,
                    value="",
                )

        detailed_results = gr.HTML(
            value="<p>Detailed results will appear here...</p>",
            label="Detailed Question-by-Question Results",
        )

    # Event handlers
    def update_dataset_from_sample(sample_name):
        if sample_name in SAMPLE_DATASETS:
            return gr.update(value=SAMPLE_DATASETS[sample_name])
        return gr.update()

    sample_selector.change(
        fn=update_dataset_from_sample, inputs=sample_selector, outputs=dataset_input
    )

    evaluate_btn.click(
        fn=run_evaluation,
        inputs=[dataset_input, predefined_selector, custom_models_input],
        outputs=[
            summary_output,
            detailed_results,
            accuracy_plot,
            confidence_plot,
            results_section,
            markdown_summary_output,
            csv_summary_output,
        ],
    )

    gr.Markdown("""
    ---
    ### About Model Evaluation

    This tool loads and runs HuggingFace models for evaluation:

    **🏗️ How it works**:
    - Downloads models from the HuggingFace Hub
    - Formats questions as prompts for each model
    - Runs likelihood-based evaluation

    **⚡ Performance Tips**:
    - Use smaller models for testing
    - Larger models (7B+) require significant GPU memory
    - Models are cached after the first load

    **🔧 Supported Models**:
    - Any HuggingFace autoregressive language model
    - Both instruction-tuned and base models
    - Custom fine-tuned models via HF paths
    """)


if __name__ == "__main__":
    demo.launch()
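# Local run sketch (assumes this file is saved as app.py; package names are the usual PyPI ones
# and may need pinning):
#   pip install gradio transformers torch psutil plotly pandas huggingface_hub
#   python app.py
# Gated checkpoints such as meta-llama/Llama-3.2-1B also require `huggingface-cli login` first.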