#!/usr/bin/env python3
"""
Comprehensive Benchmark Data Fetcher
=====================================

Strategy:
1. Fetch per-question results from TOP models on each benchmark
2. Collect ~1000 questions total across GPQA, MMLU-Pro, MATH
3. Compute success rates from strongest models only
4. Post-process to stratify by difficulty:
   - LOW success (0-30%): Hard boundary - model limitations
   - MEDIUM success (30-70%): Capability boundary - interesting edge cases
   - HIGH success (70-100%): Within capability - baseline

This gives us the full spectrum to understand LLM capability boundaries.
"""
import json
import logging
import random
from collections import defaultdict
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional

import numpy as np

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

try:
    from datasets import load_dataset
    DATASETS_AVAILABLE = True
except ImportError:
    logger.error("datasets not installed. Run: uv pip install datasets")
    DATASETS_AVAILABLE = False


@dataclass
class QuestionResult:
    """Single question with performance across top models"""
    question_id: str
    source_benchmark: str
    domain: str
    question_text: str
    correct_answer: str
    choices: Optional[List[str]] = None

    # Performance from top models
    model_results: Optional[Dict[str, bool]] = None  # model_name -> correct/incorrect
    success_rate: Optional[float] = None             # Across top models
    num_models: int = 0

    # Difficulty classification
    difficulty_tier: Optional[str] = None   # "low", "medium", "high" success
    difficulty_label: Optional[str] = None  # "Nearly_Impossible", "Expert", "Hard", "Moderate", "Easy"

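
# Illustration (made-up values): a question answered correctly by 2 of the 5
# top models would end up with
#     model_results={"model_a": True, "model_b": True, "model_c": False, ...}
#     success_rate=0.4 -> difficulty_tier="medium", difficulty_label="Hard"
# under the thresholds in BenchmarkDataFetcher._classify_difficulty_* below.
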

class BenchmarkDataFetcher:
    """
    Fetch benchmark data from top models on HuggingFace leaderboards.
    Focuses on strongest models to get accurate capability boundary signal.
    """

    def __init__(self, output_dir: Path = Path("./data/benchmark_results")):
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Top models from OpenLLM Leaderboard v2 (as of Oct 2024)
        # Focusing on open-source models with available detailed results
        self.top_models = [
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "Qwen/Qwen2.5-72B-Instruct",
            "mistralai/Mixtral-8x22B-Instruct-v0.1",
            "mistralai/Mistral-7B-Instruct-v0.3",
        ]

        self.questions: Dict[str, QuestionResult] = {}

    def fetch_mmlu_pro_results(self, max_questions: int = 500) -> Dict[str, QuestionResult]:
        """
        Fetch MMLU-Pro results from top models.

        MMLU-Pro: 12K questions, 10 choices, harder than MMLU
        Target: 500 questions with performance from 5 top models
        """
        logger.info(f"Fetching MMLU-Pro results (target: {max_questions} questions)...")

        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available")
            return {}

        # First, load the base dataset to get questions
        try:
            logger.info("  Loading MMLU-Pro base dataset...")
            base_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")

            # Sample questions
            total_available = len(base_dataset)
            logger.info(f"  Total MMLU-Pro questions available: {total_available}")

            if total_available > max_questions:
                # Stratified sampling across domains
                sampled_indices = self._stratified_sample(base_dataset, max_questions)
            else:
                sampled_indices = range(total_available)

            # Initialize questions
            questions = {}
            for idx in sampled_indices:
                item = base_dataset[int(idx)]
                question_id = f"mmlu_pro_{idx}"
                questions[question_id] = QuestionResult(
                    question_id=question_id,
                    source_benchmark="MMLU_Pro",
                    domain=item.get('category', 'unknown'),
                    question_text=item['question'],
                    correct_answer=item['answer'],
                    choices=item.get('options', []),
                    model_results={},
                    num_models=0
                )

            logger.info(f"  Initialized {len(questions)} MMLU-Pro questions")

            # Now fetch model results
            for model_name in self.top_models:
                try:
                    logger.info(f"  Fetching results for {model_name}...")
                    dataset_name = f"open-llm-leaderboard/details_{model_name.replace('/', '__')}"

                    # Try different config names for MMLU-Pro
                    for config in ["harness_mmlu_pro_5", "mmlu_pro", "harness_mmlu_pro"]:
                        try:
                            results = load_dataset(dataset_name, config, split="latest")
                            logger.info(f"    ✓ Loaded {len(results)} results from {config}")

                            # Match results to our questions
                            for row in results:
                                doc_id = row.get('doc_id', row.get('example', None))
                                if doc_id is None:
                                    continue

                                question_id = f"mmlu_pro_{doc_id}"
                                if question_id in questions:
                                    predicted = str(row.get('pred', row.get('prediction', '')))
                                    correct = str(row.get('target', row.get('answer', '')))
                                    is_correct = (predicted.strip().lower() == correct.strip().lower())

                                    questions[question_id].model_results[model_name] = is_correct
                                    questions[question_id].num_models += 1

                            break  # Success, don't try other configs
                        except Exception:
                            continue
                except Exception as e:
                    logger.warning(f"  Skipping {model_name}: {e}")
                    continue

            # Compute success rates
            for qid, q in questions.items():
                if q.num_models > 0:
                    correct_count = sum(1 for v in q.model_results.values() if v)
                    q.success_rate = correct_count / q.num_models
                    q.difficulty_tier = self._classify_difficulty_tier(q.success_rate)
                    q.difficulty_label = self._classify_difficulty_label(q.success_rate)

            logger.info(f"  ✓ Collected results from {len(self.top_models)} models")
            return questions

        except Exception as e:
            logger.error(f"  Failed to fetch MMLU-Pro: {e}")
            return {}

    def fetch_gpqa_results(self, max_questions: int = 200) -> Dict[str, QuestionResult]:
        """
        Fetch GPQA Diamond results from top models.

        GPQA Diamond: 198 expert-written questions (hardest benchmark)
        Target: All questions with performance from 5 top models
        """
        logger.info(f"Fetching GPQA Diamond results (target: {max_questions} questions)...")

        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available")
            return {}

        try:
            # Load GPQA Diamond base dataset
            logger.info("  Loading GPQA Diamond base dataset...")
            base_dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")

            total_available = len(base_dataset)
            logger.info(f"  Total GPQA Diamond questions: {total_available}")

            # Initialize questions
            questions = {}
            for idx, item in enumerate(base_dataset):
                question_id = f"gpqa_diamond_{idx}"
                choices = [
                    item['Correct Answer'],
                    item['Incorrect Answer 1'],
                    item['Incorrect Answer 2'],
                    item['Incorrect Answer 3']
                ]
                questions[question_id] = QuestionResult(
                    question_id=question_id,
                    source_benchmark="GPQA_Diamond",
                    domain=item.get('Subdomain', 'unknown').lower(),
                    question_text=item['Question'],
                    correct_answer=item['Correct Answer'],
                    choices=choices,
                    model_results={},
                    num_models=0
                )

            logger.info(f"  Initialized {len(questions)} GPQA questions")

            # Fetch model results
            for model_name in self.top_models:
                try:
                    logger.info(f"  Fetching results for {model_name}...")
                    dataset_name = f"open-llm-leaderboard/details_{model_name.replace('/', '__')}"

                    # Try different config names
                    for config in ["harness_gpqa_0", "gpqa", "harness_gpqa_diamond"]:
                        try:
                            results = load_dataset(dataset_name, config, split="latest")
                            logger.info(f"    ✓ Loaded {len(results)} results from {config}")

                            # Match results
                            for row in results:
                                doc_id = row.get('doc_id', row.get('example', None))
                                if doc_id is None:
                                    continue

                                question_id = f"gpqa_diamond_{doc_id}"
                                if question_id in questions:
                                    predicted = str(row.get('pred', row.get('prediction', '')))
                                    correct = str(row.get('target', row.get('answer', '')))
                                    is_correct = (predicted.strip().lower() == correct.strip().lower())

                                    questions[question_id].model_results[model_name] = is_correct
                                    questions[question_id].num_models += 1

                            break
                        except Exception:
                            continue
                except Exception as e:
                    logger.warning(f"  Skipping {model_name}: {e}")
                    continue

            # Compute success rates
            for qid, q in questions.items():
                if q.num_models > 0:
                    correct_count = sum(1 for v in q.model_results.values() if v)
                    q.success_rate = correct_count / q.num_models
                    q.difficulty_tier = self._classify_difficulty_tier(q.success_rate)
                    q.difficulty_label = self._classify_difficulty_label(q.success_rate)

            logger.info("  ✓ Collected GPQA results")
            return questions

        except Exception as e:
            logger.error(f"  Failed to fetch GPQA: {e}")
            logger.info("  GPQA may be gated. Try: huggingface-cli login")
            return {}

    def fetch_math_results(self, max_questions: int = 300) -> Dict[str, QuestionResult]:
        """
        Fetch MATH (competition mathematics) results from top models.

        MATH: 12,500 competition-level math problems
        Target: 300 questions with performance from top models
        """
        logger.info(f"Fetching MATH dataset results (target: {max_questions} questions)...")

        if not DATASETS_AVAILABLE:
            logger.error("datasets library not available")
            return {}

        try:
            # Try different dataset names
            for dataset_name in ["hendrycks/competition_math", "competition_math", "lighteval/MATH"]:
                try:
                    logger.info(f"  Trying dataset: {dataset_name}...")
                    base_dataset = load_dataset(dataset_name, split="test")
                    logger.info(f"  ✓ Loaded {len(base_dataset)} MATH questions")

                    # Sample questions
                    if len(base_dataset) > max_questions:
                        random.seed(42)
                        sampled_indices = random.sample(range(len(base_dataset)), max_questions)
                    else:
                        sampled_indices = range(len(base_dataset))

                    # Initialize questions
                    questions = {}
                    for idx in sampled_indices:
                        item = base_dataset[int(idx)]
                        question_id = f"math_{idx}"
                        questions[question_id] = QuestionResult(
                            question_id=question_id,
                            source_benchmark="MATH",
                            domain=item.get('type', item.get('level', 'mathematics')),
                            question_text=item['problem'],
                            correct_answer=item['solution'],
                            choices=None,  # Free-form answer
                            model_results={},
                            num_models=0
                        )

                    logger.info(f"  Initialized {len(questions)} MATH questions")

                    # Note: per-question MATH results are harder to fetch; the
                    # OpenLLM Leaderboard may not publish detailed per-question
                    # results, so we fall back to estimated success rates.
                    logger.warning("  MATH per-question results not available from leaderboard")
                    logger.info("  Using estimated success rates based on benchmark scores")

                    # Top models score roughly 50% on MATH; use a flat, conservative estimate
                    for q in questions.values():
                        q.success_rate = 0.35  # Conservative estimate
                        q.num_models = 1       # Indicator that this is estimated
                        q.difficulty_tier = self._classify_difficulty_tier(q.success_rate)
                        q.difficulty_label = self._classify_difficulty_label(q.success_rate)

                    return questions

                except Exception as e:
                    logger.warning(f"  Failed with {dataset_name}: {e}")
                    continue

            logger.error("  Could not load MATH dataset from any source")
            return {}

        except Exception as e:
            logger.error(f"  Failed to fetch MATH: {e}")
            return {}

    def _stratified_sample(self, dataset, n_samples: int) -> List[int]:
        """Sample questions stratified by domain/category"""
        try:
            # Get categories
            categories = [item.get('category', 'unknown') for item in dataset]
            unique_categories = list(set(categories))

            # Samples per category
            samples_per_cat = n_samples // len(unique_categories)

            random.seed(42)
            sampled_indices = []
            for cat in unique_categories:
                cat_indices = [i for i, c in enumerate(categories) if c == cat]
                n_sample = min(samples_per_cat, len(cat_indices))
                sampled_indices.extend(random.sample(cat_indices, n_sample))

            # Fill remaining
            remaining = n_samples - len(sampled_indices)
            if remaining > 0:
                all_indices = set(range(len(dataset)))
                available = list(all_indices - set(sampled_indices))
                sampled_indices.extend(random.sample(available, min(remaining, len(available))))

            return sampled_indices[:n_samples]

        except Exception:
            # Fallback: random sampling
            random.seed(42)
            return random.sample(range(len(dataset)), min(n_samples, len(dataset)))

    def _classify_difficulty_tier(self, success_rate: float) -> str:
        """Classify into low/medium/high success tiers"""
        if success_rate < 0.30:
            return "low"      # Hard - model struggles
        elif success_rate < 0.70:
            return "medium"   # Capability boundary
        else:
            return "high"     # Within capability

    def _classify_difficulty_label(self, success_rate: float) -> str:
        """Detailed difficulty label"""
        if success_rate < 0.10:
            return "Nearly_Impossible"
        elif success_rate < 0.30:
            return "Expert"
        elif success_rate < 0.50:
            return "Hard"
        elif success_rate < 0.70:
            return "Moderate"
        else:
            return "Easy"
    def fetch_all_benchmarks(self) -> Dict[str, QuestionResult]:
        """
        Fetch all benchmark data.

        Target: ~1000 questions total
        - MMLU-Pro: 500
        - GPQA: 200
        - MATH: 300
        """
        logger.info("="*80)
        logger.info("Fetching Benchmark Data from Top Models")
        logger.info("="*80)
        logger.info(f"Top models: {', '.join(self.top_models)}")
        logger.info("")

        all_questions = {}

        # Fetch each benchmark
        mmlu_questions = self.fetch_mmlu_pro_results(max_questions=500)
        all_questions.update(mmlu_questions)

        gpqa_questions = self.fetch_gpqa_results(max_questions=200)
        all_questions.update(gpqa_questions)

        math_questions = self.fetch_math_results(max_questions=300)
        all_questions.update(math_questions)

        self.questions = all_questions

        logger.info("")
        logger.info("="*80)
        logger.info(f"Total questions collected: {len(all_questions)}")
        logger.info("="*80)

        return all_questions

    def save_raw_results(self, filename: str = "raw_benchmark_results.json"):
        """Save raw results for post-processing"""
        output_path = self.output_dir / filename

        # Convert to serializable format
        data = {
            "metadata": {
                "top_models": self.top_models,
                "total_questions": len(self.questions),
                "fetched_at": datetime.now(timezone.utc).isoformat()
            },
            "questions": {
                qid: {
                    **asdict(q),
                    "model_results": q.model_results if q.model_results else {}
                }
                for qid, q in self.questions.items()
            }
        }

        with open(output_path, 'w') as f:
            json.dump(data, f, indent=2)

        logger.info(f"Saved raw results to {output_path}")
        return output_path
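
    # Rough shape of the saved JSON (values illustrative):
    # {
    #   "metadata":  {"top_models": [...], "total_questions": 1000, "fetched_at": "..."},
    #   "questions": {"mmlu_pro_0": {"question_id": "mmlu_pro_0",
    #                                "source_benchmark": "MMLU_Pro",
    #                                "success_rate": 0.6,
    #                                "difficulty_tier": "medium", ...}, ...}
    # }
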
    def generate_statistics(self) -> Dict[str, Any]:
        """Generate statistics for collected data"""
        stats = {
            "total_questions": len(self.questions),
            "by_benchmark": defaultdict(int),
            "by_domain": defaultdict(int),
            "by_difficulty_tier": defaultdict(int),
            "by_difficulty_label": defaultdict(int),
            "success_rate_distribution": {
                "min": None,
                "max": None,
                "mean": None,
                "median": None
            }
        }

        success_rates = []
        for q in self.questions.values():
            stats["by_benchmark"][q.source_benchmark] += 1
            stats["by_domain"][q.domain] += 1
            if q.difficulty_tier:
                stats["by_difficulty_tier"][q.difficulty_tier] += 1
            if q.difficulty_label:
                stats["by_difficulty_label"][q.difficulty_label] += 1
            if q.success_rate is not None:
                success_rates.append(q.success_rate)

        if success_rates:
            stats["success_rate_distribution"]["min"] = float(np.min(success_rates))
            stats["success_rate_distribution"]["max"] = float(np.max(success_rates))
            stats["success_rate_distribution"]["mean"] = float(np.mean(success_rates))
            stats["success_rate_distribution"]["median"] = float(np.median(success_rates))

        # Convert defaultdicts to regular dicts
        stats["by_benchmark"] = dict(stats["by_benchmark"])
        stats["by_domain"] = dict(stats["by_domain"])
        stats["by_difficulty_tier"] = dict(stats["by_difficulty_tier"])
        stats["by_difficulty_label"] = dict(stats["by_difficulty_label"])

        return stats

    def print_summary(self):
        """Print summary of collected data"""
        stats = self.generate_statistics()

        print("\n" + "="*80)
        print("BENCHMARK DATA COLLECTION SUMMARY")
        print("="*80)

        print(f"\nTotal Questions: {stats['total_questions']}")

        print("\nBy Benchmark:")
        for benchmark, count in stats['by_benchmark'].items():
            print(f"  {benchmark}: {count}")

        print("\nBy Difficulty Tier:")
        for tier, count in stats['by_difficulty_tier'].items():
            print(f"  {tier.upper()}: {count} ({count/stats['total_questions']*100:.1f}%)")

        print("\nBy Difficulty Label:")
        for label, count in sorted(stats['by_difficulty_label'].items()):
            print(f"  {label}: {count}")

        print("\nSuccess Rate Distribution:")
        dist = stats['success_rate_distribution']
        if dist['mean'] is not None:
            print(f"  Min: {dist['min']:.1%}")
            print(f"  Max: {dist['max']:.1%}")
            print(f"  Mean: {dist['mean']:.1%}")
            print(f"  Median: {dist['median']:.1%}")

        print("\n" + "="*80)

def main():
    """Main execution"""
    fetcher = BenchmarkDataFetcher()

    # Fetch all data
    questions = fetcher.fetch_all_benchmarks()

    # Save raw results
    fetcher.save_raw_results()

    # Print summary
    fetcher.print_summary()

    # Save statistics
    stats = fetcher.generate_statistics()
    stats_path = fetcher.output_dir / "collection_statistics.json"
    with open(stats_path, 'w') as f:
        json.dump(stats, f, indent=2)
    logger.info(f"Saved statistics to {stats_path}")

    print("\nNext steps:")
    print("1. Review raw_benchmark_results.json")
    print("2. Run post-processing to stratify by difficulty")
    print("3. Build vector database with stratified sample")


if __name__ == "__main__":
    main()