Togmal-demo / test_leaderboard_access.py
HeTalksInMaths
Initial commit: ToGMAL Prompt Difficulty Analyzer with real MMLU data
f9b1ad5
raw
history blame
2.04 kB
#!/usr/bin/env python3
"""
Test if we can actually access OpenLLM Leaderboard per-question results
"""
from datasets import load_dataset
# Smoke-test script: try to download the per-question ("details") dataset that
# the Open LLM Leaderboard publishes for one model, print a preview of the
# first few rows, and fall back to probing other config names if the primary
# config cannot be loaded.
print("Testing access to OpenLLM Leaderboard detailed results...")
print("="*80)

# Test model
model = "meta-llama__Meta-Llama-3-70B-Instruct"

print(f"\nTrying to load: open-llm-leaderboard/details_{model}")
print("Config: harness_mmlu_5")

try:
    results = load_dataset(
        f"open-llm-leaderboard/details_{model}",
        "harness_mmlu_5"
    )

    print("\n✓ SUCCESS! Loaded dataset")
    print(f"Available splits: {list(results.keys())}")

    # Check if 'latest' split exists
    if 'latest' in results:
        latest = results['latest']
        print(f"Latest split has {len(latest)} rows")

        # Show first few rows.
        # NOTE: slicing a Dataset (latest[:3]) returns a dict of columns, so
        # iterating over the slice would yield column names rather than rows.
        # Integer indexing returns a single row as a dict, which is what the
        # per-row inspection below needs.
        print("\nFirst 3 rows:")
        for i in range(min(3, len(latest))):
            row = latest[i]
            print(f"\nRow {i}:")
            print(f" Keys: {list(row.keys())}")
            if 'doc_id' in row:
                print(f" doc_id: {row['doc_id']}")
            if 'pred' in row:
                print(f" pred: {row['pred']}")
            if 'target' in row:
                print(f" target: {row['target']}")

            # Check if correct
            if 'pred' in row and 'target' in row:
                is_correct = (row['pred'] == row['target'])
                print(f" Correct: {'✓' if is_correct else '✗'}")

    print("\n" + "="*80)
    print("✓ Per-question data IS available!")
    print("="*80)

except Exception as e:
    # Broad catch is deliberate: this is a diagnostic script and any failure
    # (missing repo, missing config, network error, ...) should be reported
    # and followed by the fallback probes rather than a traceback.
    print(f"\n✗ FAILED: {e}")
    print("\nTrying alternative configs...")

    # Try other possible configs
    for config in ["harness_mmlu_pro_5", "harness_gpqa_0", "results"]:
        try:
            print(f"\nTrying config: {config}")
            results = load_dataset(f"open-llm-leaderboard/details_{model}", config)
            print(f" ✓ {config} works! Splits: {list(results.keys())}")
        except Exception as e2:
            print(f" ✗ {config} failed: {e2}")