File size: 3,439 Bytes
f9b1ad5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python3
"""
Quick test with real data - sample 1000 questions for faster testing
"""

import json
from pathlib import Path
from benchmark_vector_db import BenchmarkVectorDB, BenchmarkQuestion
import random

def load_sample_mmlu_data(n_samples=1000, seed=None):
    """Load a random sample of real MMLU questions from the results JSON.

    Args:
        n_samples: Maximum number of questions to sample. If the file
            contains fewer questions, all of them are returned.
        seed: Optional seed for reproducible sampling. When None, the
            sample differs between runs (previous behavior).

    Returns:
        list[BenchmarkQuestion]: Sampled questions with difficulty_score
        derived as ``1.0 - success_rate``. ``correct_answer`` is left
        empty because the vector DB does not need it.
    """
    print(f"Loading sample of {n_samples} real MMLU questions...")

    # Explicit encoding so JSON decoding does not depend on the locale.
    with open("./data/benchmark_results/mmlu_real_results.json", encoding="utf-8") as f:
        data = json.load(f)

    # Use a private RNG so a fixed seed gives reproducible samples
    # without disturbing the global random state.
    rng = random.Random(seed)
    all_qids = list(data['questions'].keys())
    sampled_qids = rng.sample(all_qids, min(n_samples, len(all_qids)))

    questions = []
    for qid in sampled_qids:
        q = data['questions'][qid]
        questions.append(BenchmarkQuestion(
            question_id=q['question_id'],
            source_benchmark=q['source_benchmark'],
            domain=q['domain'],
            question_text=q['question_text'],
            correct_answer="",  # Not needed for vector DB
            choices=q.get('choices'),
            success_rate=q['success_rate'],
            difficulty_score=1.0 - q['success_rate'],  # harder == higher score
            difficulty_label=q['difficulty_label'],
            num_models_tested=q['num_models_tested']
        ))

    print(f"βœ“ Loaded {len(questions)} sampled questions")
    return questions


def quick_test():
    """Index a 1000-question MMLU sample and spot-check difficulty queries.

    End-to-end smoke test: builds a fresh BenchmarkVectorDB, indexes a
    sample of real questions, prints database statistics, then queries a
    handful of prompts spanning hard/moderate/easy difficulty and prints
    the risk assessment for each.
    """

    # Initialize fresh database
    db = BenchmarkVectorDB(
        db_path=Path("./data/benchmark_vector_db"),
        embedding_model="all-MiniLM-L6-v2"
    )

    # Load sample data
    questions = load_sample_mmlu_data(1000)

    # Index questions (much faster with 1000 vs 14000)
    print("\nIndexing into vector database...")
    db.index_questions(questions)

    # Get stats
    stats = db.get_statistics()
    print(f"\nπŸ“Š Database Statistics:")
    print(f"  Total Questions: {stats['total_questions']}")
    print(f"  Difficulty Distribution: {stats.get('difficulty_levels', {})}")

    # Test with diverse prompts
    test_prompts = [
        # Should be HARD (physics/abstract math)
        "Calculate the quantum correction to the partition function for a 3D harmonic oscillator",
        "Prove that there are infinitely many prime numbers",

        # Should be MODERATE (reasoning)
        "Diagnose a patient with acute chest pain and shortness of breath",
        "Explain the legal doctrine of precedent in common law systems",

        # Should be EASY (basic knowledge)
        "What is 2 + 2?",
        "What is the capital of France?",
    ]

    print(f"\nπŸ§ͺ Testing {len(test_prompts)} diverse prompts:")
    print("="*80)

    for prompt in test_prompts:
        result = db.query_similar_questions(prompt, k=5)

        print(f"\nπŸ“ '{prompt}'")
        print(f"   🎯 Risk: {result['risk_level']}")
        print(f"   πŸ“Š Success Rate: {result['weighted_success_rate']:.1%}")

        # Guard: an empty index (or too-strict filter) would otherwise
        # raise IndexError on [0]; hoist the repeated lookup too.
        similar = result['similar_questions']
        if similar:
            top = similar[0]
            print(f"   πŸ“Œ Top Match: {top['question_text'][:80]}...")
            if top['success_rate'] < 0.5:
                print(f"   πŸ” Found similar hard question (success: {top['success_rate']:.0%})")
        else:
            print("   πŸ“Œ Top Match: (no similar questions found)")
        print(f"   πŸ’‘ {result['recommendation']}")

    print("\n" + "="*80)
    print("βœ… Quick real data test complete!")
    print("="*80)


# Run the quick smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    quick_test()