"""Test script for GAIA comparison functionality."""

import os
import sys

# Make the local comparison and phoenix_evaluator modules importable when this
# script is run directly from its own directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import pandas as pd

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix


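# Exercises both the batch and single-answer evaluation paths. The first two
# task IDs appear to come from the GAIA validation split (the first is the
# Mercedes Sosa question reused below); the final "nonexistent-task" entry
# checks how the comparator handles an unknown task ID.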
def test_basic_comparison():
    """Test basic comparison functionality."""
    print("Testing basic comparison...")

    comparator = AnswerComparator()

    sample_results = [
        {"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "submitted_answer": "3"},
        {"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "submitted_answer": "3"},
        {"task_id": "nonexistent-task", "submitted_answer": "test"},
    ]

    evaluations_df = comparator.evaluate_batch(sample_results)
    print(f"Evaluated {len(evaluations_df)} answers")

    summary_stats = comparator.get_summary_stats(evaluations_df)
    print("Summary statistics:")
    for key, value in summary_stats.items():
        print(f" {key}: {value}")

    print("\nTesting single evaluation...")
    single_eval = comparator.evaluate_answer("8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "3")
    print(f"Single evaluation result: {single_eval}")

    return evaluations_df


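# enhance_results_log is expected to return the submitted rows with comparison
# columns attached (Ground Truth, Exact Match, Similarity), which the loop
# below prints for inspection.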
def test_results_enhancement():
    """Test results log enhancement."""
    print("\nTesting results log enhancement...")

    comparator = AnswerComparator()

    sample_results_log = [
        {
            "Task ID": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "Question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
            "Submitted Answer": "3",
        },
        {
            "Task ID": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "Question": "Test question",
            "Submitted Answer": "wrong answer",
        },
    ]

    enhanced_results = comparator.enhance_results_log(sample_results_log)

    print("Enhanced results:")
    for result in enhanced_results:
        print(f" Task: {result['Task ID']}")
        print(f" Answer: {result['Submitted Answer']}")
        print(f" Ground Truth: {result['Ground Truth']}")
        print(f" Exact Match: {result['Exact Match']}")
        print(f" Similarity: {result['Similarity']}")
        print()


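# Requires a reachable Phoenix instance; when none is running,
# log_evaluations_to_phoenix is expected to return None (or raise), and this
# test only prints a warning rather than failing the suite.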
def test_phoenix_integration():
    """Test Phoenix integration (basic)."""
    print("\nTesting Phoenix integration...")

    sample_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "wrong",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])

    try:
        result = log_evaluations_to_phoenix(sample_evaluations)
        if result is not None:
            print("✅ Phoenix integration successful")
        else:
            print("⚠️ Phoenix integration failed (likely Phoenix not running)")
    except Exception as e:
        print(f"⚠️ Phoenix integration error: {e}")


def main():
    """Run all tests."""
    print("="*50)
    print("GAIA Comparison Test Suite")
    print("="*50)

    try:
        evaluations_df = test_basic_comparison()
        test_results_enhancement()
        test_phoenix_integration()

        print("\n" + "="*50)
        print("All tests completed!")
        print("="*50)
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()


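# Usage note: run this file directly (e.g. `python <this_file>.py`, with the
# filename being whatever this module is saved as); comparison.py and
# phoenix_evaluator.py are assumed to sit alongside it, which the sys.path
# append at the top makes importable.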
if __name__ == "__main__":
    main()