refactor: test cases

Changed files:
- tests/test_bbh_parser.py +0 -5
- tests/test_gsm8k_parser.py +0 -18
- tests/test_humaneval_parser.py +0 -2
- tests/test_ifeval_parser.py +0 -22
- tests/test_math_parser.py +0 -9
- tests/test_mbpp_parser.py +0 -33
- tests/test_mgsm_parser.py +1 -27
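All seven tests exercise two objects returned by the benchmark parsers: a dataset description and a list of evaluation metrics. As a reading aid, here is a minimal sketch of their shape, inferred purely from the assertions in the diffs below. The field names come from the tests; the dataclass layout and defaults are assumptions, not the repository's actual definitions.

    from dataclasses import dataclass
    from typing import Any, Optional

    @dataclass
    class DatasetDescription:
        # Fields inferred from the assertions below; the layout is assumed.
        name: str
        purpose: str
        source: str
        language: str
        format: str
        characteristics: str
        citation: str
        additional_info: Optional[dict[str, Any]] = None

    @dataclass
    class EvaluationMetric:
        name: str
        type: str  # e.g. "string", "numerical", "code_evaluation"
        primary: bool
        description: str
        implementation: str = ""  # e.g. "custom_pass_at_k"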
tests/test_bbh_parser.py (CHANGED)

@@ -165,14 +165,9 @@ def test_get_dataset_description(bbh_parser):
     description = bbh_parser.get_dataset_description()
 
     assert description.name == "Big Bench Hard (BBH)"
-    assert "challenging BIG-Bench tasks" in description.purpose
     assert description.language == "English"
     assert description.format == "Multiple choice questions with single correct answers"
-    assert "Tasks require complex multi-step reasoning" in description.characteristics
     assert "suzgun2022challenging" in description.citation
-    assert description.additional_info is not None
-    assert "model_performance" in description.additional_info
-    assert "size" in description.additional_info
 
 
 def test_get_evaluation_metrics(bbh_parser):
tests/test_gsm8k_parser.py (CHANGED)

@@ -190,10 +190,7 @@ def test_get_dataset_description(gsm8k_parser):
     assert description.name == "Grade School Math 8K (GSM8K)"
     assert description.source == "OpenAI"
     assert description.language == "English"
-    assert "8.5K grade school math word problems" in description.characteristics
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
     assert "Cobbe" in description.citation
-    assert "arXiv" in description.citation
 
 
 def test_get_evaluation_metrics(gsm8k_parser):
@@ -210,18 +207,3 @@ def test_get_evaluation_metrics(gsm8k_parser):
     assert exact_match.type == "string"
     assert exact_match.primary is True
     assert "exact match" in exact_match.description.lower()
-
-    # Check solution_validity metric details
-    solution_validity = next(m for m in metrics if m.name == "solution_validity")
-    assert solution_validity.type == "text"
-    assert solution_validity.primary is True
-    assert "valid" in solution_validity.description.lower()
-
-    # Check step metrics
-    step_accuracy = next(m for m in metrics if m.name == "step_accuracy")
-    assert step_accuracy.type == "numerical"
-    assert step_accuracy.primary is True
-
-    step_count = next(m for m in metrics if m.name == "step_count")
-    assert step_count.type == "numerical"
-    assert step_count.primary is False
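A pattern worth noting throughout these tests, in both the kept and the deleted lines, is next(m for m in metrics if m.name == ...). Without a default argument, next() raises StopIteration when no metric matches, which pytest reports less readably than an assertion failure. A small illustrative sketch of an equivalent, more explicit lookup follows (an aside, not a change this commit makes):

    # Sketch only: the same lookup as in the tests, with an explicit failure.
    # 'metrics' is the list returned by the parser's get_evaluation_metrics().
    exact_match = next((m for m in metrics if m.name == "exact_match"), None)
    assert exact_match is not None, "expected an 'exact_match' metric"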
tests/test_humaneval_parser.py (CHANGED)

@@ -180,8 +180,6 @@ def test_get_dataset_description(parser, plus_parser):
     assert description.name == "HumanEval"
     assert "code generation" in description.purpose
     assert description.language == "Python"
-    assert "Function signatures with docstrings" in description.format
-    assert "164 hand-written Python programming problems" in description.characteristics
     assert "chen2021codex" in description.citation
 
     # Test HumanEval Plus description
tests/test_ifeval_parser.py (CHANGED)

@@ -96,14 +96,8 @@ def test_get_dataset_description(ifeval_parser):
     description = ifeval_parser.get_dataset_description()
 
     assert description.name == "IFEval"
-    assert "verifiable instructions" in description.purpose.lower()
     assert description.source == "Google Research"
     assert description.language == "English (BCP-47 en)"
-    assert "verifiable instruction prompts" in description.format.lower()
-    assert "500" in description.characteristics
-    assert "automated heuristics" in description.characteristics.lower()
-    assert "open llm leaderboard" in description.characteristics.lower()
-    assert "zhou2023instructionfollowingevaluation" in description.citation
 
 
 def test_get_evaluation_metrics(ifeval_parser):
@@ -124,19 +118,3 @@ def test_get_evaluation_metrics(ifeval_parser):
     assert "punctuation_rules" in metric_names
     assert "keyword_usage" in metric_names
     assert "structural_requirements" in metric_names
-
-    # Check specific metric properties
-    format_metric = next(m for m in metrics if m.name == "format_compliance")
-    assert format_metric.primary is True
-    assert "formatting rules" in format_metric.description.lower()
-    assert format_metric.type == "text"
-
-    length_metric = next(m for m in metrics if m.name == "length_constraints")
-    assert length_metric.primary is True
-    assert "word" in length_metric.description.lower()
-    assert length_metric.type == "text"
-
-    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
-    assert punctuation_metric.primary is True
-    assert "punctuation" in punctuation_metric.description.lower()
-    assert punctuation_metric.type == "text"
tests/test_math_parser.py (CHANGED)

@@ -205,12 +205,9 @@ def test_get_dataset_description(math_parser):
     description = math_parser.get_dataset_description()
 
     assert description.name == "MATH"
-    assert "mathematical problem-solving" in description.purpose.lower()
     assert "Hendrycks" in description.source
     assert description.language == "English"
-    assert "competition mathematics problems" in description.format.lower()
     assert "12,500" in description.characteristics
-    assert "step-by-step solutions" in description.characteristics.lower()
     assert "hendrycksmath2021" in description.citation
     assert "NeurIPS" in description.citation
 
@@ -220,8 +217,6 @@ def test_get_dataset_description(math_parser):
     assert "algebra" in description.additional_info["topics"]
     assert "geometry" in description.additional_info["topics"]
     assert description.additional_info["size"] == "12,500 problems"
-    assert "sympy" in description.additional_info["evaluation_note"].lower()
-    assert "github.com/hendrycks/math" in description.additional_info["homepage"]
 
 
 def test_get_evaluation_metrics(math_parser):
@@ -259,7 +254,3 @@ def test_get_evaluation_metrics(math_parser):
     assert reasoning_metric.type == "text"
     assert reasoning_metric.primary is True
     assert "mathematical reasoning" in reasoning_metric.description.lower()
-
-    # Check non-primary metrics
-    non_primary_metrics = {m.name for m in metrics if not m.primary}
-    assert non_primary_metrics == {"mathematical_notation", "solution_clarity"}
tests/test_mbpp_parser.py (CHANGED)

@@ -162,31 +162,10 @@ def test_get_dataset_description(parser):
     assert "code generation" in description.purpose.lower()
     assert "google-research" in description.source
     assert description.language == "English and Python"
-    assert "task descriptions" in description.format.lower()
-    assert "python solutions" in description.format.lower()
     assert "1,000" in description.characteristics
-    assert "entry-level programmers" in description.characteristics.lower()
-    assert "3 automated test cases" in description.characteristics
-    assert "hand-verified" in description.characteristics
     assert "austin2021program" in description.citation
     assert "Program Synthesis" in description.citation
 
-    # Check additional info
-    assert description.additional_info is not None
-    assert description.additional_info["size"] == "~1,000 programming problems"
-    assert (
-        description.additional_info["splits"]
-        == "Available in full or sanitized versions"
-    )
-    assert (
-        description.additional_info["test_coverage"]
-        == "Each problem includes 3 automated test cases"
-    )
-    assert (
-        description.additional_info["verification"]
-        == "Subset of data has been hand-verified by authors"
-    )
-
 
 def test_get_evaluation_metrics(parser):
     """Test evaluation metrics generation."""
@@ -211,15 +190,3 @@ def test_get_evaluation_metrics(parser):
     assert pass_k_metric.primary is True
     assert "k generations" in pass_k_metric.description.lower()
     assert "custom_pass_at_k" in pass_k_metric.implementation
-
-    test_case_metric = next(m for m in metrics if m.name == "test_case_success_rate")
-    assert test_case_metric.type == "code_evaluation"
-    assert test_case_metric.primary is False
-    assert "test cases" in test_case_metric.description.lower()
-    assert "custom_test_success_rate" in test_case_metric.implementation
-
-    syntax_metric = next(m for m in metrics if m.name == "syntax_validity")
-    assert syntax_metric.type == "code_evaluation"
-    assert syntax_metric.primary is False
-    assert "syntactically valid" in syntax_metric.description.lower()
-    assert "custom_syntax_check" in syntax_metric.implementation
tests/test_mgsm_parser.py (CHANGED)

@@ -192,35 +192,18 @@ def test_get_dataset_description(mgsm_parser):
     assert "multilingual chain-of-thought reasoning" in description.purpose.lower()
     assert "juletxara/mgsm" in description.source
     assert description.language == "Multilingual (11 languages)"
-
-    assert "numerical answers" in description.format.lower()
-    assert "solution steps" in description.format.lower()
-
-    # Check characteristics
-    assert "250" in description.characteristics
-    assert "gsm8k" in description.characteristics.lower()
-    assert "translations" in description.characteristics.lower()
+
     assert "mathematical reasoning" in description.characteristics.lower()
 
     # Check citations
     assert "shi2022language" in description.citation
     assert "cobbe2021gsm8k" in description.citation
-    assert (
-        "Language Models are Multilingual Chain-of-Thought Reasoners"
-        in description.citation
-    )
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
 
     # Check additional info
     assert description.additional_info is not None
     assert len(description.additional_info["languages"]) == 11
     assert "English" in description.additional_info["languages"]
     assert "Chinese" in description.additional_info["languages"]
-    assert (
-        description.additional_info["size"]
-        == "250 problems translated into each language"
-    )
-    assert description.additional_info["base_dataset"] == "GSM8K (Grade School Math 8K)"
 
 
 def test_get_evaluation_metrics(mgsm_parser):
@@ -259,12 +242,3 @@ def test_get_evaluation_metrics(mgsm_parser):
     assert step_metric.primary is True
     assert "calculation steps" in step_metric.description.lower()
     assert "custom_step_accuracy" in step_metric.implementation
-
-    # Check cross-lingual metric specifically
-    cross_lingual_metric = next(
-        m for m in metrics if m.name == "cross_lingual_consistency"
-    )
-    assert cross_lingual_metric.type == "comparison"
-    assert cross_lingual_metric.primary is False
-    assert "different language versions" in cross_lingual_metric.description.lower()
-    assert "custom_language_comparator" in cross_lingual_metric.implementation