refactor: ifeval parser
llmdataparser/ifeval_parser.py  +73 -1
tests/test_ifeval_parser.py     +51 -0
llmdataparser/ifeval_parser.py
CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar, List
 
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT  # You'll need to create this
 
 
@@ -77,6 +82,73 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             task_name=task,
         )
 
+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the IFEval dataset."""
+        return DatasetDescription.create(
+            name="IFEval",
+            purpose="Evaluate instruction following capabilities through verifiable instructions",
+            source="Google Research",
+            language="English (BCP-47 en)",
+            format="Verifiable instruction prompts with automated evaluation criteria",
+            characteristics=(
+                "Collection of approximately 500 verifiable instructions designed to evaluate "
+                "language models' instruction-following capabilities. Instructions include "
+                "specific, measurable criteria like 'write in more than 400 words' or "
+                "'mention the keyword AI at least 3 times' that can be verified through "
+                "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
+                "for evaluating chat or instruction fine-tuned language models."
+            ),
+            citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
+                title={Instruction-Following Evaluation for Large Language Models},
+                author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
+                year={2023},
+                eprint={2311.07911},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL},
+                url={https://arxiv.org/abs/2311.07911}
+            }""",
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for IFEval."""
+        return [
+            EvaluationMetric.create(
+                name="format_compliance",
+                type="text",
+                description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
+                implementation="custom_format_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="length_constraints",
+                type="text",
+                description="Checks if the response meets word, sentence, or paragraph count requirements",
+                implementation="custom_length_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="punctuation_rules",
+                type="text",
+                description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
+                implementation="custom_punctuation_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="keyword_usage",
+                type="text",
+                description="Verifies correct usage of required keywords or avoidance of forbidden words",
+                implementation="custom_keyword_validator",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="structural_requirements",
+                type="text",
+                description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
+                implementation="custom_structure_validator",
+                primary=False,
+            ),
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
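
Example usage of the two new accessors (a minimal sketch, not part of the diff; the no-argument construction of IFEvalDatasetParser is an assumption, since the constructor signature is not shown in this change):

    # Hypothetical usage sketch; assumes IFEvalDatasetParser() takes no arguments.
    from llmdataparser.ifeval_parser import IFEvalDatasetParser

    parser = IFEvalDatasetParser()  # assumed no-arg construction

    description = parser.get_dataset_description()
    print(description.name)      # "IFEval"
    print(description.language)  # "English (BCP-47 en)"

    # Primary metrics are the headline checks; the rest are diagnostic.
    for metric in parser.get_evaluation_metrics():
        kind = "primary" if metric.primary else "secondary"
        print(f"{metric.name}: {kind} ({metric.implementation})")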
tests/test_ifeval_parser.py
CHANGED
@@ -89,3 +89,54 @@ def test_parser_string_representation(ifeval_parser):
     assert "IFEvalDatasetParser" in repr_str
     assert "google/IFEval" in repr_str
     assert "not loaded" in repr_str
+
+
+def test_get_dataset_description(ifeval_parser):
+    """Test dataset description generation for IFEval."""
+    description = ifeval_parser.get_dataset_description()
+
+    assert description.name == "IFEval"
+    assert "verifiable instructions" in description.purpose.lower()
+    assert description.source == "Google Research"
+    assert description.language == "English (BCP-47 en)"
+    assert "verifiable instruction prompts" in description.format.lower()
+    assert "500" in description.characteristics
+    assert "automated heuristics" in description.characteristics.lower()
+    assert "open llm leaderboard" in description.characteristics.lower()
+    assert "zhou2023instructionfollowingevaluation" in description.citation
+
+
+def test_get_evaluation_metrics(ifeval_parser):
+    """Test evaluation metrics generation for IFEval."""
+    metrics = ifeval_parser.get_evaluation_metrics()
+
+    # Should have 5 metrics total
+    assert len(metrics) == 5
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 3
+
+    # Verify specific metrics exist and have correct properties
+    metric_names = {m.name for m in metrics}
+    assert "format_compliance" in metric_names
+    assert "length_constraints" in metric_names
+    assert "punctuation_rules" in metric_names
+    assert "keyword_usage" in metric_names
+    assert "structural_requirements" in metric_names
+
+    # Check specific metric properties
+    format_metric = next(m for m in metrics if m.name == "format_compliance")
+    assert format_metric.primary is True
+    assert "formatting rules" in format_metric.description.lower()
+    assert format_metric.type == "text"
+
+    length_metric = next(m for m in metrics if m.name == "length_constraints")
+    assert length_metric.primary is True
+    assert "word" in length_metric.description.lower()
+    assert length_metric.type == "text"
+
+    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
+    assert punctuation_metric.primary is True
+    assert "punctuation" in punctuation_metric.description.lower()
+    assert punctuation_metric.type == "text"
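
The metrics above only name their implementations ("custom_length_validator", "custom_keyword_validator", etc.); the checkers themselves are not part of this change. For illustration, a minimal sketch of the kind of automated heuristics IFEval instructions are verified with (function names and signatures are assumptions, not the project's API):

    import re

    def check_min_words(response: str, min_words: int) -> bool:
        """Heuristic for 'write in more than N words' style constraints."""
        return len(response.split()) > min_words

    def check_keyword_frequency(response: str, keyword: str, min_count: int) -> bool:
        """Heuristic for 'mention the keyword X at least N times' constraints."""
        matches = re.findall(rf"\b{re.escape(keyword)}\b", response, flags=re.IGNORECASE)
        return len(matches) >= min_count

    assert check_min_words("word " * 401, min_words=400)
    assert check_keyword_frequency("AI here, ai there, and AI again", "AI", min_count=3)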