refactor: add category to parser
- app.py +18 -2
- llmdataparser/base_parser.py +20 -0
- llmdataparser/bbh_parser.py +1 -0
- llmdataparser/gsm8k_parser.py +1 -0
- llmdataparser/humaneval_parser.py +2 -0
- llmdataparser/ifeval_parser.py +1 -0
- llmdataparser/math_parser.py +1 -0
- llmdataparser/mbpp_parser.py +1 -0
- llmdataparser/mgsm_parser.py +1 -0
- llmdataparser/mmlu_parser.py +4 -0
- llmdataparser/tmlu_parser.py +1 -0
- llmdataparser/tw_legal_parser.py +1 -0
app.py CHANGED

```diff
@@ -252,8 +252,24 @@ def update_metric_details(metric_name: str, parser_name: str) -> str:
 
 def create_interface() -> gr.Blocks:
     """Create and return the Gradio interface."""
-    with gr.Blocks() as demo:
-
+    with gr.Blocks(css="footer {display: none !important}") as demo:
+        # Add header section with purpose and GitHub info
+        gr.Markdown("""
+        # LLM Evaluation Dataset Parser
+
+        ### 🎯 Purpose
+        A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
+        This tool helps researchers and developers to:
+        - Easily explore different benchmark datasets
+        - Access standardized parsing for multiple dataset formats
+        - View dataset descriptions and evaluation metrics
+
+        ### 🔗 Links
+        - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
+        - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)
+
+        ---
+        """)
 
         # State management
         parser_state = gr.State("")
```
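For reference, a minimal sketch of how this interface would typically be served (only `create_interface` comes from the diff above; the `__main__` guard and `launch()` call are assumptions about the app's entry point, which this commit does not show):

```python
# Hypothetical entry point; create_interface() is the function modified above,
# and launch() is the standard Gradio call for serving a Blocks app.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()  # assumption: default host/port; the real app.py may configure these
```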
llmdataparser/base_parser.py CHANGED

```diff
@@ -9,6 +9,18 @@ import datasets
 T = TypeVar("T", bound="ParseEntry")
 
 
+# Add this after the DatasetCategory definition
+VALID_CATEGORIES = {
+    "Math",
+    "General Knowledge and Reasoning",
+    "Programming",
+    "MultiLingual",
+    "Taiwan",
+    "Advanced Reasoning",
+    "Legal",
+}
+
+
 @dataclass(frozen=True, kw_only=True, slots=True)
 class ParseEntry:
     """A simple base class for entries, customizable by each dataset parser."""
@@ -28,6 +40,7 @@ class DatasetDescription:
     source: str
     language: str
     format: str
+    category: list[str]
     characteristics: str
     citation: str | None = None
     additional_info: dict[str, Any] | None = None
@@ -40,16 +53,23 @@ class DatasetDescription:
         source: str,
         language: str,
         format: str,
+        category: list[str],
         characteristics: str,
         citation: str | None = None,
         additional_info: dict[str, Any] | None = None,
     ) -> "DatasetDescription":
+        # Validate that all categories are valid DatasetCategory values
+        for item in category:
+            assert (
+                item in VALID_CATEGORIES
+            ), f"Category '{item}' is not a valid category. Valid categories are: {VALID_CATEGORIES}"
         return cls(
             name=name,
             purpose=purpose,
             source=source,
             language=language,
             format=format,
+            category=category,
             characteristics=characteristics,
             citation=citation,
             additional_info=additional_info,
```
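With this change, the factory classmethod fails fast on an unknown category. A minimal sketch of the behaviour, assuming the classmethod is named `create` (its name is truncated out of the hunk) and using invented field values for illustration; only the `category` validation itself comes from the diff:

```python
from llmdataparser.base_parser import DatasetDescription

# Valid: both tags are members of VALID_CATEGORIES.
desc = DatasetDescription.create(
    name="toy-dataset",  # illustrative values, not a real dataset
    purpose="example",
    source="n/a",
    language="English",
    format="plain text",
    category=["Math", "MultiLingual"],
    characteristics="toy example",
)

# Invalid: "Maths" is not in VALID_CATEGORIES, so the assert fires.
try:
    DatasetDescription.create(
        name="toy-dataset",
        purpose="example",
        source="n/a",
        language="English",
        format="plain text",
        category=["Maths"],
        characteristics="toy example",
    )
except AssertionError as exc:
    print(exc)  # Category 'Maths' is not a valid category. Valid categories are: ...
```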
llmdataparser/bbh_parser.py CHANGED

```diff
@@ -106,6 +106,7 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
             "significantly improved through chain-of-thought prompting. The dataset "
             "includes 23 core tasks plus additional related tasks."
         ),
+        category=["Advanced Reasoning"],
         citation=(
             "@article{suzgun2022challenging,\n"
             " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
```
llmdataparser/gsm8k_parser.py CHANGED

```diff
@@ -89,6 +89,7 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
         source="OpenAI",
         language="English",
         format="Word problems with step-by-step solutions and numerical answers",
+        category=["Math"],
         characteristics=(
             "Collection of 8.5K grade school math word problems that require "
             "multi-step reasoning. Problems gradually increase in difficulty "
```
llmdataparser/humaneval_parser.py CHANGED

```diff
@@ -88,6 +88,7 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
         source="OpenAI",
         language="Python",
         format="Function signatures with docstrings and unit tests",
+        category=["Programming"],
         characteristics=(
             "Collection of 164 hand-written Python programming problems. Each problem "
             "includes a function signature, docstring, example test cases, and hidden unit "
@@ -186,6 +187,7 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
         source="EvalPlus",
         language="Python",
         format="Function signatures with docstrings and comprehensive test suites",
+        category=["Programming"],
         characteristics=(
             "Significantly enhanced version of HumanEval with 80x more test cases. "
             "Includes extensive edge cases, boundary conditions, stress tests, and "
```
llmdataparser/ifeval_parser.py CHANGED

```diff
@@ -90,6 +90,7 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
         source="Google Research",
         language="English (BCP-47 en)",
         format="Verifiable instruction prompts with automated evaluation criteria",
+        category=["Programming"],
         characteristics=(
             "Collection of approximately 500 verifiable instructions designed to evaluate "
             "language models' instruction-following capabilities. Instructions include "
```
llmdataparser/math_parser.py CHANGED

```diff
@@ -97,6 +97,7 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
         source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
         language="English",
         format="Competition mathematics problems with step-by-step solutions",
+        category=["Math"],
         characteristics=(
             "Collection of 12,500 challenging competition mathematics problems designed to "
             "evaluate mathematical reasoning. Problems include step-by-step solutions that "
```
llmdataparser/mbpp_parser.py CHANGED

```diff
@@ -95,6 +95,7 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
         purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
         source="https://github.com/google-research/google-research/tree/master/mbpp",
         language="English and Python",
+        category=["Programming"],
         format="Task descriptions in English with corresponding Python solutions and automated test cases",
         characteristics=(
             "Contains approximately 1,000 crowd-sourced Python programming problems "
```
llmdataparser/mgsm_parser.py CHANGED

```diff
@@ -106,6 +106,7 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
         source="https://huggingface.co/datasets/juletxara/mgsm",
         language="Multilingual (11 languages)",
         format="Word problems with numerical answers and solution steps",
+        category=["Math", "MultiLingual"],
         characteristics=(
             "Human-translated version of 250 GSM8K problems into 10 additional languages. "
             "Each problem includes the original question from GSM8K, its translations, "
```
llmdataparser/mmlu_parser.py CHANGED

```diff
@@ -212,6 +212,7 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
         purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
         source="https://huggingface.co/datasets/cais/mmlu",
         language="English",
+        category=["General Knowledge and Reasoning"],
         format="Multiple choice questions with four options (A, B, C, D)",
         characteristics=(
             "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
@@ -332,6 +333,7 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
         source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
         language="English",
         format="Multiple choice questions with four options (A, B, C, D)",
+        category=["General Knowledge and Reasoning"],
         characteristics=(
             "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
             "manually re-annotated to identify and classify various types of errors. "
@@ -494,6 +496,7 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
         purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
         source="https://huggingface.co/datasets/ikala/tmmluplus",
         language="Traditional Chinese",
+        category=["General Knowledge and Reasoning", "Taiwan"],
         format="Multiple choice questions with four options (A, B, C, D)",
         characteristics=(
             "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
@@ -621,6 +624,7 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
         purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
         source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
         language="English",
+        category=["General Knowledge and Reasoning", "Advanced Reasoning"],
         format="Multiple choice questions with up to 10 options (expanded from original 4)",
         characteristics=(
             "A more challenging version of MMLU containing 12K complex questions across various "
```
llmdataparser/tmlu_parser.py CHANGED

```diff
@@ -130,6 +130,7 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
         language="Traditional Chinese",
         purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
         source="Various Taiwan standardized tests and professional certifications",
+        category=["Taiwan", "General Knowledge and Reasoning"],
         format="Multiple choice questions (A/B/C/D)",
         characteristics=(
             "Covers various subjects including Advanced Subjects Test (AST), "
```
llmdataparser/tw_legal_parser.py CHANGED

```diff
@@ -82,6 +82,7 @@ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
         language="Traditional Chinese",
         purpose="Evaluate models on Taiwan-specific legal knowledge and understanding",
         source="Taiwan Bar Examination questions",
+        category=["Taiwan", "General Knowledge and Reasoning", "Legal"],
         format="Multiple choice questions (A/B/C/D)",
         characteristics=(
             "Contains questions from Taiwan's bar examination, testing understanding "
```