refactor: mbpp parser
- llmdataparser/mbpp_parser.py +70 -1
- tests/test_mbpp_parser.py +71 -0

llmdataparser/mbpp_parser.py  CHANGED

@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar
 
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import MBPP_SYSTEM_PROMPT
 
 
@@ -83,6 +88,70 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
             source_file=source_file,
         )
 
+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a description of the MBPP dataset."""
+        return DatasetDescription.create(
+            name="Mostly Basic Python Problems (MBPP)",
+            purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
+            source="https://github.com/google-research/google-research/tree/master/mbpp",
+            language="English and Python",
+            format="Task descriptions in English with corresponding Python solutions and automated test cases",
+            characteristics=(
+                "Contains approximately 1,000 crowd-sourced Python programming problems "
+                "designed for entry-level programmers. Problems cover programming fundamentals "
+                "and standard library functionality. Each problem includes a task description, "
+                "code solution, and 3 automated test cases. A subset of the data has been "
+                "hand-verified by the authors."
+            ),
+            citation=(
+                "@article{austin2021program,\n"
+                "  title={Program Synthesis with Large Language Models},\n"
+                "  author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},\n"
+                "  journal={arXiv preprint arXiv:2108.07732},\n"
+                "  year={2021}\n"
+                "}"
+            ),
+            additional_info={
+                "size": "~1,000 programming problems",
+                "splits": "Available in full or sanitized versions",
+                "test_coverage": "Each problem includes 3 automated test cases",
+                "verification": "Subset of data has been hand-verified by authors",
+            },
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for MBPP dataset."""
+        return [
+            EvaluationMetric.create(
+                name="pass@k",
+                type="code_evaluation",
+                description="Percentage of problems where at least one solution in k generations passes all test cases",
+                implementation="custom_pass_at_k",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="test_case_success_rate",
+                type="code_evaluation",
+                description="Percentage of test cases passed across all problems",
+                implementation="custom_test_success_rate",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="syntax_validity",
+                type="code_evaluation",
+                description="Verifies that generated code is syntactically valid Python",
+                implementation="custom_syntax_check",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="code_similarity",
+                type="similarity",
+                description="Similarity between generated code and reference solution",
+                implementation="evaluate.load('code_eval')",
+                primary=False,
+            ),
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
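
The metric entries above reference their implementations only by name; the helpers themselves are not added in this commit. A minimal sketch of what custom_pass_at_k and custom_syntax_check could compute, assuming the standard unbiased pass@k estimator and an ast-based syntax check (the names follow the implementation strings above, but the signatures are assumptions):

import ast
from math import comb


# Sketch only: assumed helpers matching the implementation strings above,
# not part of this commit.
def custom_pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased pass@k estimate: probability that at least one of k samples,
    # drawn from n generations of which c pass all test cases, is correct.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


def custom_syntax_check(code: str) -> bool:
    # True if the generated code parses as syntactically valid Python.
    try:
        ast.parse(code)
        return True
    except SyntaxError:
        return False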

tests/test_mbpp_parser.py  CHANGED

@@ -152,3 +152,74 @@ def test_custom_system_prompt():
 def test_default_system_prompt(parser):
     """Test parser uses default system prompt when none provided"""
     assert parser._system_prompt == parser._default_system_prompt
+
+
+def test_get_dataset_description(parser):
+    """Test dataset description generation."""
+    description = parser.get_dataset_description()
+
+    assert description.name == "Mostly Basic Python Problems (MBPP)"
+    assert "code generation" in description.purpose.lower()
+    assert "google-research" in description.source
+    assert description.language == "English and Python"
+    assert "task descriptions" in description.format.lower()
+    assert "python solutions" in description.format.lower()
+    assert "1,000" in description.characteristics
+    assert "entry-level programmers" in description.characteristics.lower()
+    assert "3 automated test cases" in description.characteristics
+    assert "hand-verified" in description.characteristics
+    assert "austin2021program" in description.citation
+    assert "Program Synthesis" in description.citation
+
+    # Check additional info
+    assert description.additional_info is not None
+    assert description.additional_info["size"] == "~1,000 programming problems"
+    assert (
+        description.additional_info["splits"]
+        == "Available in full or sanitized versions"
+    )
+    assert (
+        description.additional_info["test_coverage"]
+        == "Each problem includes 3 automated test cases"
+    )
+    assert (
+        description.additional_info["verification"]
+        == "Subset of data has been hand-verified by authors"
+    )
+
+
+def test_get_evaluation_metrics(parser):
+    """Test evaluation metrics generation."""
+    metrics = parser.get_evaluation_metrics()
+
+    # Check total number of metrics
+    assert len(metrics) == 4
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 1
+
+    # Verify specific metrics exist with correct properties
+    metric_names = {m.name for m in metrics}
+    assert "pass@k" in metric_names
+    assert "test_case_success_rate" in metric_names
+    assert "syntax_validity" in metric_names
+
+    # Check specific metric properties
+    pass_k_metric = next(m for m in metrics if m.name == "pass@k")
+    assert pass_k_metric.type == "code_evaluation"
+    assert pass_k_metric.primary is True
+    assert "k generations" in pass_k_metric.description.lower()
+    assert "custom_pass_at_k" in pass_k_metric.implementation
+
+    test_case_metric = next(m for m in metrics if m.name == "test_case_success_rate")
+    assert test_case_metric.type == "code_evaluation"
+    assert test_case_metric.primary is False
+    assert "test cases" in test_case_metric.description.lower()
+    assert "custom_test_success_rate" in test_case_metric.implementation
+
+    syntax_metric = next(m for m in metrics if m.name == "syntax_validity")
+    assert syntax_metric.type == "code_evaluation"
+    assert syntax_metric.primary is False
+    assert "syntactically valid" in syntax_metric.description.lower()
+    assert "custom_syntax_check" in syntax_metric.implementation