José Ángel González committed · Commit 64096c9 · Parent(s): 9ddbb93
added alignscore and removed bleurt

Files changed:
- app.py +1 -1
- generation_evaluator.py +53 -26
- gradio_tst.py +21 -11
- requirements.txt +4 -2
app.py CHANGED
@@ -2,4 +2,4 @@ import evaluate
 from gradio_tst import launch_gradio_widget2
 
 module = evaluate.load("generation_evaluator.py")
-launch_gradio_widget2(module)
+launch_gradio_widget2(module)
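For context, a minimal sketch (not part of this commit) of calling the loaded module programmatically instead of through the Gradio widget; the example inputs are made up, and the output keys follow the dictionary returned by GenerationEvaluator._compute below:

import evaluate

# Load the combined metric defined in generation_evaluator.py.
module = evaluate.load("generation_evaluator.py")

# compute() forwards to GenerationEvaluator._compute(predictions, references).
results = module.compute(
    predictions=["a cat is sitting on a mat"],
    references=["the cat sat on the mat"],
)

# One entry per metric family computed by the evaluator.
for key in ("ROUGE", "BLEU", "EXACT_MATCH", "BERT_SCORE", "CHRF", "ALIGN_SCORE"):
    print(key, results[key])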
generation_evaluator.py CHANGED
@@ -1,6 +1,8 @@
 import datasets
 import evaluate
 import numpy as np
+import spacy
+from alignscore import AlignScore
 
 _CITATION = """\
 @inproceedings{lin-2004-rouge,
@@ -77,10 +79,8 @@ Moreover, BERTScore computes precision, recall, and F1 measure, which can be use
 generation tasks.
 See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.
 
-
-
-it for your specific application (the latter is expected to perform better).
-See the project's README at https://github.com/google-research/bleurt#readme for more information.
+AlignScore evaluates whether all the information in b is contained in a (b does not contradict a).
+See https://github.com/yuh-zha/AlignScore for more information.
 
 ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
 and ChrF++ adds word n-grams as well which correlates more strongly with direct assessment. We use the implementation
@@ -119,8 +119,8 @@ BERT_SCORE:{
 "f1": F1 score.
 "hashcode": Hashcode of the library.
 },
-
-"
+AlignScore:{
+"score": mean align scores using roberta-large as scorer
 },
 CHRF:{
 'score' (float): The chrF (chrF++) score,
@@ -130,6 +130,14 @@ CHRF:{
 }
 """
 
+ALIGNSCORE_ARGS = {
+    "model": "roberta-large",
+    "batch_size": 32,
+    "device": "cuda",
+    "ckpt_path": "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt",
+    "evaluation_mode": "nli_sp",
+}
+
 
 class GenerationEvaluator(evaluate.Metric):
     def _info(self):
@@ -152,52 +160,71 @@ class GenerationEvaluator(evaluate.Metric):
             ],
         )
 
-    def 
+    def _download_and_prepare(self, dl_manager):
+        # Download Spacy en_core_web_sm model for AlignScore
+        try:
+            spacy.load("en_core_web_sm")
+        except OSError:
+            spacy.cli.download("en_core_web_sm")
+
+        # Download AlignScore checkpoint
+        model_path = dl_manager.download(ALIGNSCORE_ARGS["ckpt_path"])
+        ALIGNSCORE_ARGS["ckpt_path"] = model_path
+        self.align_scorer = AlignScore(**ALIGNSCORE_ARGS)
 
+    def _compute(self, predictions, references):
+        # Compute ROUGE
         rouge_score = evaluate.load("rouge")
 
         rouge_results = rouge_score.compute(
             predictions=predictions, references=references
         )
+
+        # Compute BLEU
         bleu_score = evaluate.load("bleu")
         bleu_results = bleu_score.compute(
             predictions=predictions, references=references
         )
 
+        # Compute Exact Match
         exact_match_score = evaluate.load("exact_match")
         exact_match_results = exact_match_score.compute(
             predictions=predictions, references=references
         )
 
+        # Compute BERTScore
        bert_score = evaluate.load("bertscore")
         bert_score_results = bert_score.compute(
             predictions=predictions, references=references, lang="en"
         )
-
-        mean_precision = np.mean(bert_score_results[
-        mean_recall = np.mean(bert_score_results[
-        mean_f1 = np.mean(bert_score_results[
-
-        bert_score_results[
-        bert_score_results[
-        bert_score_results[
-
-
-
-
+
+        mean_precision = np.mean(bert_score_results["precision"])
+        mean_recall = np.mean(bert_score_results["recall"])
+        mean_f1 = np.mean(bert_score_results["f1"])
+
+        bert_score_results["precision"] = round(mean_precision, 4)
+        bert_score_results["recall"] = round(mean_recall, 4)
+        bert_score_results["f1"] = round(mean_f1, 4)
+
+        # Compute AlignScore
+        align_score = round(
+            np.mean(
+                self.align_scorer.score(contexts=references, claims=predictions)
+            ),
+            4,
         )
-
-
-        bleurt_results['scores'] = round(mean_bleurt_score, 4)
-
+
+        # Compute CHRF
         chrf = evaluate.load("chrf")
-        chrf_results = chrf.compute(
+        chrf_results = chrf.compute(
+            predictions=predictions, references=references
+        )
 
         return {
             "ROUGE": rouge_results,
             "BLEU": bleu_results,
             "EXACT_MATCH": exact_match_results,
             "BERT_SCORE": bert_score_results,
-            "
-            "
+            "CHRF": chrf_results,
+            "ALIGN_SCORE": align_score,
         }
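As a standalone reference, a minimal sketch of driving the newly added AlignScore scorer directly, mirroring ALIGNSCORE_ARGS and the _compute call above; the local checkpoint path and the example sentences are assumptions (the evaluator itself resolves the checkpoint URL through dl_manager):

import numpy as np
from alignscore import AlignScore

# Same settings as ALIGNSCORE_ARGS, except ckpt_path points at a locally
# downloaded copy of AlignScore-large.ckpt instead of the Hugging Face URL.
scorer = AlignScore(
    model="roberta-large",
    batch_size=32,
    device="cuda",
    ckpt_path="AlignScore-large.ckpt",
    evaluation_mode="nli_sp",
)

references = ["The cat sat on the mat."]
predictions = ["A cat is sitting on a mat."]

# score(contexts=..., claims=...) returns one alignment score per pair;
# generation_evaluator.py reports the rounded mean as ALIGN_SCORE.
scores = scorer.score(contexts=references, claims=predictions)
print(round(float(np.mean(scores)), 4))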
gradio_tst.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 import re
 import sys
@@ -7,10 +8,6 @@ from pathlib import Path
 import numpy as np
 from datasets import Value
 
-import logging
-
-
-
 REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
 
 
@@ -27,7 +24,9 @@ def infer_gradio_input_types(feature_types):
     for feature_type in feature_types:
         input_type = "json"
         if isinstance(feature_type, Value):
-            if feature_type.dtype.startswith(
+            if feature_type.dtype.startswith(
+                "int"
+            ) or feature_type.dtype.startswith("float"):
                 input_type = "number"
             elif feature_type.dtype == "string":
                 input_type = "str"
@@ -59,9 +58,13 @@ def parse_gradio_data(data, input_types):
     data.dropna(inplace=True)
     for feature_name, input_type in zip(data, input_types):
         if input_type == "json":
-            metric_inputs[feature_name] = [
+            metric_inputs[feature_name] = [
+                json.loads(d) for d in data[feature_name].to_list()
+            ]
         elif input_type == "str":
-            metric_inputs[feature_name] = [
+            metric_inputs[feature_name] = [
+                d.strip('"') for d in data[feature_name].to_list()
+            ]
         else:
             metric_inputs[feature_name] = data[feature_name]
     return metric_inputs
@@ -79,9 +82,13 @@ def parse_test_cases(test_cases, feature_names, input_types):
         parsed_cases = []
         for feat, input_type in zip(feature_names, input_types):
             if input_type == "json":
-                parsed_cases.append(
+                parsed_cases.append(
+                    [str(element) for element in test_case[feat]]
+                )
             elif input_type == "str":
-                parsed_cases.append(
+                parsed_cases.append(
+                    ['"' + element + '"' for element in test_case[feat]]
+                )
             else:
                 parsed_cases.append(test_case[feat])
         examples.append([list(i) for i in zip(*parsed_cases)])
@@ -94,7 +101,9 @@ def launch_gradio_widget2(metric):
     try:
         import gradio as gr
     except ImportError as error:
-        logging.error(
+        logging.error(
+            "To create a metric widget with Gradio make sure gradio is installed."
+        )
         raise error
 
     local_path = Path(sys.path[0])
@@ -118,7 +127,8 @@ def launch_gradio_widget2(metric):
         ),
         outputs=gr.Textbox(label=metric.name),
         description=(
-            metric.info.description
+            metric.info.description
+            + "\nIf this is a text-based metric, make sure to wrap you input in double quotes."
             " Alternatively you can use a JSON-formatted list as input."
         ),
         title=f"Metric: {metric.name}",
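To illustrate the input convention the widget expects after this change, a small sketch with a hypothetical parse_cell helper and made-up values that mirrors the per-column branches of parse_gradio_data: plain string cells arrive wrapped in double quotes, while JSON cells hold lists:

import json

def parse_cell(value, input_type):
    # Mirrors the "json" and "str" branches added to parse_gradio_data.
    if input_type == "json":
        return json.loads(value)  # '["ref a", "ref b"]' -> ['ref a', 'ref b']
    if input_type == "str":
        return value.strip('"')   # '"a prediction"' -> 'a prediction'
    return value

print(parse_cell('"a prediction"', "str"))
print(parse_cell('["ref a", "ref b"]', "json"))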
requirements.txt CHANGED
@@ -3,7 +3,9 @@ datasets
 scikit-learn
 gradio
 bert_score
-
+rouge_score
 numpy
 git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
-sacrebleu
+sacrebleu
+git+ssh://git@github.com/yuh-zha/AlignScore.git
+spacy