Spaces:

symanto
/

generation_evaluator

Running

App Files Files Community

HalteroXHunter committed on Jun 21, 2024

Commit

c418edf

1 Parent(s): 553023f

add bleurt and bertscore

Browse files

Files changed (1) hide show

generation_evaluator.py +57 -4

generation_evaluator.py CHANGED Viewed

@@ -32,6 +32,21 @@ _CITATION = """\
     publisher = "COLING",
     url = "https://www.aclweb.org/anthology/C04-1072",
     pages = "501--507",
 """
 _DESCRIPTION = """\
@@ -54,6 +69,18 @@ Neither intelligibility nor grammatical correctness is taken into account.
 EXACT MATCH: Returns the rate at which the input predicted strings exactly match their references, ignoring any strings input as part of the regexes_to_ignore list.
 """
 _KWARGS_DESCRIPTION = """
@@ -63,7 +90,7 @@ Args:
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
         reference should be a string with tokens separated by spaces.
 Returns:
 ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
@@ -81,9 +108,19 @@ BLEU:{
 },
 EXACT_MATCH:{
     "exact_match": exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
 }
 """
 class GenerationEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
@@ -116,11 +153,27 @@ class GenerationEvaluator(evaluate.Metric):
         bleu_results = bleu_score.compute(
             predictions=predictions, references=references
         )
         exact_match_score = evaluate.load("exact_match")
         exact_match_results = exact_match_score.compute(
             predictions=predictions, references=references
         )
-        return {"ROUGE": rouge_results, "BLEU": bleu_results, "EXACT_MATCH": exact_match_results}

     publisher = "COLING",
     url = "https://www.aclweb.org/anthology/C04-1072",
     pages = "501--507",
+\
+@inproceedings{bert-score,
+  title={BERTScore: Evaluating Text Generation with BERT},
+  author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
+  booktitle={International Conference on Learning Representations},
+  year={2020},
+  url={https://openreview.net/forum?id=SkeHuCVFDr}
+}
+\
+@inproceedings{bleurt,
+  title={BLEURT: Learning Robust Metrics for Text Generation},
+  author={Thibault Sellam and Dipanjan Das and Ankur P. Parikh},
+  booktitle={ACL},
+  year={2020},
+  url={https://arxiv.org/abs/2004.04696}
+}
 """
 _DESCRIPTION = """\
 EXACT MATCH: Returns the rate at which the input predicted strings exactly match their references, ignoring any strings input as part of the regexes_to_ignore list.
+BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference
+sentences by cosine similarity.
+It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
+Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language
+generation tasks.
+See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.
+BLEURT is a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning starting from a pretrained BERT model (Devlin et al. 2018)
+and then employing another pre-training phase using synthetic data. Finally, it is trained on WMT human annotations. You may run BLEURT out-of-the-box or fine-tune
+it for your specific application (the latter is expected to perform better).
+See the project's README at https://github.com/google-research/bleurt#readme for more information.
 """
 _KWARGS_DESCRIPTION = """
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
         reference should be a string with tokens separated by spaces.
 Returns:
 ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
 },
 EXACT_MATCH:{
     "exact_match": exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
+},
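+BERT_SCORE:{
+    "precision": list of precision values, one per prediction/reference pair.
+    "recall": list of recall values, one per prediction/reference pair.
+    "f1": list of F1 scores, one per prediction/reference pair.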
+    "hashcode": Hashcode of the library.
+},
+BLEURT:{
+    "scores": list of BLEURT scores, one per prediction/reference pair.
 }
 """
 class GenerationEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
         bleu_results = bleu_score.compute(
             predictions=predictions, references=references
         )
         exact_match_score = evaluate.load("exact_match")
         exact_match_results = exact_match_score.compute(
             predictions=predictions, references=references
         )
+        bert_score = evaluate.load("bert_score")
+        bert_score_results = bert_score.compute(
+            predictions=predictions, references=references,
+            lang="en"
+        )
+        bleurt_score = evaluate.load("bleurt", module_type="metric")
+        bleurt_results = bleurt_score.compute(
+            predictions=predictions, references=references
+        )
+        return {
+            "ROUGE": rouge_results,
+            "BLEU": bleu_results,
+            "EXACT_MATCH": exact_match_results,
+            "BERT_SCORE":bert_score_results,
+            "BLEURT":bleurt_results
+        }