Commit bac1189
Parent(s): fff21b1
update info
textgen_evaluator.py (CHANGED, +41 -10)
@@ -13,6 +13,25 @@ _CITATION = """\
     url = "https://www.aclweb.org/anthology/W04-1013",
     pages = "74--81",
 }
+\
+@INPROCEEDINGS{Papineni02bleu:a,
+    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
+    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
+    booktitle = {},
+    year = {2002},
+    pages = {311--318}
+}
+@inproceedings{lin-och-2004-orange,
+    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
+    author = "Lin, Chin-Yew and
+      Och, Franz Josef",
+    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
+    month = "aug 23{--}aug 27",
+    year = "2004",
+    address = "Geneva, Switzerland",
+    publisher = "COLING",
+    url = "https://www.aclweb.org/anthology/C04-1072",
+    pages = "501--507",
 """
 
 _DESCRIPTION = """\
@@ -24,31 +43,43 @@ Note that ROUGE is case insensitive, meaning that upper case letters are treated
 
 This metrics is a wrapper around Google Research reimplementation of ROUGE:
 https://github.com/google-research/google-research/tree/master/rouge
+
+BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
+Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is"
+this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
+
+Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations.
+Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
+Neither intelligibility nor grammatical correctness are not taken into account.
+
 """
 
 _KWARGS_DESCRIPTION = """
-Calculates average rouge scores for a list of hypotheses and references
+Calculates average rouge and bleu scores for a list of hypotheses and references
 Args:
     predictions: list of predictions to score. Each prediction
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
-    rouge_types: A list of rouge types to calculate.
-        Valid names:
-        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
-        `"rougeL"`: Longest common subsequence based scoring.
-        `"rougeLSum"`: rougeLsum splits text using `"\n"`.
-        See details in https://github.com/huggingface/datasets/issues/617
-    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
-    use_aggregator: Return aggregates if this is set to True
+
 Returns:
+    ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
     rouge2: rouge_2 (precision, recall, f1),
     rougeL: rouge_l (precision, recall, f1),
     rougeLsum: rouge_lsum (precision, recall, f1)
+    },
+    BLEU:{
+    'bleu': bleu score,
+    'precisions': geometric mean of n-gram precisions,
+    'brevity_penalty': brevity penalty,
+    'length_ratio': ratio of lengths,
+    'translation_length': translation_length,
+    'reference_length': reference_length
+    }
 """
 
-class
+class TextGenEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
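
For context, the combined output that the new docstring describes can be reproduced with the stock `rouge` and `bleu` metrics on the `evaluate` hub, which this wrapper appears to build on. The snippet below is a minimal sketch of that equivalent computation; the nested {"ROUGE": ..., "BLEU": ...} grouping mirrors the docstring's Returns section and is an assumption about how this Space's metric packages its results, not code taken from the commit.

# Sketch: reproduce the documented ROUGE + BLEU output with the standard
# `evaluate` hub metrics. Assumes `pip install evaluate rouge_score nltk`.
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

predictions = ["the cat sat on the mat"]
references = ["the cat sat on the mat"]

rouge_scores = rouge.compute(predictions=predictions, references=references,
                             use_stemmer=True, use_aggregator=True)
# BLEU in `evaluate` accepts one or more references per prediction,
# so each reference is wrapped in its own list here.
bleu_scores = bleu.compute(predictions=predictions,
                           references=[[ref] for ref in references])

# Grouping assumed from the docstring's Returns section.
results = {"ROUGE": rouge_scores, "BLEU": bleu_scores}
print(results["ROUGE"]["rouge1"])          # aggregated rouge-1 score
print(results["BLEU"]["brevity_penalty"])  # 1.0 here, since prediction and reference lengths match

The BLEU fields in the Returns block follow the usual definitions: 'precisions' are the per-order n-gram precisions, and 'brevity_penalty' is 1.0 when the translation is at least as long as the reference and exp(1 - reference_length / translation_length) otherwise.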
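
The hunk above only shows the start of the class (`_info`); the commit's `_compute` is not visible in this diff. As a rough illustration of how a combined `evaluate.Metric` such as `TextGenEvaluator` could produce the documented output, here is a hypothetical `_compute` that simply delegates to the existing `rouge` and `bleu` hub metrics. Everything beyond the names visible in the diff (`TextGenEvaluator`, `_info`, the Returns layout, the module-level `_DESCRIPTION`/`_CITATION`/`_KWARGS_DESCRIPTION` strings) is an assumption, not the Space's actual implementation.

# Hypothetical sketch of the rest of the class; NOT taken from the commit.
# Relies on the module-level _DESCRIPTION, _CITATION and _KWARGS_DESCRIPTION
# strings shown in the diff above.
import datasets
import evaluate


class TextGenEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, predictions, references, use_stemmer=False):
        # Delegate to the stock hub metrics and group the results the way
        # the docstring's Returns section describes (assumed layout).
        rouge = evaluate.load("rouge")
        bleu = evaluate.load("bleu")
        rouge_scores = rouge.compute(
            predictions=predictions, references=references, use_stemmer=use_stemmer
        )
        bleu_scores = bleu.compute(
            predictions=predictions, references=[[ref] for ref in references]
        )
        return {"ROUGE": rouge_scores, "BLEU": bleu_scores}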