Commit bac1189
Parent(s): fff21b1
update info
textgen_evaluator.py (CHANGED, +41 -10)
@@ -13,6 +13,25 @@ _CITATION = """\
     url = "https://www.aclweb.org/anthology/W04-1013",
     pages = "74--81",
 }
+\
+@INPROCEEDINGS{Papineni02bleu:a,
+    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
+    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
+    booktitle = {},
+    year = {2002},
+    pages = {311--318}
+}
+@inproceedings{lin-och-2004-orange,
+    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
+    author = "Lin, Chin-Yew and
+      Och, Franz Josef",
+    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
+    month = "aug 23{--}aug 27",
+    year = "2004",
+    address = "Geneva, Switzerland",
+    publisher = "COLING",
+    url = "https://www.aclweb.org/anthology/C04-1072",
+    pages = "501--507",
 """
 
 _DESCRIPTION = """\
@@ -24,31 +43,43 @@ Note that ROUGE is case insensitive, meaning that upper case letters are treated
 
 This metrics is a wrapper around Google Research reimplementation of ROUGE:
 https://github.com/google-research/google-research/tree/master/rouge
+
+BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
+Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is"
+this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
+
+Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations.
+Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
+Neither intelligibility nor grammatical correctness are not taken into account.
+
 """
 
 _KWARGS_DESCRIPTION = """
-Calculates average rouge scores for a list of hypotheses and references
+Calculates average rouge and bleu scores for a list of hypotheses and references
 Args:
     predictions: list of predictions to score. Each prediction
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
-    rouge_types: A list of rouge types to calculate.
-        Valid names:
-        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
-        `"rougeL"`: Longest common subsequence based scoring.
-        `"rougeLSum"`: rougeLsum splits text using `"\n"`.
-        See details in https://github.com/huggingface/datasets/issues/617
-    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
-    use_aggregator: Return aggregates if this is set to True
+
 Returns:
+    ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
     rouge2: rouge_2 (precision, recall, f1),
     rougeL: rouge_l (precision, recall, f1),
     rougeLsum: rouge_lsum (precision, recall, f1)
+    },
+    BLEU:{
+    'bleu': bleu score,
+    'precisions': geometric mean of n-gram precisions,
+    'brevity_penalty': brevity penalty,
+    'length_ratio': ratio of lengths,
+    'translation_length': translation_length,
+    'reference_length': reference_length
+    }
 """
 
-class
+class TextGenEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
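
For context, the combined output that the new docstring describes can be reproduced with the stock `rouge` and `bleu` metrics on the `evaluate` hub, which this wrapper appears to build on. The snippet below is a minimal sketch of that equivalent computation; the nested {"ROUGE": ..., "BLEU": ...} grouping mirrors the docstring's Returns section and is an assumption about how this Space's metric packages its results, not code taken from the commit.

# Sketch: reproduce the documented ROUGE + BLEU output with the standard
# `evaluate` hub metrics. Assumes `pip install evaluate rouge_score nltk`.
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

predictions = ["the cat sat on the mat"]
references = ["the cat sat on the mat"]

rouge_scores = rouge.compute(predictions=predictions, references=references,
                             use_stemmer=True, use_aggregator=True)
# BLEU in `evaluate` accepts one or more references per prediction,
# so each reference is wrapped in its own list here.
bleu_scores = bleu.compute(predictions=predictions,
                           references=[[ref] for ref in references])

# Grouping assumed from the docstring's Returns section.
results = {"ROUGE": rouge_scores, "BLEU": bleu_scores}
print(results["ROUGE"]["rouge1"])          # aggregated rouge-1 score
print(results["BLEU"]["brevity_penalty"])  # 1.0 here, since prediction and reference lengths match

The BLEU fields in the Returns block follow the usual definitions: 'precisions' are the per-order n-gram precisions, and 'brevity_penalty' is 1.0 when the translation is at least as long as the reference and exp(1 - reference_length / translation_length) otherwise.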
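
The hunk above only shows the start of the class (`_info`); the commit's `_compute` is not visible in this diff. As a rough illustration of how a combined `evaluate.Metric` such as `TextGenEvaluator` could produce the documented output, here is a hypothetical `_compute` that simply delegates to the existing `rouge` and `bleu` hub metrics. Everything beyond the names visible in the diff (`TextGenEvaluator`, `_info`, the Returns layout, the module-level `_DESCRIPTION`/`_CITATION`/`_KWARGS_DESCRIPTION` strings) is an assumption, not the Space's actual implementation.

# Hypothetical sketch of the rest of the class; NOT taken from the commit.
# Relies on the module-level _DESCRIPTION, _CITATION and _KWARGS_DESCRIPTION
# strings shown in the diff above.
import datasets
import evaluate


class TextGenEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, predictions, references, use_stemmer=False):
        # Delegate to the stock hub metrics and group the results the way
        # the docstring's Returns section describes (assumed layout).
        rouge = evaluate.load("rouge")
        bleu = evaluate.load("bleu")
        rouge_scores = rouge.compute(
            predictions=predictions, references=references, use_stemmer=use_stemmer
        )
        bleu_scores = bleu.compute(
            predictions=predictions, references=[[ref] for ref in references]
        )
        return {"ROUGE": rouge_scores, "BLEU": bleu_scores}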