# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CharacTER metric, a character-based TER variant, for machine translation."""

import math
from statistics import mean, median
from typing import Iterable, List, Union

import cer
import datasets
from cer import calculate_cer
from datasets import Sequence, Value

import evaluate
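
# NOTE: this module relies on the third-party `cer` package for the actual
# CharacTER computation (`calculate_cer` for single pairs, `calculate_cer_corpus`
# for batches). Assuming the PyPI distribution carries the same name as the
# import, it can be installed with `pip install cer`.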

_CITATION = """\
@inproceedings{wang-etal-2016-character,
    title = "{C}harac{T}er: Translation Edit Rate on Character Level",
    author = "Wang, Weiyue  and
      Peter, Jan-Thorsten  and
      Rosendahl, Hendrik  and
      Ney, Hermann",
    booktitle = "Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers",
    month = aug,
    year = "2016",
    address = "Berlin, Germany",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W16-2342",
    doi = "10.18653/v1/W16-2342",
    pages = "505--510",
}
"""

_DESCRIPTION = """\
CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER). It is
defined as the minimum number of character edits required to adjust a hypothesis until it completely matches the
reference, normalized by the length of the hypothesis sentence. CharacTer calculates the character-level edit
distance while performing the shift edit on word level. Unlike the strict matching criterion in TER, a hypothesis
word is considered to match a reference word, and can be shifted, if the edit distance between them is below a
threshold value. The Levenshtein distance between the reference and the shifted hypothesis sequence is computed on the
character level. In addition, the lengths of hypothesis sequences instead of reference sequences are used for
normalizing the edit distance, which effectively counters the issue that shorter translations normally achieve lower
TER."""
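
# Illustrative sketch (not part of the metric itself): with the `cer` package
# installed, a single hypothesis/reference pair can be scored directly. Both
# sides are passed as lists of word tokens, as done in `_compute` below, and
# the resulting score is normalized by the hypothesis length (the exact value
# depends on the package's edit/shift settings):
#
#     from cer import calculate_cer
#     calculate_cer("this week the saudis denied information".split(),
#                   "saudi arabia denied this week information".split())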

_KWARGS_DESCRIPTION = """
Calculates CharacTER scores for the given predictions against the references.
Args:
    predictions: a list of predictions to score. Each prediction should be a string with
     tokens separated by spaces.
    references: a list of references, one per prediction. You can also pass multiple references
     for each prediction, i.e., a list containing a sublist of references for each prediction. When multiple
     references are given, the lowest (best) score is returned for that prediction-references pair.
     Each reference should be a string with tokens separated by spaces.
    aggregate: one of "mean", "sum", "median" to indicate how the scores of individual sentences should be
     aggregated
    return_all_scores: a boolean indicating whether, in addition to the aggregated score, all individual
     scores should also be returned
Returns:
    cer_score: an aggregated score across all the items, based on 'aggregate'
    cer_scores: (optionally, if 'return_all_scores' evaluates to True) a list of all scores, one per ref/hyp pair
Examples:
    >>> character_mt = evaluate.load("character")
    >>> preds = ["this week the saudis denied information published in the new york times"]
    >>> refs = ["saudi arabia denied this week information published in the american new york times"]
    >>> character_mt.compute(references=refs, predictions=preds)
    {'cer_score': 0.36619718309859156}
    >>> preds = ["this week the saudis denied information published in the new york times",
    ...          "this is in fact an estimate"]
    >>> refs = ["saudi arabia denied this week information published in the american new york times",
    ...         "this is actually an estimate"]
    >>> character_mt.compute(references=refs, predictions=preds, aggregate="sum", return_all_scores=True)
    {'cer_score': 0.6254564423578508, 'cer_scores': [0.36619718309859156, 0.25925925925925924]}
    >>> preds = ["this week the saudis denied information published in the new york times"]
    >>> refs = [["saudi arabia denied this week information published in the american new york times",
    ...          "the saudis have denied new information published in the ny times"]]
    >>> character_mt.compute(references=refs, predictions=preds, aggregate="median", return_all_scores=True)
    {'cer_score': 0.36619718309859156, 'cer_scores': [0.36619718309859156]}
"""


class Character(evaluate.Metric):
    """CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER)."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                # Single reference per prediction: both columns are plain strings
                datasets.Features(
                    {"predictions": Value("string", id="prediction"), "references": Value("string", id="reference")}
                ),
                # Multiple references per prediction: 'references' is a sequence of strings
                datasets.Features(
                    {
                        "predictions": Value("string", id="prediction"),
                        "references": Sequence(Value("string", id="reference"), id="references"),
                    }
                ),
            ],
            homepage="https://github.com/bramvanroy/CharacTER",
            codebase_urls=["https://github.com/bramvanroy/CharacTER", "https://github.com/rwth-i6/CharacTER"],
        )

    def _compute(
        self,
        predictions: Iterable[str],
        references: Union[Iterable[str], Iterable[Iterable[str]]],
        aggregate: str = "mean",
        return_all_scores: bool = False,
    ):
        if aggregate not in ("mean", "sum", "median"):
            raise ValueError("'aggregate' must be one of 'sum', 'mean', 'median'")

        predictions = [p.split() for p in predictions]

        # If the first reference is a string, predictions and references have the same
        # structure (both lists of strings), so there is a single reference per prediction
        if isinstance(references[0], str):
            references = [r.split() for r in references]

            scores_d = cer.calculate_cer_corpus(predictions, references)
            cer_scores: List[float] = scores_d["cer_scores"]

            if aggregate == "sum":
                score = sum(cer_scores)
            elif aggregate == "mean":
                score = scores_d["mean"]
            else:
                score = scores_d["median"]
        else:
            # In the case of multiple references, we just find the "best score",
            # i.e., the reference that the prediction is closest to, i.e. the lowest CharacTER score
            references = [[r.split() for r in refs] for refs in references]

            cer_scores = []
            for pred, refs in zip(predictions, references):
                min_score = math.inf
                for ref in refs:
                    score = calculate_cer(pred, ref)

                    if score < min_score:
                        min_score = score

                cer_scores.append(min_score)

            if aggregate == "sum":
                score = sum(cer_scores)
            elif aggregate == "mean":
                score = mean(cer_scores)
            else:
                score = median(cer_scores)

        # Return the aggregated score, optionally alongside all individual scores
        if return_all_scores:
            return {"cer_score": score, "cer_scores": cer_scores}
        else:
            return {"cer_score": score}
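

if __name__ == "__main__":
    # Minimal usage sketch, assuming the `evaluate` and `cer` packages are installed
    # and that evaluate.load("character") resolves to this module (e.g., via the
    # Hugging Face Hub or a local path). Mirrors the first docstring example above.
    character_mt = evaluate.load("character")
    preds = ["this week the saudis denied information published in the new york times"]
    refs = ["saudi arabia denied this week information published in the american new york times"]
    print(character_mt.compute(predictions=preds, references=refs))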