Commit: 9ddbb93
Parent: 80d7919
include chrf

Changed files:
- generation_evaluator.py +13 -0
- requirements.txt +3 -1
generation_evaluator.py
CHANGED

@@ -82,6 +82,9 @@ and then employing another pre-training phrase using synthetic data. Finally it
 it for your specific application (the latter is expected to perform better).
 See the project's README at https://github.com/google-research/bleurt#readme for more information.
 
+ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
+and ChrF++ adds word n-grams as well, which correlates more strongly with direct assessment. We use the implementation
+that is already present in sacrebleu.
 """
 
 _KWARGS_DESCRIPTION = """
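For context on the metric being added: the sketch below shows what the sacrebleu implementation referenced in the new docstring computes. It is a minimal sketch assuming only sacrebleu (added to requirements.txt in this commit); the sentences are invented examples, not project data.

# Minimal sketch, assuming sacrebleu; sentences are invented examples.
from sacrebleu.metrics import CHRF

hypotheses = ["the cat sat on the mat"]
references = [["the cat is sitting on the mat"]]  # a single reference stream

chrf = CHRF()                 # defaults: char_order=6, word_order=0, beta=2
chrf_pp = CHRF(word_order=2)  # word_order=2 turns chrF into chrF++

print(chrf.corpus_score(hypotheses, references))     # e.g. "chrF2 = ..."
print(chrf_pp.corpus_score(hypotheses, references))  # e.g. "chrF2++ = ..."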
@@ -118,6 +121,12 @@ BERT_SCORE:{
 },
 BLEURT:{
     "scores": List of scores.
+},
+CHRF:{
+    'score' (float): The chrF (chrF++) score,
+    'char_order' (int): The character n-gram order,
+    'word_order' (int): The word n-gram order. If it equals 2, the metric is referred to as chrF++,
+    'beta' (int): Determines the importance of recall w.r.t. precision
 }
 """
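To make the documented fields concrete: a short sketch (assuming the evaluate package pinned in requirements.txt; strings are invented examples) showing that compute() returns exactly these four keys, and that word_order=2 is what the docstring calls chrF++:

# Sketch of the result dict documented above; example data is invented.
import evaluate

chrf = evaluate.load("chrf")

plain = chrf.compute(
    predictions=["the cat sat on the mat"],
    references=[["the cat is sitting on the mat"]],
)
# e.g. {'score': <float>, 'char_order': 6, 'word_order': 0, 'beta': 2}

plus_plus = chrf.compute(
    predictions=["the cat sat on the mat"],
    references=[["the cat is sitting on the mat"]],
    word_order=2,  # reported as chrF++ per the docstring above
)
# e.g. {'score': <float>, 'char_order': 6, 'word_order': 2, 'beta': 2}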
@@ -180,6 +189,9 @@ class GenerationEvaluator(evaluate.Metric):
 
         mean_bleurt_score = np.mean(bleurt_results['scores'])
         bleurt_results['scores'] = round(mean_bleurt_score, 4)
+
+        chrf = evaluate.load("chrf")
+        chrf_results = chrf.compute(predictions=predictions, references=references)
 
         return {
             "ROUGE": rouge_results,
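Note the asymmetry with BLEURT just above: BLEURT yields per-example scores that the existing code collapses into a rounded mean, whereas the chrF score comes back from sacrebleu already aggregated over the whole corpus, so no averaging step is needed. If one wanted to mirror the rounding as well, a hypothetical follow-up line (not part of this commit) would be:

# Hypothetical, NOT in this commit: round the corpus-level chrF score to
# match the treatment of the BLEURT mean above. `chrf_results` is the dict
# produced by chrf.compute() in the added lines.
chrf_results['score'] = round(chrf_results['score'], 4)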
@@ -187,4 +199,5 @@ class GenerationEvaluator(evaluate.Metric):
             "EXACT_MATCH": exact_match_results,
             "BERT_SCORE": bert_score_results,
             "BLEURT": bleurt_results,
+            "CHRF": chrf_results
         }
requirements.txt
CHANGED

@@ -4,4 +4,6 @@ scikit-learn
 gradio
 bert_score
 git+https://github.com/google-research/bleurt.git
-numpy
+numpy
+git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d
+sacrebleu
|