Spaces:
Running
Running
David Dale
committed on
Commit
·
1bca40f
1
Parent(s):
e5fc052
Make the LB filterable by languages
Browse files- .gitignore +1 -0
- app.py +9 -16
- leaderboard.py +103 -0
- requirements.txt +0 -1
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
app.py
CHANGED
|
@@ -7,12 +7,14 @@
|
|
| 7 |
import gradio as gr
|
| 8 |
from huggingface_hub import whoami
|
| 9 |
from huggingface_hub.errors import LocalTokenNotFoundError
|
| 10 |
-
from gradio_leaderboard import Leaderboard
|
| 11 |
import argilla as rg
|
| 12 |
import os
|
| 13 |
from requests.exceptions import HTTPError
|
| 14 |
import csv
|
| 15 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
CLA = """
|
| 18 |
Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
|
|
@@ -389,6 +391,12 @@ You can also find more details on BOUQuET 💐 scientific context and purpose i
|
|
| 389 |
|
| 390 |
The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
|
| 391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
### Contribute
|
| 393 |
|
| 394 |
If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
|
|
@@ -401,21 +409,6 @@ If you want to contribute dataset translations for a new language or validate ex
|
|
| 401 |
""")
|
| 402 |
|
| 403 |
|
| 404 |
-
def leaderboard_tab():
|
| 405 |
-
stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
|
| 406 |
-
df = stats.groupby(['system', 'level'])[
|
| 407 |
-
['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
|
| 408 |
-
].mean().reset_index().sort_values('score_metricx_both')
|
| 409 |
-
with gr.Tab("Leaderboard"):
|
| 410 |
-
gr.Markdown("The current leaderboard displays performance across all directions in the benchmark.")
|
| 411 |
-
gr.Markdown("A smarter leaderboard and the code for reproducing the evaluation will be published soon!")
|
| 412 |
-
# Leaderboard(
|
| 413 |
-
# value=df,
|
| 414 |
-
# select_columns=["system", "level"] + ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref'],
|
| 415 |
-
# filter_columns=["system", "level"],
|
| 416 |
-
# )
|
| 417 |
-
gr.Dataframe(df)
|
| 418 |
-
|
| 419 |
with gr.Blocks(
|
| 420 |
css="""
|
| 421 |
#cla textarea {min-height: 60em;}
|
|
|
|
| 7 |
import gradio as gr
|
| 8 |
from huggingface_hub import whoami
|
| 9 |
from huggingface_hub.errors import LocalTokenNotFoundError
|
|
|
|
| 10 |
import argilla as rg
|
| 11 |
import os
|
| 12 |
from requests.exceptions import HTTPError
|
| 13 |
import csv
|
| 14 |
import pandas as pd
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
|
| 17 |
+
from leaderboard import leaderboard_tab
|
| 18 |
|
| 19 |
CLA = """
|
| 20 |
Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
|
|
|
|
| 391 |
|
| 392 |
The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
|
| 393 |
|
| 394 |
+
### Leaderboard
|
| 395 |
+
|
| 396 |
+
To see how the various translation systems perform on BOUQuET, refer to the "Leaderboard" tab!
|
| 397 |
+
|
| 398 |
+
If you want another system evaluated, please open a discussion in the "Community" tab.
|
| 399 |
+
|
| 400 |
### Contribute
|
| 401 |
|
| 402 |
If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
|
|
|
|
| 409 |
""")
|
| 410 |
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
with gr.Blocks(
|
| 413 |
css="""
|
| 414 |
#cla textarea {min-height: 60em;}
|
leaderboard.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import csv
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def strip_colname(x, prefix='score_'):
    """Return column name ``x`` without a leading ``prefix``.

    Args:
        x: Column name to clean up.
        prefix: Leading text to drop; defaults to ``'score_'``.

    Returns:
        ``x`` with ``prefix`` removed when ``x`` starts with it,
        otherwise ``x`` unchanged.
    """
    if x.startswith(prefix):
        # Slice by the prefix length instead of a hard-coded offset so the
        # two can never drift apart.
        return x[len(prefix):]
    return x
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Markdown snippets rendered on the leaderboard tab.

# Short introduction shown right under the tab title.
INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.

A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""

# Pointer to the language descriptions in the dataset card.
LANGS_EXPLANATION = """## Languages
For the description of languages, please refer to https://huggingface.co/datasets/facebook/bouquet#languages.
"""

# Description of every metric column displayed in the leaderboard table.
# NOTE(review): the xcomet entry originally had an empty Markdown link (`[]()`);
# it now points to the COMET repository. Replace it with the exact xCOMET
# checkpoint that was used once that is confirmed.
METRICS_EXPLANATION = """## Metrics
1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: [xCOMET](https://github.com/Unbabel/COMET) score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""

# Placeholder for the descriptions of the evaluated systems.
SYSTEMS_EXPLANATION = """## Systems
Descriptions of the implementation of the systems will come out later.
"""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _collect_language_pairs(stats):
    """Index the translation directions that occur in ``stats``.

    Returns:
        A tuple ``(src2tgt, tgt2src, langs_src, langs_tgt)``: two mappings
        from a language to the set of languages it is paired with, and the
        sets of all source and all target languages.
    """
    src2tgt = defaultdict(set)
    tgt2src = defaultdict(set)
    langs_src = set()
    langs_tgt = set()
    pairs = stats[["src_lang", "tgt_lang"]].drop_duplicates().values
    for src_lang, tgt_lang in pairs:
        src2tgt[src_lang].add(tgt_lang)
        tgt2src[tgt_lang].add(src_lang)
        langs_src.add(src_lang)
        langs_tgt.add(tgt_lang)
    return src2tgt, tgt2src, langs_src, langs_tgt


def _rank_systems(stats, metrics, level, src_lang, tgt_lang, all_label):
    """Average ``metrics`` per system over the selected rows of ``stats``.

    Rows are filtered by ``level`` and, unless the value equals ``all_label``,
    by ``src_lang`` / ``tgt_lang``. The result is sorted by
    ``score_metricx_both`` in ascending order and the ``score_`` prefix is
    stripped from the column names.

    Returns:
        A pandas ``Styler`` with 4-digit precision; a background gradient is
        added only when there is at least one row to colour.
    """
    selected = stats[stats["level"].eq(level)]
    if src_lang != all_label:
        selected = selected[selected["src_lang"].eq(src_lang)]
    if tgt_lang != all_label:
        selected = selected[selected["tgt_lang"].eq(tgt_lang)]

    means = selected.groupby(['system'])[metrics].mean().reset_index()
    means = means.sort_values('score_metricx_both')
    means.columns = [strip_colname(c) for c in means.columns]

    styler = means.style.format(precision=4)
    # Language choices are collected across all levels, so a level/direction
    # combination may select no rows at all; the gradient needs a min/max of
    # each column and cannot be computed on an empty table.
    if not means.empty:
        styler = styler.background_gradient()
    return styler


def leaderboard_tab():
    """Build the "Leaderboard" tab of the Gradio app.

    Reads the benchmark statistics from ``data/benchmark_stats.tsv`` and
    creates a tab with three dropdowns (level, source language, target
    language) and a table of per-system mean scores that is recomputed
    whenever one of the dropdowns changes. Must be called inside an active
    Gradio layout context, because the components are created on the fly.
    """
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)

    metrics = ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
    ALL = "ALL"  # dropdown entry meaning "do not filter by this language"
    lang_src2tgt, lang_tgt2src, langs_src, langs_tgt = _collect_language_pairs(stats)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)

        gr.Markdown("## Systems ranking")
        # Inputs
        gr_level = gr.Dropdown(
            ["sentence_level", "paragraph_level"], value="sentence_level", label="Level"
        )
        gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
        gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")

        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Recompute the leaderboard table for the current dropdown values."""
            return _rank_systems(stats, metrics, level, src_lang, tgt_lang, ALL)

        # Initial table, computed from the default values of the dropdowns.
        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)

        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls: picking a language on one side
        # narrows the choices on the other side to the directions that exist.
        def src2tgt(src_lang, tgt_lang):
            """Restrict the target choices to those paired with ``src_lang``."""
            if src_lang == ALL:
                choices = [ALL] + sorted(langs_tgt)
            else:
                choices = [ALL] + sorted(lang_src2tgt[src_lang])
            return gr.update(choices=choices, value=tgt_lang)

        def tgt2src(src_lang, tgt_lang):
            """Restrict the source choices to those paired with ``tgt_lang``."""
            if tgt_lang == ALL:
                choices = [ALL] + sorted(langs_src)
            else:
                choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
            return gr.update(choices=choices, value=src_lang)

        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)

        gr.Markdown(LANGS_EXPLANATION)
        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
argilla
|
| 2 |
gradio[oauth]
|
| 3 |
pandas
|
| 4 |
-
gradio_leaderboard
|
|
|
|
| 1 |
argilla
|
| 2 |
gradio[oauth]
|
| 3 |
pandas
|
|
|