David Dale committed on
Commit
1bca40f
·
1 Parent(s): e5fc052

Make the LB filterable by languages

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +9 -16
  3. leaderboard.py +103 -0
  4. requirements.txt +0 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -7,12 +7,14 @@
7
  import gradio as gr
8
  from huggingface_hub import whoami
9
  from huggingface_hub.errors import LocalTokenNotFoundError
10
- from gradio_leaderboard import Leaderboard
11
  import argilla as rg
12
  import os
13
  from requests.exceptions import HTTPError
14
  import csv
15
  import pandas as pd
 
 
 
16
 
17
  CLA = """
18
  Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
@@ -389,6 +391,12 @@ You can also find more details on BOUQuET 💐 scientific context and purpose i
389
 
390
  The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
391
 
 
 
 
 
 
 
392
  ### Contribute
393
 
394
  If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
@@ -401,21 +409,6 @@ If you want to contribute dataset translations for a new language or validate ex
401
  """)
402
 
403
 
404
- def leaderboard_tab():
405
- stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
406
- df = stats.groupby(['system', 'level'])[
407
- ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
408
- ].mean().reset_index().sort_values('score_metricx_both')
409
- with gr.Tab("Leaderboard"):
410
- gr.Markdown("The current leaderboard displays performance across all directions in the benchmark.")
411
- gr.Markdown("A smarter leaderboard and the code for reproducing the evaluation will be published soon!")
412
- # Leaderboard(
413
- # value=df,
414
- # select_columns=["system", "level"] + ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref'],
415
- # filter_columns=["system", "level"],
416
- # )
417
- gr.Dataframe(df)
418
-
419
  with gr.Blocks(
420
  css="""
421
  #cla textarea {min-height: 60em;}
 
7
  import gradio as gr
8
  from huggingface_hub import whoami
9
  from huggingface_hub.errors import LocalTokenNotFoundError
 
10
  import argilla as rg
11
  import os
12
  from requests.exceptions import HTTPError
13
  import csv
14
  import pandas as pd
15
+ from collections import defaultdict
16
+
17
+ from leaderboard import leaderboard_tab
18
 
19
  CLA = """
20
  Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
 
391
 
392
  The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
393
 
394
+ ### Leaderboard
395
+
396
+ To see how the various translation systems perform on BOUQuET, refer to the "Leaderboard" tab!
397
+
398
+ If you want another system evaluated, please open a discussion in the "Community" tab.
399
+
400
  ### Contribute
401
 
402
  If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
 
409
  """)
410
 
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  with gr.Blocks(
413
  css="""
414
  #cla textarea {min-height: 60em;}
leaderboard.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import gradio as gr
4
+ import csv
5
+ from collections import defaultdict
6
+
7
+
8
def strip_colname(x, prefix='score_'):
    """Return column name *x* without its leading metric prefix.

    Args:
        x: Column name to shorten.
        prefix: Leading text to drop. Defaults to ``'score_'``, which is
            the behaviour existing one-argument callers rely on.

    Returns:
        ``x`` with a single leading ``prefix`` removed, or ``x`` unchanged
        when it does not start with ``prefix``.
    """
    if prefix and x.startswith(prefix):
        # Slice by the prefix length rather than a hard-coded 6 so the two
        # cannot drift apart.
        return x[len(prefix):]
    return x
12
+
13
+
14
# Markdown snippets rendered by the leaderboard tab.

# Short introduction displayed above the ranking table.
INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.

A smarter leaderboard and the code for reproducing the evaluations will be published soon!
"""

# Pointer to the dataset card section that describes the languages.
LANGS_EXPLANATION = """## Languages
For the description of languages, please refer to https://huggingface.co/datasets/facebook/bouquet#languages.
"""

# One entry per metric column of the ranking table.
# NOTE(review): the xCOMET entry previously had an empty link (`[]()`), which
# renders as nothing. It now points to the COMET framework repository; the
# exact xCOMET checkpoint used for scoring is not stated here — confirm and
# replace with the model card if desired.
METRICS_EXPLANATION = """## Metrics
1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
2. `xcomet_both`: [xCOMET](https://github.com/Unbabel/COMET) score based on both source and reference.
3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
"""

# Placeholder until system descriptions are published.
SYSTEMS_EXPLANATION = """## Systems
Descriptions of the implementation of the systems will come out later.
"""
33
+
34
+
35
def leaderboard_tab():
    """Build the "Leaderboard" tab: level/language filters and a ranked table.

    Reads the scores from ``data/benchmark_stats.tsv`` and shows, per system,
    the mean of each metric over the rows matching the selected level,
    source language and target language.  The table is recomputed whenever
    one of the three dropdowns changes, and the two language dropdowns
    restrict each other to language pairs that occur in the data.

    The TSV must provide the columns ``system``, ``level``, ``src_lang``,
    ``tgt_lang`` and the four ``score_*`` metric columns listed below.

    NOTE(review): this creates Gradio components, so it presumably has to be
    called inside an active ``gr.Blocks`` context — confirm against the caller.
    """
    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)

    metrics = ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
    # Displayed (prefix-stripped) column names, split by direction: MetricX is
    # the one metric where lower is better, so it is coloured with a reversed
    # scale to keep "dark = good" consistent across all columns.
    lower_is_better = [strip_colname('score_metricx_both')]
    higher_is_better = [
        strip_colname(m) for m in metrics if strip_colname(m) not in lower_is_better
    ]
    ALL = "ALL"

    # Index the language pairs present in the data, in both directions.
    lang_src2tgt = defaultdict(set)
    lang_tgt2src = defaultdict(set)
    langs_src = set()
    langs_tgt = set()
    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
        lang_src2tgt[src_lang].add(tgt_lang)
        lang_tgt2src[tgt_lang].add(src_lang)
        langs_src.add(src_lang)
        langs_tgt.add(tgt_lang)

    def lang_choices(langs):
        """Return dropdown choices for *langs*: the ALL entry, then sorted codes."""
        return [ALL] + sorted(langs)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# BOUQuET translation leaderboard")
        gr.Markdown(INTRO)

        gr.Markdown("## Systems ranking")
        # Inputs
        gr_level = gr.Dropdown(
            ["sentence_level", "paragraph_level"], value="sentence_level", label="Level"
        )
        gr_src_lang = gr.Dropdown(lang_choices(langs_src), value=ALL, label="Source lang")
        gr_tgt_lang = gr.Dropdown(lang_choices(langs_tgt), value=ALL, label="Target lang")

        # Interactivity
        inputs = [gr_level, gr_src_lang, gr_tgt_lang]

        def get_lb(level, src_lang, tgt_lang):
            """Return the per-system metric means for the selected filters."""
            filtered = stats[stats["level"].eq(level)]
            if src_lang != ALL:
                filtered = filtered[filtered["src_lang"].eq(src_lang)]
            if tgt_lang != ALL:
                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
            # Ascending MetricX puts the best system first (lower is better).
            means = filtered.groupby(['system'])[metrics].mean().reset_index().sort_values('score_metricx_both')
            means.columns = [strip_colname(c) for c in means.columns]
            if means.empty:
                # An empty selection has nothing to colour; return the plain frame.
                return means
            styler = (
                means.style
                .background_gradient(subset=higher_is_better)
                # "PuBu" is the default colormap; "_r" reverses it for MetricX.
                .background_gradient(subset=lower_is_better, cmap="PuBu_r")
                .format(precision=4)
            )
            return styler

        df_all = get_lb(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(df_all)

        for inp in inputs:
            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)

        # Interdependency of the controls
        def src2tgt(src_lang, tgt_lang):
            """Limit the target choices to those paired with *src_lang*."""
            if src_lang == ALL:
                choices = lang_choices(langs_tgt)
            else:
                choices = lang_choices(lang_src2tgt[src_lang])
            # Keep the current selection when still offered; otherwise reset
            # so the dropdown never holds a value outside its choices.
            value = tgt_lang if tgt_lang in choices else ALL
            return gr.update(choices=choices, value=value)

        def tgt2src(src_lang, tgt_lang):
            """Limit the source choices to those paired with *tgt_lang*."""
            if tgt_lang == ALL:
                choices = lang_choices(langs_src)
            else:
                choices = lang_choices(lang_tgt2src[tgt_lang])
            value = src_lang if src_lang in choices else ALL
            return gr.update(choices=choices, value=value)

        # `.input` fires only on user edits, so the two updates below do not
        # trigger each other.
        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)

        gr.Markdown(LANGS_EXPLANATION)
        gr.Markdown(METRICS_EXPLANATION)
        gr.Markdown(SYSTEMS_EXPLANATION)
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
  argilla
2
  gradio[oauth]
3
  pandas
4
- gradio_leaderboard
 
1
  argilla
2
  gradio[oauth]
3
  pandas