Spaces:

facebook
/

bouquet

Running

App Files Files Community

David Dale committed on 6 days ago

Commit

1bca40f

1 Parent(s): e5fc052

Make the LB filterable by languages

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +9 -16
leaderboard.py +103 -0
requirements.txt +0 -1

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py CHANGED Viewed

@@ -7,12 +7,14 @@
 import gradio as gr
 from huggingface_hub import whoami
 from huggingface_hub.errors import LocalTokenNotFoundError
-from gradio_leaderboard import Leaderboard
 import argilla as rg
 import os
 from requests.exceptions import HTTPError
 import csv
 import pandas as pd
 CLA = """
                    Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
@@ -389,6 +391,12 @@ You can also find more details on BOUQuET 💐  scientific context and purpose i
 The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
 ### Contribute
 If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
@@ -401,21 +409,6 @@ If you want to contribute dataset translations for a new language or validate ex
         """)
-def leaderboard_tab():
-    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
-    df = stats.groupby(['system', 'level'])[
-        ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
-    ].mean().reset_index().sort_values('score_metricx_both')
-    with gr.Tab("Leaderboard"):
-        gr.Markdown("The current leaderboard displays performance across all directions in the benchmark.")
-        gr.Markdown("A smarter leaderboard and the code for reproducing the evaluation will be published soon!")
-        # Leaderboard(
-        #     value=df,
-        #     select_columns=["system", "level"] +  ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref'],
-        #     filter_columns=["system", "level"],
-        # )
-        gr.Dataframe(df)
 with gr.Blocks(
     css="""
     #cla textarea {min-height: 60em;}

 import gradio as gr
 from huggingface_hub import whoami
 from huggingface_hub.errors import LocalTokenNotFoundError
 import argilla as rg
 import os
 from requests.exceptions import HTTPError
 import csv
 import pandas as pd
+from collections import defaultdict
+from leaderboard import leaderboard_tab
 CLA = """
                    Meta Platforms, Inc. Individual Contributor License Agreement ("Agreement"), v2.0
 The dataset is accessible at https://huggingface.co/datasets/facebook/bouquet. We are going to update it regularly, as the contributions in new languages are completed and validated.
+### Leaderboard
+To see how the various translation systems perform on BOUQuET, refer to the "Leaderboard" tab!
+If you want another system evaluated, please open a discussion in the "Community" tab.
 ### Contribute
 If you want to contribute dataset translations for a new language or validate existing translations, check out our crowdsourcing system: https://bouquet.metademolab.com.
         """)
 with gr.Blocks(
     css="""
     #cla textarea {min-height: 60em;}

leaderboard.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import pandas as pd
+import gradio as gr
+import csv
+from collections import defaultdict
+def strip_colname(x):
+    """Return `x` without a leading 'score_' prefix; any other name is returned unchanged."""
+    if x.startswith('score_'):
+        # 6 == len('score_'): slice off exactly the prefix.
+        return x[6:]
+    return x
+# Markdown shown under the leaderboard title (rendered via gr.Markdown in leaderboard_tab).
+INTRO = """The current leaderboard displays performance across all filtered directions based on the dev subset of BOUQuET.
+A smarter leaderboard and the code for reproducing the evaluations will be published soon!
+"""
+# Markdown section pointing to the description of the languages.
+LANGS_EXPLANATION = """## Languages
+For the description of languages, please refer to https://huggingface.co/datasets/facebook/bouquet#languages.
+"""
+# Markdown section describing the metric columns of the leaderboard table.
+# NOTE(review): the `xcomet_both` entry below contains an empty Markdown link ("[]()"),
+# which renders as nothing — the model name/URL still needs to be filled in.
+METRICS_EXPLANATION = """## Metrics
+1. `metricx_both`: [google/metricx-24-hybrid-xl-v2p6](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6) score based on both source and reference. **Attention: lower is better!**
+2. `xcomet_both`: []() score based on both source and reference.
+3. `CHRFpp`: ChrF++ score ([sacrebleu](https://github.com/mjpost/sacrebleu) implementation) based on reference.
+4. `glotlid_ref`: probability, as predicted by the [GlotLID model](https://huggingface.co/cis-lmu/glotlid), that translation and reference are in the same language.
+"""
+# Markdown placeholder section for the descriptions of the evaluated systems.
+SYSTEMS_EXPLANATION = """## Systems
+Descriptions of the implementation of the systems will come out later.
+"""
+def leaderboard_tab():
+    """Build the "Leaderboard" Gradio tab with a language-filterable system ranking.
+
+    Reads "data/benchmark_stats.tsv" (tab-separated, no quote handling) and uses its
+    `level`, `src_lang`, `tgt_lang`, `system` columns plus the four `score_*` metric
+    columns listed in `metrics`. Creates a `gr.Tab` holding three dropdowns (level,
+    source language, target language), a table of per-system metric means for the
+    current selection, and the explanatory Markdown sections.
+
+    Returns nothing; the components and event listeners are created as a side effect.
+    """
+    stats = pd.read_csv("data/benchmark_stats.tsv", sep="\t", quoting=csv.QUOTE_NONE)
+    metrics = ['score_metricx_both', 'score_xcomet_both', 'score_CHRFpp', 'score_glotlid_ref']
+    # Sentinel dropdown value meaning "do not filter by this language".
+    ALL = "ALL"
+    # For every source language, the set of target languages it is paired with in the
+    # stats (and vice versa), plus the sets of all source / all target languages.
+    lang_src2tgt = defaultdict(set)
+    lang_tgt2src = defaultdict(set)
+    langs_src = set()
+    langs_tgt = set()
+    # Iterate over the unique (src_lang, tgt_lang) pairs present in the stats.
+    for src_lang, tgt_lang in stats[["src_lang", "tgt_lang"]].drop_duplicates().values:
+        lang_src2tgt[src_lang].add(tgt_lang)
+        lang_tgt2src[tgt_lang].add(src_lang)
+        langs_src.add(src_lang)
+        langs_tgt.add(tgt_lang)
+    with gr.Tab("Leaderboard"):
+        gr.Markdown("# BOUQuET translation leaderboard")
+        gr.Markdown(INTRO)
+        gr.Markdown("## Systems ranking")
+        # Inputs
+        gr_level = gr.Dropdown(
+            ["sentence_level", "paragraph_level"], value="sentence_level", label="Level"
+        )
+        gr_src_lang = gr.Dropdown([ALL] + sorted(langs_src), value=ALL, label="Source lang")
+        gr_tgt_lang = gr.Dropdown([ALL] + sorted(langs_tgt), value=ALL, label="Target lang")
+        # Interactivity
+        inputs = [gr_level, gr_src_lang, gr_tgt_lang]
+        def get_lb(level, src_lang, tgt_lang):
+            """Return a styled table of per-system metric means for the given filters."""
+            filtered = stats[stats["level"].eq(level)]
+            # A language filter is applied only when something other than ALL is selected.
+            if src_lang != ALL:
+                filtered = filtered[filtered["src_lang"].eq(src_lang)]
+            if tgt_lang != ALL:
+                filtered = filtered[filtered["tgt_lang"].eq(tgt_lang)]
+            # Average every metric per system; rows are ordered by ascending metricx_both,
+            # which METRICS_EXPLANATION marks as lower-is-better.
+            means = filtered.groupby(['system'])[metrics].mean().reset_index().sort_values('score_metricx_both')
+            # Show shorter column names, without the 'score_' prefix.
+            means.columns = [strip_colname(c) for c in means.columns]
+            styler = means.style.background_gradient().format(precision=4)
+            return styler
+        # Initial table, computed from the dropdowns' initial values.
+        df_all = get_lb(*[inp.value for inp in inputs])
+        gr_df = gr.Dataframe(df_all)
+        # Recompute the table whenever any of the three dropdowns changes.
+        for inp in inputs:
+            inp.change(fn=get_lb, inputs=inputs, outputs=gr_df)
+        # Interdependency of the controls: picking a language on one side restricts the
+        # choices offered on the other side to the languages it is paired with.
+        def src2tgt(src_lang, tgt_lang):
+            """Return an update of the target dropdown's choices for the chosen source."""
+            if src_lang == ALL:
+                choices = [ALL] + sorted(langs_tgt)
+            else:
+                choices = [ALL] + sorted(lang_src2tgt[src_lang])
+            # The currently selected target value is passed back unchanged.
+            return gr.update(choices=choices, value=tgt_lang)
+        def tgt2src(src_lang, tgt_lang):
+            """Return an update of the source dropdown's choices for the chosen target."""
+            if tgt_lang == ALL:
+                choices = [ALL] + sorted(langs_src)
+            else:
+                choices = [ALL] + sorted(lang_tgt2src[tgt_lang])
+            # The currently selected source value is passed back unchanged.
+            return gr.update(choices=choices, value=src_lang)
+        # NOTE(review): these use the `input` event rather than `change`; presumably so
+        # that only user edits (not the programmatic updates above) trigger them — confirm
+        # against the Gradio event-listener documentation.
+        gr_src_lang.input(fn=src2tgt, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_tgt_lang)
+        gr_tgt_lang.input(fn=tgt2src, inputs=[gr_src_lang, gr_tgt_lang], outputs=gr_src_lang)
+        gr.Markdown(LANGS_EXPLANATION)
+        gr.Markdown(METRICS_EXPLANATION)
+        gr.Markdown(SYSTEMS_EXPLANATION)

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
 argilla
 gradio[oauth]
 pandas
-gradio_leaderboard

 argilla
 gradio[oauth]
 pandas