Spaces:
Running
Running
fix metrics for LongEmbed (#124)
Browse files
- init branch (b12b1dc2ff1f79ab6e8f4d7e10fe15eee4c3a0a9)
- small fix (877acad403f31fb7c96c051de7324e9874c47336)
- app.py +15 -2
- config.yaml +1 -0
app.py
CHANGED
|
@@ -116,8 +116,16 @@ for model in pbar:
|
|
| 116 |
ds = ds.map(add_task)
|
| 117 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
| 118 |
# For now only one metric per task - Could add more metrics later on
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
for task, metric in TASK_TO_METRIC.items():
|
| 120 |
-
ds_dict = ds.filter(lambda x: (x
|
| 121 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
| 122 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
| 123 |
|
|
@@ -463,6 +471,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
| 463 |
"data": boards_data[board]["data_tasks"][task_category],
|
| 464 |
"refresh": get_refresh_function(task_category, task_category_list),
|
| 465 |
"credits": credits,
|
|
|
|
| 466 |
})
|
| 467 |
|
| 468 |
dataframes = []
|
|
@@ -618,11 +627,15 @@ with gr.Blocks(css=css) as block:
|
|
| 618 |
# For updating the 'language' in the URL
|
| 619 |
item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
with gr.Row():
|
| 622 |
gr.Markdown(f"""
|
| 623 |
{item['description']}
|
| 624 |
|
| 625 |
-
- **Metric:** {
|
| 626 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
| 627 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
| 628 |
""")
|
|
|
|
| 116 |
ds = ds.map(add_task)
|
| 117 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
| 118 |
# For now only one metric per task - Could add more metrics later on
|
| 119 |
+
|
| 120 |
+
def filter_function(x, task, metric):
|
| 121 |
+
# This is a hack for the passkey and needle retrieval tests, which report ndcg_at_1 (i.e. accuracy) rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
| 122 |
+
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
| 123 |
+
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
| 124 |
+
else:
|
| 125 |
+
return x["mteb_task"] == task and x["metric"] == metric
|
| 126 |
+
|
| 127 |
for task, metric in TASK_TO_METRIC.items():
|
| 128 |
+
ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
|
| 129 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
| 130 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
| 131 |
|
|
|
|
| 471 |
"data": boards_data[board]["data_tasks"][task_category],
|
| 472 |
"refresh": get_refresh_function(task_category, task_category_list),
|
| 473 |
"credits": credits,
|
| 474 |
+
"metric": board_config.get("metric", None),
|
| 475 |
})
|
| 476 |
|
| 477 |
dataframes = []
|
|
|
|
| 627 |
# For updating the 'language' in the URL
|
| 628 |
item_tab.select(update_url_language, [current_task_language, language_per_task], [current_task_language, language_per_task], trigger_mode="always_last").then(None, [current_task_language], [], js=set_window_url_params)
|
| 629 |
|
| 630 |
+
specific_metric = metric
|
| 631 |
+
if item.get("metric", None) is not None:
|
| 632 |
+
specific_metric = item['metric']
|
| 633 |
+
|
| 634 |
with gr.Row():
|
| 635 |
gr.Markdown(f"""
|
| 636 |
{item['description']}
|
| 637 |
|
| 638 |
+
- **Metric:** {specific_metric}
|
| 639 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
| 640 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
| 641 |
""")
|
config.yaml
CHANGED
|
@@ -301,6 +301,7 @@ boards:
|
|
| 301 |
icon: "📚"
|
| 302 |
special_icons: null
|
| 303 |
credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
|
|
|
|
| 304 |
tasks:
|
| 305 |
Retrieval:
|
| 306 |
- LEMBNarrativeQARetrieval
|
|
|
|
| 301 |
icon: "📚"
|
| 302 |
special_icons: null
|
| 303 |
credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
|
| 304 |
+
metric: nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)
|
| 305 |
tasks:
|
| 306 |
Retrieval:
|
| 307 |
- LEMBNarrativeQARetrieval
|