Tristan Thrush committed · 888432c · 1 Parent(s): 79668b2

added enforcement for known metric ranges

Files changed:
- app.py +13 -2
- ascending_metrics.py +0 -10
- utils.py +38 -0
app.py CHANGED

@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.repocard import metadata_load
-from ascending_metrics import ascending_metrics
+from utils import ascending_metrics, metric_ranges
 import numpy as np
 from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
 from os.path import exists

@@ -46,6 +46,7 @@ def parse_metrics_rows(meta, only_verified=False):
         if "config" in result["dataset"]:
             row["config"] = result["dataset"]["config"]
         no_results = True
+        incorrect_results = False
         for metric in result["metrics"]:
             name = metric["type"].lower().strip()

@@ -64,10 +65,16 @@ def parse_metrics_rows(meta, only_verified=False):
             if "verified" in metric and metric["verified"]:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
             else:
                 no_results = False
                 row[name] = value
+                if name in metric_ranges:
+                    if value < metric_ranges[name][0] or value > metric_ranges[name][1]:
+                        incorrect_results = True
-        if no_results:
+        if no_results or incorrect_results:
             continue
         yield row

@@ -199,6 +206,10 @@ if len(dataset_df) > 0:
         "Want to beat the leaderboard? Don't see your model here? Simply request an automatic evaluation [here](https://huggingface.co/spaces/autoevaluate/model-evaluator)."
     )

+    st.markdown(
+        "Note: if you do not see your self-reported results here, ensure that your results are in the expected range for all metrics. E.g., accuracy is 0-1, not 0-100."
+    )
+
     # Make the default metric appear right after model names
     cols = dataset_df.columns.tolist()
     cols.remove(sorting_metric)
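For context, a minimal standalone sketch of the check this commit adds inside parse_metrics_rows: a self-reported value is kept only if it falls inside the known range for its metric, otherwise the whole row is skipped. The helper name and the sample rows below are illustrative and not part of the Space's code; only the metric_ranges entries mirror the new utils.py.

# Hedged sketch, not the Space's code: drop self-reported rows whose
# metric values fall outside the known range for that metric.
metric_ranges = {
    "accuracy": (0, 1),
    "f1": (0, 1),
    "perplexity": (0, float("inf")),
}

def row_is_valid(row):
    # Hypothetical helper: True only if every known metric is in range.
    for name, value in row.items():
        if name in metric_ranges:
            low, high = metric_ranges[name]
            if value < low or value > high:
                return False
    return True

rows = [
    {"accuracy": 0.91, "f1": 0.88},  # kept
    {"accuracy": 91.0, "f1": 88.0},  # dropped: reported on a 0-100 scale
]
print([row for row in rows if row_is_valid(row)])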
ascending_metrics.py DELETED

@@ -1,10 +0,0 @@
-ascending_metrics = {
-    "wer",
-    "cer",
-    "loss",
-    "mae",
-    "mahalanobis",
-    "mse",
-    "perplexity",
-    "ter",
-}
utils.py ADDED

@@ -0,0 +1,38 @@
+ascending_metrics = {
+    "wer",
+    "cer",
+    "loss",
+    "mae",
+    "mahalanobis",
+    "mse",
+    "perplexity",
+    "ter",
+}
+
+metric_ranges = {
+    "accuracy": (0,1),
+    "precision": (0,1),
+    "recall": (0,1),
+    "f1": (0,1),
+    "macro f1": (0,1),
+    "micro f1": (0,1),
+    "cer": (0,1),
+    "wer": (0,1),
+    "pearson": (-1, 1),
+    "matthews_correlation": (-1, 1),
+    "spearmanr": (-1, 1),
+    "google_bleu": (0, 1),
+    "precision@10": (0, 1),
+    "mae": (0, 1),
+    "mauve": (0, 1),
+    "frontier_integral": (0, 1),
+    "mean_iou": (0, 1),
+    "mean_accuracy": (0, 1),
+    "overall_accuracy": (0, 1),
+    "meteor": (0, 1),
+    "mse": (0, 1),
+    "perplexity": (0, float("inf")),
+    "rogue1": (0, 1),
+    "rogue2": (0, 1),
+    "sari": (0, 100),
+}
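A hedged usage sketch of how these two tables fit together, assuming the utils.py added above is on the import path; the variable names below are illustrative and not taken from app.py, only the import line mirrors the hunk above. Metrics listed in ascending_metrics sort lower-is-better, and metric_ranges bounds what a self-reported score may be.

from utils import ascending_metrics, metric_ranges

# Sort direction: metrics in ascending_metrics are "lower is better".
metric_name = "wer"
sort_ascending = metric_name in ascending_metrics  # True for wer

# Range enforcement: reject a self-reported score outside the known bounds.
reported_value = 1.7
low, high = metric_ranges.get(metric_name, (float("-inf"), float("inf")))
print(sort_ascending, low <= reported_value <= high)  # True False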