resolved conflict
Files changed:
- .github/workflows/sync_with_spaces.yml  +2 -1
- app.py  +21 -23
.github/workflows/sync_with_spaces.yml
CHANGED

@@ -16,4 +16,5 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run:
+        run: |
+          git push https://lewtun:$HF_TOKEN@huggingface.co/spaces/autoevaluate/autoevaluate main
app.py
CHANGED

@@ -41,12 +41,12 @@ TASK_TO_DEFAULT_METRICS = {
     "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum", "gen_len"],
 }
 
-
+SUPPORTED_TASKS = list(TASK_TO_ID.keys())
 
 @st.cache
 def get_supported_metrics():
     metrics = list_metrics()
-    supported_metrics =
+    supported_metrics = []
     for metric in tqdm(metrics):
         try:
             metric_func = load_metric(metric)
@@ -71,7 +71,7 @@ def get_supported_metrics():
                 break
 
         if defaults:
-            supported_metrics
+            supported_metrics.append(metric)
     return supported_metrics
 
 supported_metrics = get_supported_metrics()
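Net effect of the two hunks above: get_supported_metrics now collects metric names into a plain list and appends each metric that passes the probe. A minimal sketch of that pattern follows; the probing logic between lines 52 and 71 is not shown in this diff, so the body of the try block here is an assumption, not the app's actual code.

from datasets import list_metrics, load_metric
from tqdm import tqdm
import streamlit as st

@st.cache
def get_supported_metrics():
    metrics = list_metrics()
    supported_metrics = []
    for metric in tqdm(metrics):
        try:
            metric_func = load_metric(metric)
        except Exception:
            # Metrics that fail to load (missing dependencies, network errors, ...) are skipped
            continue
        # app.py inspects metric_func's compute() arguments here (lines 52-71, not shown in the diff);
        # this sketch simply treats every metric that loads as supported
        defaults = metric_func is not None
        if defaults:
            supported_metrics.append(metric)
    return supported_metrics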
@@ -102,7 +102,6 @@ selected_dataset = st.selectbox("Select a dataset", all_datasets, index=all_data
 st.experimental_set_query_params(**{"dataset": [selected_dataset]})
 
 
-# TODO: In general this will be a list of multiple configs => need to generalise logic here
 metadata = get_metadata(selected_dataset)
 if metadata is None:
     st.warning("No evaluation metadata found. Please configure the evaluation job below.")
@@ -111,8 +110,8 @@ with st.expander("Advanced configuration"):
     ## Select task
     selected_task = st.selectbox(
         "Select a task",
-
-        index=
+        SUPPORTED_TASKS,
+        index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
     )
     ### Select config
     configs = get_dataset_config_names(selected_dataset)
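The `index=... if metadata is not None else 0` pattern preselects the task recorded in the dataset's evaluation metadata and falls back to the first entry otherwise. A small illustration, with made-up task names and a metadata shape taken from the hunk above:

SUPPORTED_TASKS = ["binary_classification", "extractive_question_answering", "summarization"]  # example values
metadata = [{"task_id": "summarization"}]  # shape assumed from the diff above

default_index = SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0
print(default_index)  # 2 -> the selectbox opens with "summarization" preselected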
@@ -136,7 +135,7 @@ with st.expander("Advanced configuration"):
     ## Select columns
     rows_resp = http_get(
         path="/rows",
-        domain=
+        domain=DATASETS_PREVIEW_API,
        params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
     ).json()
     col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
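The col_names line relies on pd.json_normalize flattening nested row fields into dot-separated column names; a short, self-contained example with an illustrative SQuAD-style row:

import pandas as pd

# One SQuAD-style row as the preview API might return it (values are illustrative)
row = {
    "context": "The sky is blue.",
    "question": "What colour is the sky?",
    "answers": {"text": ["blue"], "answer_start": [11]},
}

# json_normalize flattens nested fields into dot-separated columns, which is why
# names like "answers.text" and "answers.answer_start" appear in col_names
col_names = list(pd.json_normalize(row).columns)
print(col_names)  # ['context', 'question', 'answers.text', 'answers.answer_start']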
@@ -236,6 +235,9 @@ with st.expander("Advanced configuration"):
         col_mapping[target_col] = "target"
 
     elif selected_task == "extractive_question_answering":
+        col_mapping = metadata[0]["col_mapping"]
+        # Hub YAML parser converts periods to hyphens, so we remap them here
+        col_mapping = {k.replace("-", "."): v.replace("-", ".") for k, v in col_mapping.items()}
         with col1:
             st.markdown("`context` column")
             st.text("")
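The remapping comment refers to dotted column names such as answers.text coming back from the Hub metadata with hyphens; a quick illustration of the dict comprehension added above (the metadata values are made up):

# col_mapping as it arrives from the Hub metadata, with periods turned into hyphens
col_mapping = {
    "context": "context",
    "question": "question",
    "answers-text": "answers-text",
    "answers-answer_start": "answers-answer_start",
}

# Restore the dotted column names on both keys and values
col_mapping = {k.replace("-", "."): v.replace("-", ".") for k, v in col_mapping.items()}
print(col_mapping["answers.text"])  # answers.text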
@@ -257,26 +259,22 @@ with st.expander("Advanced configuration"):
             context_col = st.selectbox(
                 "This column should contain the question's context",
                 col_names,
-                index=col_names.index(get_key(
+                index=col_names.index(get_key(col_mapping, "context")) if metadata is not None else 0,
             )
             question_col = st.selectbox(
                 "This column should contain the question to be answered, given the context",
                 col_names,
-                index=col_names.index(get_key(
+                index=col_names.index(get_key(col_mapping, "question")) if metadata is not None else 0,
             )
             answers_text_col = st.selectbox(
                 "This column should contain example answers to the question, extracted from the context",
                 col_names,
-                index=col_names.index(get_key(
-                if metadata is not None
-                else 0,
+                index=col_names.index(get_key(col_mapping, "answers.text")) if metadata is not None else 0,
             )
             answers_start_col = st.selectbox(
                 "This column should contain the indices in the context of the first character of each answers.text",
                 col_names,
-                index=col_names.index(get_key(
-                if metadata is not None
-                else 0,
+                index=col_names.index(get_key(col_mapping, "answers.answer_start")) if metadata is not None else 0,
             )
             col_mapping[context_col] = "context"
             col_mapping[question_col] = "question"
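get_key itself is not shown in this diff; from the calls above it is presumably a reverse lookup that returns the dataset column mapped to a given canonical name, so col_names.index(...) can preselect it. A plausible sketch under that assumption (behaviour assumed, not taken from the app):

def get_key(col_mapping, val):
    # Assumed behaviour: return the dataset column (key) whose canonical name (value) matches val
    for key, value in col_mapping.items():
        if value == val:
            return key
    return None

col_mapping = {"passage": "context", "query": "question"}  # example mapping
print(get_key(col_mapping, "question"))  # query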
@@ -287,19 +285,19 @@ with st.form(key="form"):
 
     compatible_models = get_compatible_models(selected_task, selected_dataset)
     st.markdown("The following metrics will be computed")
-    html_string = " ".join([
+    html_string = " ".join([
+        "<div style=\"padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left\">"
+        + "<div style=\"background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;padding-left:5px;color:white\">"
+        + metric + "</div></div>" for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+    ])
     st.markdown(html_string, unsafe_allow_html=True)
     selected_metrics = st.multiselect(
         "(Optional) Select additional metrics",
-        list(set(supported_metrics
+        list(set(supported_metrics) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
     )
-
-    argument_string = ", ".join(["-".join(key, value) for key, value in supported_metrics[metric].items()])
-    st.info(f"Note! The arguments for {metric_name} are: {argument_string}")
+    st.info("Note: user-selected metrics will be run with their default arguments from [here](https://github.com/huggingface/datasets/tree/master/metrics)")
     selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
-    print("Selected models:", selected_models)
     submit_button = st.form_submit_button("Make submission")
-
     if submit_button:
         project_id = str(uuid.uuid4())[:3]
         payload = {
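The multiselect above is fed the set difference between everything get_supported_metrics found and the task's default metrics, so defaults are not offered twice. A small illustration with example values:

supported_metrics = ["accuracy", "f1", "rouge1", "rouge2"]  # example output of get_supported_metrics()
default_metrics = ["rouge1", "rouge2"]                      # e.g. TASK_TO_DEFAULT_METRICS["summarization"]

# Offer only metrics not already computed by default; note the set conversion
# drops duplicates and does not preserve the original ordering
additional = list(set(supported_metrics) - set(default_metrics))
print(sorted(additional))  # ['accuracy', 'f1']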
@@ -355,7 +353,7 @@ with st.form(key="form"):
         f"""
         Evaluation takes appoximately 1 hour to complete, so grab a ☕ or 🍵 while you wait:
 
-        * 📊 Click [here](https://huggingface.co/spaces/
+        * 📊 Click [here](https://huggingface.co/spaces/autoevaluate/leaderboards) to view the results from your submission
         """
     )
     else: