Merge pull request #40 from huggingface/refactor-model-filter
Files changed:
- app.py (+106 -105)
- evaluation.py (+13 -14)
app.py
CHANGED

@@ -436,17 +436,6 @@ with st.form(key="form"):
     )
     print("INFO -- Selected models before filter:", selected_models)
 
-    if len(selected_models) > 0:
-        selected_models = filter_evaluated_models(
-            selected_models,
-            selected_task,
-            selected_dataset,
-            selected_config,
-            selected_split,
-            selected_metrics,
-        )
-        print("INFO -- Selected models after filter:", selected_models)
-
     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
 
     submit_button = st.form_submit_button("Evaluate models 🚀")

@@ -454,106 +443,118 @@ with st.form(key="form"):
     if submit_button:
         if len(hf_username) == 0:
             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
+        elif len(selected_models) == 0:
+            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
         elif len(selected_models) > 10:
[The removed side of this hunk (old lines 458-559) is truncated in the source view; it contained the previous version of the warning and submission logic that the added lines below replace.]
+            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
+        else:
+            # Filter out previously evaluated models
+            selected_models = filter_evaluated_models(
+                selected_models,
+                selected_task,
+                selected_dataset,
+                selected_config,
+                selected_split,
+                selected_metrics,
+            )
+            print("INFO -- Selected models after filter:", selected_models)
+            if len(selected_models) > 0:
+                project_id = str(uuid.uuid4())[:8]
+                project_payload = {
+                    "username": AUTOTRAIN_USERNAME,
+                    "proj_name": f"eval-project-{project_id}",
+                    "task": TASK_TO_ID[selected_task],
+                    "config": {
+                        "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
+                        if selected_task in AUTOTRAIN_TASK_TO_LANG
+                        else "en",
+                        "max_models": 5,
+                        "instance": {
+                            "provider": "aws",
+                            "instance_type": "ml.g4dn.4xlarge",
+                            "max_runtime_seconds": 172800,
+                            "num_instances": 1,
+                            "disk_size_gb": 150,
+                        },
+                        "evaluation": {
+                            "metrics": selected_metrics,
+                            "models": selected_models,
+                            "hf_username": hf_username,
+                        },
                     },
                 }
+                print(f"INFO -- Payload: {project_payload}")
+                project_json_resp = http_post(
+                    path="/projects/create",
+                    payload=project_payload,
                     token=HF_TOKEN,
                     domain=AUTOTRAIN_BACKEND_API,
                 ).json()
+                print(f"INFO -- Project creation response: {project_json_resp}")
+
+                if project_json_resp["created"]:
+                    data_payload = {
+                        "split": 4,  # use "auto" split choice in AutoTrain
+                        "col_mapping": col_mapping,
+                        "load_config": {"max_size_bytes": 0, "shuffle": False},
+                    }
+                    data_json_resp = http_post(
+                        path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
+                        payload=data_payload,
                         token=HF_TOKEN,
                         domain=AUTOTRAIN_BACKEND_API,
+                        params={
+                            "type": "dataset",
+                            "config_name": selected_config,
+                            "split_name": selected_split,
+                        },
                     ).json()
+                    print(f"INFO -- Dataset creation response: {data_json_resp}")
+                    if data_json_resp["download_status"] == 1:
+                        train_json_resp = http_get(
+                            path=f"/projects/{project_json_resp['id']}/data/start_process",
+                            token=HF_TOKEN,
+                            domain=AUTOTRAIN_BACKEND_API,
+                        ).json()
+                        print(f"INFO -- AutoTrain job response: {train_json_resp}")
+                        if train_json_resp["success"]:
+                            train_eval_index = {
+                                "train-eval-index": [
+                                    {
+                                        "config": selected_config,
+                                        "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+                                        "task_id": selected_task,
+                                        "splits": {"eval_split": selected_split},
+                                        "col_mapping": col_mapping,
+                                    }
+                                ]
+                            }
+                            selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+                            dataset_card_url = get_dataset_card_url(selected_dataset)
+                            st.success("✅ Successfully submitted evaluation job!")
+                            st.markdown(
+                                f"""
+                                Evaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:
+
+                                * A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+                                * Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+                                * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+                                """ # noqa
+                            )
+                            st.markdown(
+                                f"""
+                                ```yaml
+                                {selected_metadata}
+                                """
+                            )
+                            print("INFO -- Pushing evaluation job logs to the Hub")
+                            evaluation_log = {}
+                            evaluation_log["payload"] = project_payload
+                            evaluation_log["project_creation_response"] = project_json_resp
+                            evaluation_log["dataset_creation_response"] = data_json_resp
+                            evaluation_log["autotrain_job_response"] = train_json_resp
+                            commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+                        else:
+                            st.error("Oh no, there was an error submitting your evaluation job!")
+            else:
+                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")

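For context on the success message in the added block above: `train_eval_index` is serialised with `yaml.dump` and shown to the user as the metadata they can paste into their dataset card to enable 1-click evaluations. A minimal sketch of that serialisation follows; the config, task, and column-mapping values are purely hypothetical placeholders, not taken from the diff.

```python
# Sketch of the metadata block the app renders for 1-click evaluations.
# All concrete values below are hypothetical placeholders.
import yaml

train_eval_index = {
    "train-eval-index": [
        {
            "config": "default",                 # hypothetical dataset config name
            "task": "text-classification",       # hypothetical Hub task
            "task_id": "binary_classification",  # hypothetical AutoTrain task id
            "splits": {"eval_split": "test"},
            "col_mapping": {"text": "text", "label": "target"},  # hypothetical mapping
        }
    ]
}

# Prints a YAML block shaped like the one the app asks users to add to the dataset card.
print(yaml.dump(train_eval_index, sort_keys=False))
```
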
evaluation.py
CHANGED

[Removed lines shown only as "…" below are truncated in the source view.]

@@ -1,3 +1,4 @@
+import copy
 from dataclasses import dataclass
 
 import streamlit as st

@@ -15,30 +16,29 @@ class EvaluationInfo:
     metrics: set
 
 
-def …
+def create_evaluation_info(dataset_info: DatasetInfo) -> int:
     if dataset_info.cardData is not None:
         metadata = dataset_info.cardData["eval_info"]
         metadata.pop("col_mapping", None)
         # TODO(lewtun): populate dataset cards with metric info
         if "metrics" not in metadata:
             metadata["metrics"] = frozenset()
-        …
-        …
-        return …
-    else:
-        return None
+        else:
+            metadata["metrics"] = frozenset(metadata["metrics"])
+        return EvaluationInfo(**metadata)
 
 
-def …
+def get_evaluation_infos():
     filt = DatasetFilter(author="autoevaluate")
     evaluation_datasets = HfApi().list_datasets(filter=filt, full=True)
-    return [ …
+    return [create_evaluation_info(dset) for dset in evaluation_datasets]
 
 
 def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
-    …
+    evaluation_infos = get_evaluation_infos()
+    models_to_filter = copy.copy(models)
 
-    for …
+    for model in models_to_filter:
         evaluation_info = EvaluationInfo(
             task=task,
             model=model,

@@ -47,12 +47,11 @@ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
             dataset_split=dataset_split,
             metrics=frozenset(metrics),
         )
-        …
-        if candidate_id in evaluation_ids:
+        if evaluation_info in evaluation_infos:
             st.info(
-                f"Model `{model}` has already been evaluated on this configuration. \
+                f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
                 This model will be excluded from the evaluation job..."
             )
-            models. …
+            models.remove(model)
 
     return models
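
The net effect of the evaluation.py change: instead of comparing precomputed ids, `filter_evaluated_models` now builds an `EvaluationInfo` per candidate model and checks membership against the infos recovered from the existing `autoevaluate` result datasets, iterating over a copy of the model list so entries can be removed from the original safely. A reduced, self-contained sketch of that pattern; the class fields, task name, model names, and metric values here are illustrative, not the app's real ones.

```python
# Reduced sketch of the filtering pattern introduced in this PR.
# The real EvaluationInfo has more fields (dataset name, config, split, ...).
import copy
from dataclasses import dataclass


@dataclass(frozen=True)
class EvaluationInfo:
    task: str
    model: str
    metrics: frozenset  # frozenset keeps the field hashable and order-insensitive


# Pretend these were rebuilt from previously published evaluation datasets.
existing_infos = [
    EvaluationInfo(task="nli", model="bert-base", metrics=frozenset({"f1", "accuracy"}))
]

models = ["bert-base", "roberta-large"]
for model in copy.copy(models):  # iterate over a shallow copy ...
    candidate = EvaluationInfo(task="nli", model=model, metrics=frozenset({"accuracy", "f1"}))
    if candidate in existing_infos:  # membership check uses dataclass equality
        models.remove(model)  # ... so removing from the original list here is safe

print(models)  # ['roberta-large'] -- the already-evaluated model is dropped
```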