heatmap + rank ordering
app.py
CHANGED
@@ -11,6 +11,8 @@ import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import VerificationMode, load_dataset, Dataset
 from huggingface_hub import HfApi, snapshot_download
+from collections import defaultdict
+import seaborn as sns
 
 from content import (
     CITATION_BUTTON_LABEL,
@@ -193,15 +195,13 @@ class LeaderboardData:
         df = pd.DataFrame(local_df)
         avail_columns = list(df.columns)
         missing_columns = list(set(filtered_columns) - set(avail_columns))
-        df[missing_columns] = "
+        df[missing_columns] = ""
 
         df = df[filtered_columns]
         # Unit conversion
         for col in df.columns:
             if "mae" in col.lower():
-                df[col] =
-            elif pd.api.types.is_numeric_dtype(df[col]):
-                df[col] = df[col].round(4)
+                df[col] = df[col] * 1000
         df = df.sort_values(by=[f"{subsplit}_energy_mae"], ascending=True)
         df[f"{subsplit}_energy_mae"] = df[f"{subsplit}_energy_mae"]
         df[f"{subsplit}_forces_mae"] = df[f"{subsplit}_forces_mae"]
@@ -231,16 +231,15 @@ class LeaderboardData:
         df = pd.DataFrame(local_df)
         avail_columns = list(df.columns)
         missing_columns = list(set(filtered_columns) - set(avail_columns))
-        df[missing_columns] = "
+        df[missing_columns] = ""
 
         df = df[filtered_columns]
         # Unit conversion
         for col in df.columns:
             if "mae" in col.lower():
-                df[col] =
-            elif pd.api.types.is_numeric_dtype(df[col]):
-                df[col] = df[col].round(4)
+                df[col] = df[col] * 1000
         df = df.sort_values(by=[eval_columns[0]], ascending=True)
+
         df = df.rename(columns=COLUMN_MAPPING)
         return df
 
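A note on the unit-conversion change that appears in both hunks above: the `round(4)` / `is_numeric_dtype` branch is dropped, and every column whose name contains "mae" is scaled by 1000, presumably converting eV and eV/Å values to meV and meV/Å for display. A minimal sketch of the resulting behaviour on a toy frame (column names and values are illustrative, not from the leaderboard):

import pandas as pd

# Hypothetical frame following the *_energy_mae / *_forces_mae naming pattern,
# with values assumed to be in eV and eV/A.
df = pd.DataFrame({"val_energy_mae": [0.0123], "val_forces_mae": [0.0456], "Model": ["demo"]})

for col in df.columns:
    if "mae" in col.lower():
        df[col] = df[col] * 1000  # eV -> meV (assumed), matching the diff above

print(df)  # val_energy_mae = 12.3, val_forces_mae = 45.6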
@@ -427,6 +426,8 @@ def create_dataframe_tab(
     # Model | Organization |Energy Conserving | Training Set | Metrics | date
     widths = ["10%", "5%", "5%", "5%"] + ["5%"] * (num_cols - fixed_cols) + ["10%"]
 
+    cm = sns.color_palette("viridis_r", as_cmap=True)
+    df = df.style.format(precision=2).background_gradient(cmap=cm)
     with gr.Tab(tab_name) as tab:
         gr.Dataframe(
             value=df,
@@ -434,6 +435,7 @@ def create_dataframe_tab(
             interactive=False,
             show_search="filter",
             column_widths=widths,
+            show_copy_button=True,
         )
     return tab
 
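These two hunks are the heatmap half of the commit: the frame is wrapped in a pandas Styler with a reversed-viridis background gradient and two-decimal formatting before being passed to `gr.Dataframe`, and a copy button is enabled on the component. A standalone sketch of the same idea, assuming a recent Gradio version that accepts a Styler as the `value` (the toy data is illustrative):

import gradio as gr
import pandas as pd
import seaborn as sns

df = pd.DataFrame({"Model": ["a", "b"], "Energy MAE": [12.3, 45.6]})  # toy data

# Reversed viridis so lower (better) MAE values get the brighter colors;
# recent pandas applies the gradient to numeric columns only by default.
cm = sns.color_palette("viridis_r", as_cmap=True)
styled = df.style.format(precision=2).background_gradient(cmap=cm)

with gr.Blocks() as demo:
    gr.Dataframe(value=styled, interactive=False, show_copy_button=True)

# demo.launch()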
@@ -463,7 +465,7 @@ def create_evaluation_tabs(results_dfs: Dict[str, pd.DataFrame]) -> None:
     overview_df = create_overview_dataframe(results_dfs)
     n_overview_columns = len(overview_df.columns)
     create_dataframe_tab(
-        "Overview", overview_df, widths=["
+        "Overview", overview_df, widths=["15%"] + ["10%"] * (n_overview_columns - 1)
     )
 
     # Create individual evaluation tabs
@@ -476,14 +478,8 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
     """
    Create an overview dataframe combining all models with only the first metric from each eval type.
     """
-    # Initialize overview data with model info
-    overview_data = {}
 
-    # Get all unique model-dataset combinations across all dataframes
-    all_model_entries = set()
     model_info = {}
-
-    # Collect all models and their info from all evaluation types
     for eval_type, df in results_dfs.items():
         if eval_type.startswith("Validation_") or eval_type.startswith("Test_"):
             continue
@@ -491,10 +487,7 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         for _, row in df.iterrows():
             model_name = row["Model"]
             dataset = row["Training Set"]
-            # Create unique identifier combining model name and training set
             model_entry = (model_name, dataset)
-            all_model_entries.add(model_entry)
-            # Store model metadata for this specific entry
             model_info[model_entry] = {
                 "Model": model_name,
                 "Organization": row.get("Organization", ""),
@@ -502,7 +495,6 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
                 "Training Set": dataset,
             }
 
-    # Initialize overview data structure
     overview_data = {
         "Model": [],
         "Organization": [],
@@ -510,25 +502,18 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         "Training Set": [],
     }
 
-    # Add columns for the primary metric from each evaluation type
     metric_columns = {}
-
-    # Add primary metric from each OTHER evaluation type (skip S2EF)
     for eval_type in OTHER_EVAL_TYPES:
         if eval_type in results_dfs and eval_type in LEADERBOARD_COLUMNS:
-
-            # Map to display name using COLUMN_MAPPING
-            metric_display_name = COLUMN_MAPPING.get(primary_metric, primary_metric)
-            # Include task name to avoid conflicts when multiple tasks have same metric
+            metric_display_name = COLUMN_MAPPING[LEADERBOARD_COLUMNS[eval_type][0]]
             task_display_name = "IE/EA" if eval_type == "IE_EA" else eval_type
             full_display_name = f"{task_display_name}\n{metric_display_name}"
             overview_data[full_display_name] = []
             metric_columns[full_display_name] = (eval_type, metric_display_name)
 
-
-
-
-            ): # Sort by model name, then dataset
+    all_model_entries = model_info.keys()
+    model_rankings = defaultdict(list)
+    for model_entry in sorted(all_model_entries, key=lambda x: (x[0], x[1])):
         model_name, dataset = model_entry
         entry_info = model_info[model_entry]
 
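Two bookkeeping structures are introduced in the hunk above: `all_model_entries` now simply reuses `model_info.keys()` (each key is a `(model_name, training_set)` tuple), and `model_rankings` is a `defaultdict(list)` that the next hunk fills with one integer rank per evaluation type. A tiny, self-contained illustration of that pattern, with made-up entries:

from collections import defaultdict

model_rankings = defaultdict(list)  # (model, training set) -> list of per-task ranks
model_rankings[("model-a", "dataset-x")].append(0)  # row index in one sorted eval table
model_rankings[("model-a", "dataset-x")].append(2)  # row index in another eval table
print(model_rankings[("model-a", "dataset-x")])     # [0, 2]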
@@ -540,35 +525,32 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         # Fill in metrics for each column
         for display_col, (eval_type, source_col) in metric_columns.items():
             if eval_type in results_dfs:
-                df = results_dfs[eval_type]
+                df = results_dfs[eval_type].reset_index(drop=True)
                 # Match both model name and training set
                 model_row = df[
                     (df["Model"] == model_name) & (df["Training Set"] == dataset)
                 ]
                 if not model_row.empty and source_col in model_row.columns:
                     value = model_row.iloc[0][source_col]
+                    rank = model_row.index[0]
                 else:
-                    value = "
-
-            value
-
+                    value = ""
+                    rank = df.shape[0]
+                overview_data[display_col].append(value)
+                model_rankings[model_entry].append(rank)
 
     overview_df = pd.DataFrame(overview_data)
 
-
-
-
-
-
-
-
-
-
-
-        # Calculate mean across columns, ignoring NaN values
-        avg_scores = numeric_metrics.mean(axis=1)
-        # Sort by average score (ascending for MAE metrics)
-        overview_df = overview_df.loc[avg_scores.sort_values().index]
+    def get_rank(row):
+        model_name = row["Model"]
+        dataset = row["Training Set"]
+        rank = np.mean(model_rankings[(model_name, dataset)])
+        return rank
+
+    overview_df["overall_rank"] = overview_df.apply(get_rank, axis=1)
+    overview_df = overview_df.sort_values(by="overall_rank").drop(
+        columns=["overall_rank"]
+    )
 
     return overview_df
 
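This last hunk is the rank-ordering half of the commit: it replaces the old mean-of-metrics sort (`avg_scores = numeric_metrics.mean(axis=1)`). Each `(model, training set)` entry is ranked by its row position in every per-task table (tables are already sorted best-first), missing tasks count as last place via `df.shape[0]`, the overview is sorted by the mean of those ranks, and the helper `overall_rank` column is dropped before display. A minimal sketch of the aggregation with toy data (note that `np.mean` assumes numpy is already imported as `np` elsewhere in app.py, since this diff does not add that import):

import numpy as np
import pandas as pd

overview_df = pd.DataFrame({"Model": ["model-a", "model-b"], "Training Set": ["data-x", "data-x"]})
model_rankings = {("model-a", "data-x"): [0, 3], ("model-b", "data-x"): [1, 1]}  # toy per-task ranks

overview_df["overall_rank"] = overview_df.apply(
    lambda row: np.mean(model_rankings[(row["Model"], row["Training Set"])]), axis=1
)
# model-b (mean rank 1.0) sorts above model-a (mean rank 1.5); the helper column is dropped.
overview_df = overview_df.sort_values(by="overall_rank").drop(columns=["overall_rank"])
print(overview_df)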