mshuaibi committed
Commit 7291625 · 1 Parent(s): 86644d0

heatmap + rank ordering

Files changed (1):
  1. app.py +31 -49

app.py CHANGED
@@ -11,6 +11,8 @@ import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import VerificationMode, load_dataset, Dataset
 from huggingface_hub import HfApi, snapshot_download
+from collections import defaultdict
+import seaborn as sns
 
 from content import (
     CITATION_BUTTON_LABEL,
@@ -193,15 +195,13 @@ class LeaderboardData:
         df = pd.DataFrame(local_df)
         avail_columns = list(df.columns)
         missing_columns = list(set(filtered_columns) - set(avail_columns))
-        df[missing_columns] = "-"
+        df[missing_columns] = ""
 
         df = df[filtered_columns]
         # Unit conversion
         for col in df.columns:
             if "mae" in col.lower():
-                df[col] = (df[col] * 1000).round(2)
-            elif pd.api.types.is_numeric_dtype(df[col]):
-                df[col] = df[col].round(4)
+                df[col] = df[col] * 1000
         df = df.sort_values(by=[f"{subsplit}_energy_mae"], ascending=True)
         df[f"{subsplit}_energy_mae"] = df[f"{subsplit}_energy_mae"]
         df[f"{subsplit}_forces_mae"] = df[f"{subsplit}_forces_mae"]
@@ -231,16 +231,15 @@ class LeaderboardData:
         df = pd.DataFrame(local_df)
         avail_columns = list(df.columns)
         missing_columns = list(set(filtered_columns) - set(avail_columns))
-        df[missing_columns] = "-"
+        df[missing_columns] = ""
 
         df = df[filtered_columns]
         # Unit conversion
         for col in df.columns:
             if "mae" in col.lower():
-                df[col] = (df[col] * 1000).round(2)
-            elif pd.api.types.is_numeric_dtype(df[col]):
-                df[col] = df[col].round(4)
+                df[col] = df[col] * 1000
         df = df.sort_values(by=[eval_columns[0]], ascending=True)
+
         df = df.rename(columns=COLUMN_MAPPING)
         return df
 
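Both unit-conversion hunks above make the same two changes: missing columns are now filled with an empty string rather than "-", and MAE columns are still converted from eV to meV but no longer rounded in place, since rounding is handled at display time by the Styler added further down. A minimal sketch of the new behaviour, using a made-up column name and values:

import pandas as pd

# Toy MAE column in eV (name and values are illustrative).
df = pd.DataFrame({"val_energy_mae": [0.0123, 0.00987]})

# Convert eV -> meV but keep full float precision in the data itself.
for col in df.columns:
    if "mae" in col.lower():
        df[col] = df[col] * 1000

# Rounding is now purely a presentation concern, applied when the table is styled.
styled = df.style.format(precision=2)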
@@ -427,6 +426,8 @@ def create_dataframe_tab(
     # Model | Organization |Energy Conserving | Training Set | Metrics | date
     widths = ["10%", "5%", "5%", "5%"] + ["5%"] * (num_cols - fixed_cols) + ["10%"]
 
+    cm = sns.color_palette("viridis_r", as_cmap=True)
+    df = df.style.format(precision=2).background_gradient(cmap=cm)
     with gr.Tab(tab_name) as tab:
         gr.Dataframe(
             value=df,
@@ -434,6 +435,7 @@
             interactive=False,
             show_search="filter",
             column_widths=widths,
+            show_copy_button=True,
         )
     return tab
 
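The heatmap half of the commit is the Styler created just before gr.Tab above: the DataFrame is wrapped in df.style with a reversed-viridis colormap so each metric cell is shaded by its value, and display precision is fixed at two decimals. A self-contained sketch of the same idea with toy data (the column names below are illustrative, not the leaderboard's real schema):

import pandas as pd
import seaborn as sns

# Toy metric table; model names live in the index so only numeric cells get shaded.
df = pd.DataFrame(
    {"Energy MAE": [12.3, 9.8], "Forces MAE": [25.1, 18.4]},
    index=["Model A", "Model B"],
)

# Reversed viridis as a matplotlib colormap object.
cm = sns.color_palette("viridis_r", as_cmap=True)

# Round for display and colour each cell column-wise by its value.
styled = df.style.format(precision=2).background_gradient(cmap=cm)
html = styled.to_html()  # or pass the Styler straight to gr.Dataframe(value=styled)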
@@ -463,7 +465,7 @@ def create_evaluation_tabs(results_dfs: Dict[str, pd.DataFrame]) -> None:
     overview_df = create_overview_dataframe(results_dfs)
     n_overview_columns = len(overview_df.columns)
     create_dataframe_tab(
-        "Overview", overview_df, widths=["20%"] + ["10%"] * (n_overview_columns - 1)
+        "Overview", overview_df, widths=["15%"] + ["10%"] * (n_overview_columns - 1)
     )
 
     # Create individual evaluation tabs
@@ -476,14 +478,8 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
     """
     Create an overview dataframe combining all models with only the first metric from each eval type.
     """
-    # Initialize overview data with model info
-    overview_data = {}
 
-    # Get all unique model-dataset combinations across all dataframes
-    all_model_entries = set()
     model_info = {}
-
-    # Collect all models and their info from all evaluation types
     for eval_type, df in results_dfs.items():
         if eval_type.startswith("Validation_") or eval_type.startswith("Test_"):
             continue
@@ -491,10 +487,7 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         for _, row in df.iterrows():
             model_name = row["Model"]
             dataset = row["Training Set"]
-            # Create unique identifier combining model name and training set
             model_entry = (model_name, dataset)
-            all_model_entries.add(model_entry)
-            # Store model metadata for this specific entry
             model_info[model_entry] = {
                 "Model": model_name,
                 "Organization": row.get("Organization", ""),
@@ -502,7 +495,6 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
                 "Training Set": dataset,
             }
 
-    # Initialize overview data structure
     overview_data = {
         "Model": [],
         "Organization": [],
@@ -510,25 +502,18 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         "Training Set": [],
     }
 
-    # Add columns for the primary metric from each evaluation type
     metric_columns = {}
-
-    # Add primary metric from each OTHER evaluation type (skip S2EF)
     for eval_type in OTHER_EVAL_TYPES:
         if eval_type in results_dfs and eval_type in LEADERBOARD_COLUMNS:
-            primary_metric = LEADERBOARD_COLUMNS[eval_type][0]  # First metric
-            # Map to display name using COLUMN_MAPPING
-            metric_display_name = COLUMN_MAPPING.get(primary_metric, primary_metric)
-            # Include task name to avoid conflicts when multiple tasks have same metric
+            metric_display_name = COLUMN_MAPPING[LEADERBOARD_COLUMNS[eval_type][0]]
             task_display_name = "IE/EA" if eval_type == "IE_EA" else eval_type
             full_display_name = f"{task_display_name}\n{metric_display_name}"
             overview_data[full_display_name] = []
             metric_columns[full_display_name] = (eval_type, metric_display_name)
 
-    # Populate data for each model entry
-    for model_entry in sorted(
-        all_model_entries, key=lambda x: (x[0], x[1])
-    ):  # Sort by model name, then dataset
+    all_model_entries = model_info.keys()
+    model_rankings = defaultdict(list)
+    for model_entry in sorted(all_model_entries, key=lambda x: (x[0], x[1])):
         model_name, dataset = model_entry
         entry_info = model_info[model_entry]
 
@@ -540,35 +525,32 @@ def create_overview_dataframe(results_dfs: Dict[str, pd.DataFrame]) -> pd.DataFr
         # Fill in metrics for each column
         for display_col, (eval_type, source_col) in metric_columns.items():
             if eval_type in results_dfs:
-                df = results_dfs[eval_type]
+                df = results_dfs[eval_type].reset_index(drop=True)
                 # Match both model name and training set
                 model_row = df[
                     (df["Model"] == model_name) & (df["Training Set"] == dataset)
                 ]
                 if not model_row.empty and source_col in model_row.columns:
                     value = model_row.iloc[0][source_col]
+                    rank = model_row.index[0]
                 else:
-                    value = "-"
-            else:
-                value = "-"
-            overview_data[display_col].append(value)
+                    value = ""
+                    rank = df.shape[0]
+                overview_data[display_col].append(value)
+                model_rankings[model_entry].append(rank)
 
     overview_df = pd.DataFrame(overview_data)
 
-    # Sort by the average of all metric columns (ascending for MAE metrics)
-    metric_cols = [
-        col
-        for col in overview_df.columns
-        if col not in PRE_COLUMN_NAMES + POST_COLUMN_NAMES
-    ]
-    if metric_cols:
-        # Calculate average across all metric columns for each row
-        # Convert all metric columns to numeric, keeping "-" as NaN
-        numeric_metrics = overview_df[metric_cols].apply(pd.to_numeric, errors="coerce")
-        # Calculate mean across columns, ignoring NaN values
-        avg_scores = numeric_metrics.mean(axis=1)
-        # Sort by average score (ascending for MAE metrics)
-        overview_df = overview_df.loc[avg_scores.sort_values().index]
+    def get_rank(row):
+        model_name = row["Model"]
+        dataset = row["Training Set"]
+        rank = np.mean(model_rankings[(model_name, dataset)])
+        return rank
+
+    overview_df["overall_rank"] = overview_df.apply(get_rank, axis=1)
+    overview_df = overview_df.sort_values(by="overall_rank").drop(
+        columns=["overall_rank"]
+    )
 
     return overview_df
 
 
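The rank-ordering half replaces the old sort by mean MAE with a sort by mean rank: each per-task results table is already sorted by its primary metric, so a model's row index is its rank for that task; missing entries rank last, the ranks are collected in a defaultdict, and the overview is ordered by the average rank. A self-contained sketch of that logic with made-up task and model names:

import numpy as np
from collections import defaultdict

# Per-task leaderboards, each already ordered best-to-worst (toy data).
results = {
    "TaskA": ["model2", "model1", "model3"],
    "TaskB": ["model2", "model1"],  # model3 has no entry for this task
}
models = ["model1", "model2", "model3"]

model_rankings = defaultdict(list)
for task, ordered in results.items():
    for model in models:
        if model in ordered:
            rank = ordered.index(model)  # row position = per-task rank
        else:
            rank = len(ordered)  # a missing entry ranks last for that task
        model_rankings[model].append(rank)

# Order models by their mean rank across tasks (lower is better).
overall = sorted(models, key=lambda m: np.mean(model_rankings[m]))
print(overall)  # ['model2', 'model1', 'model3']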