Heatmap: add category abbreviations, new functions _prepare_tasks_categories_abbreviations() and get_leaderboard_heatmap(), and an updated create_heatmap()
- analyze_winscore.py +14 -23
- server.py +111 -0
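At a high level, the commit lets the server turn a leaderboard CSV into a Bokeh heatmap: get_leaderboard_heatmap() aggregates per-category or per-task scores into a models-by-tasks DataFrame and delegates plotting to analyze_winscore.create_heatmap(). A minimal sketch of the expected call path, assuming an already-constructed LeaderboardServer instance (its constructor is outside this diff) and local Bokeh rendering:

from bokeh.io import show

# `server` is an already-initialized LeaderboardServer (construction not shown in this commit)
fig = server.get_leaderboard_heatmap()                    # defaults to the "Overall" view: categories vs. models
show(fig)                                                 # render the returned Bokeh figure

# a single category lists its individual tasks instead; "NLI" is assumed to be one of the configured categories
fig_nli = server.get_leaderboard_heatmap(category="NLI")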
analyze_winscore.py
CHANGED
@@ -8,6 +8,7 @@ import random
 import numpy as np
 from bokeh.plotting import figure
 from bokeh.models import LabelSet, LogScale, ColumnDataSource, tickers
+from bokeh.models import LinearColorMapper, HoverTool
 from bokeh.palettes import Turbo256  # A color palette with enough colors
 
 # Function to fit a polynomial curve and return the x and y values of the fitted curve
@@ -207,15 +208,14 @@ def create_scatter_plot_with_curve_with_variances_named(category, variance_acros
 
     return p
 
-def create_heatmap(data_matrix, original_scores,
-
-
-
-
-
-
-
-
+def create_heatmap(data_matrix, original_scores,
+                   selected_rows=None,
+                   hide_scores_tasks=[],
+                   plot_width=None,
+                   plot_height=None,
+                   x_axis_label="Model",
+                   y_axis_label="Task",
+                   ):
     if selected_rows is not None:
         # Select only the specified rows (models)
         data_matrix = data_matrix[selected_rows]
@@ -223,13 +223,14 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
 
     # Set up the figure with tasks as x-axis and models as y-axis
     p = figure(
-
+        width=plot_width,
         height=plot_height,
         x_range=list(data_matrix.index),
         y_range=list(data_matrix.columns),
         toolbar_location="below",
         tools="pan,wheel_zoom,box_zoom,reset,save",
-
+        x_axis_label=x_axis_label,
+        y_axis_label=y_axis_label,
     )
 
     # Create the color mapper for the heatmap
@@ -282,16 +283,6 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
     p.rect(x='x', y='y', width=1, height=1, source=heatmap_source,
            line_color=None, fill_color={'field': 'colors', 'transform': color_mapper})
 
-    # Add color bar
-    # Add color bar with custom ticks
-    color_bar = ColorBar(
-        color_mapper=color_mapper,
-        width=8, location=(0, 0),
-        ticker=FixedTicker(ticks=[0, 0.2, 0.4, 0.6, 0.8, 1]),  # Fixed ticks at 0, 20, 40, 60, 80, 100
-        major_label_overrides={0: '0', 0.2: '20', 0.4: '40', 0.6: '60', 0.8: '80', 1: '100'}  # Custom labels for ticks
-    )
-    #p.add_layout(color_bar, 'right')
-
     # Add HoverTool for interactivity
     hover = HoverTool()
     hover.tooltips = [("Model", "@x"), ("Task", "@y"), ("DS", "@scores")]  # Updated tooltip
@@ -309,13 +300,13 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
     p.yaxis.major_label_text_font_size = "13pt"
     p.xaxis.major_label_text_font_size = "13pt"
 
-
+    # Set the axis label font size
     p.xaxis.axis_label_text_font_size = "18pt"  # Set font size for x-axis label
     p.yaxis.axis_label_text_font_size = "18pt"  # Set font size for y-axis label
     p.xaxis.axis_label_text_font_style = "normal"  # Set x-axis label to normal
     p.yaxis.axis_label_text_font_style = "normal"  # Set y-axis label to normal
 
-
+    p.yaxis.visible = False  # Hide the y-axis labels
 
     return p
 
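For reference, the new create_heatmap() signature can be exercised on its own roughly as below. This is a sketch, not code from the commit: the toy DataFrame, model and task names, and plot sizes are invented, and it assumes the unchanged middle of the function (color mapping and heatmap_source construction) accepts any numeric DataFrame with models on the index and tasks in the columns.

import pandas as pd
from analyze_winscore import create_heatmap

# Toy scores in [0, 1]: rows are models (x-axis), columns are tasks (y-axis); names are made up
scores = pd.DataFrame(
    {"NLI": [0.81, 0.64], "RC": [0.73, 0.58]},
    index=["model-a", "model-b"],
)

p = create_heatmap(
    scores,              # data_matrix: drives the cell colors and the axis ranges
    scores * 100,        # original_scores: surfaced by the hover tooltip on a 0-100 scale
    plot_width=600,
    plot_height=400,
    x_axis_label="Model",
    y_axis_label="Task",
)

Note that the color-bar construction (whose add_layout call was already commented out) was removed outright, and the y-axis is now hidden (p.yaxis.visible = False), so task names on the y-axis are no longer rendered and remain visible only through the hover tooltip unless a caller re-enables the axis.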
server.py
CHANGED
@@ -272,6 +272,7 @@ class LeaderboardServer:
         self.REPO_TYPE = "dataset"
         self.TASKS_METADATA = json.load(open(TASKS_METADATA_PATH))
         self.TASKS_CATEGORIES = {self.TASKS_METADATA[task]["category"] for task in self.TASKS_METADATA}
+        self.TASKS_CATEGORIES_ABBREVIATIONS = self._prepare_tasks_categories_abbreviations()
         self.TASKS_CATEGORY_OVERALL = "Overall"
         self.TASKS_CATEGORY_OVERALL_DETAILS = "Overall with details"
         self.CATEGORY_TO_TASK_ABBREVIATION_TO_DETAILS = self._prepare_category_to_task_abbr_to_details()
@@ -381,6 +382,28 @@ class LeaderboardServer:
             results = json.load(ranks_file)
         return results
 
+    def _prepare_tasks_categories_abbreviations(self):
+        name2abbreviation = {
+            'Czech Language Understanding': 'CLU',
+            'Czech Math Reasoning': 'CMR',
+            'Factual Knowledge': 'FK',
+            'Language Modeling': 'LM',
+            'NER': 'NER',
+            'NLI': 'NLI',
+            'Reading Comprehension': 'RC',
+            'Sentiment': 'S'
+        }
+        for category in self.TASKS_CATEGORIES:
+            if category not in name2abbreviation:
+                name2abbreviation[category] = category
+
+        assert all(category in name2abbreviation for category in self.TASKS_CATEGORIES)
+
+        abbreviation2name = {abbr: name for name, abbr in name2abbreviation.items()}
+        assert len(abbreviation2name) == len(name2abbreviation)
+
+        return abbreviation2name
+
     def _prepare_category_to_task_abbr_to_details(self):
         tasks_per_category = {}
         for task in self.TASKS_METADATA:
@@ -739,6 +762,94 @@ class LeaderboardServer:
 
         return fig
 
+    def get_leaderboard_heatmap(self, pre_submit=None, category=None, kind_of_p_value=None):
+        from analyze_winscore import get_ldb_records, create_heatmap
+
+        kind_of_p_value = kind_of_p_value if kind_of_p_value else self.DEFAULT_KIND_OF_P_VALUE
+
+        #tournament = self.tournament_results
+        name_map = self.submission_id_to_model_title
+
+        category = category if category else self.TASKS_CATEGORY_OVERALL
+
+        csv_file_path = self.leaderboard_dataframes_csv[kind_of_p_value][category]
+        ldb_records = get_ldb_records(name_map, csv_file_path)
+        model_names = list(ldb_records.keys())
+
+        task2model2score = {}
+        if category == self.TASKS_CATEGORY_OVERALL:
+            fig_y_axis_label = 'Category'
+            abbreviation2name = self.TASKS_CATEGORIES_ABBREVIATIONS
+
+            for abbr, name in abbreviation2name.items():
+                for model in model_names:
+                    score = float(ldb_records[model][name])
+                    task2model2score.setdefault(abbr, dict())[model] = score
+        else:
+            fig_y_axis_label = "Task"
+            abbreviation2name = self.CATEGORY_TO_TASK_ABBREVIATION_TO_DETAILS[category]
+
+            for abbr, name, url in abbreviation2name.values():
+                for model in model_names:
+                    score = float(ldb_records[model][abbr])
+                    task2model2score.setdefault(abbr, dict())[model] = score
+
+        # Convert proportions dictionary to DataFrame
+        # Transpose to have models as rows and tasks as columns
+        proportions_df = pd.DataFrame(task2model2score)
+        proportions_df /= 100
+
+        # Rename index and columns to reflect models and tasks
+        # Index is now tasks and columns are models
+        proportions_df.index.name = 'Model'
+        proportions_df.columns.name = fig_y_axis_label
+
+        # Calculate row averages
+        row_averages = proportions_df.mean(axis=1)
+
+        # Sort DataFrame by row averages
+        sorted_df = proportions_df.loc[row_averages.sort_values(ascending=False).index]
+
+        # Create task_to_category
+        task_to_category = {}
+        for task, details in self.TASKS_METADATA.items():
+            task_to_category[details['abbreviation']] = details['category']
+
+        if category != self.TASKS_CATEGORY_OVERALL:
+            # Create a Series from task_to_category with tasks as index
+            categories = pd.Series(task_to_category)
+
+            # Sort tasks by their categories
+            sorted_tasks = categories.sort_values().index
+
+            # Reorder the DataFrame columns based on sorted tasks
+            sorted_df = sorted_df[sorted_tasks]
+
+        # Remove team_name from model_title
+        sorted_df.index = sorted_df.index.str.replace(r'^[^/]+/', '', regex=True)
+
+        original_scores = sorted_df
+        size_dict = {v['Model'][v['Model'].index("/")+1:]: float(v['# θ (B)']) for v in ldb_records.values()}
+        sizes_series = pd.Series([size_dict[m] for m in original_scores.index], index=original_scores.index)
+
+        # Sort columns by category
+        sorted_columns = sorted(original_scores.columns, key=lambda x: task_to_category.get(x, ''))
+        original_scores = original_scores[sorted_columns]  # Ensure original scores match the sorted columns
+
+        # Sort rows by model size
+        sorted_indices = sizes_series.sort_values(ascending=False).index
+        original_scores = original_scores.loc[sorted_indices]  # Sort rows by model size
+
+        fig = create_heatmap(
+            original_scores,
+            original_scores*100,
+            plot_width=1200,
+            plot_height=1200,
+            y_axis_label=fig_y_axis_label,
+        )
+
+        return fig
+
     def get_leaderboard_csv(self, pre_submit=None, category=None, kind_of_p_value=None):
         if pre_submit == None:
             category = category if category else self.TASKS_CATEGORY_OVERALL
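As a quick illustration of what TASKS_CATEGORIES_ABBREVIATIONS ends up holding, the sketch below replays the logic of _prepare_tasks_categories_abbreviations() outside the class. The 'Some New Category' entry is hypothetical and only exercises the fallback branch in which an unknown category maps to itself.

# Known categories get a short form; anything else maps to itself; the result is inverted (abbreviation -> full name)
name2abbreviation = {
    'Czech Language Understanding': 'CLU',
    'Czech Math Reasoning': 'CMR',
    'Factual Knowledge': 'FK',
    'Language Modeling': 'LM',
    'NER': 'NER',
    'NLI': 'NLI',
    'Reading Comprehension': 'RC',
    'Sentiment': 'S',
}
categories = {'NLI', 'Sentiment', 'Some New Category'}  # hypothetical stand-in for self.TASKS_CATEGORIES

for category in categories:
    if category not in name2abbreviation:
        name2abbreviation[category] = category          # unknown categories keep their full name

abbreviation2name = {abbr: name for name, abbr in name2abbreviation.items()}
assert len(abbreviation2name) == len(name2abbreviation)  # abbreviations must not collide

print(abbreviation2name['CLU'])                # -> 'Czech Language Understanding'
print(abbreviation2name['Some New Category'])  # -> 'Some New Category'

In get_leaderboard_heatmap() this inverted mapping is what lets the Overall view look up scores in ldb_records by the full category name while keying the plotted matrix by the short abbreviation.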