Heatmap: add category abbreviations, new functions _prepare_tasks_categories_abbreviations() and get_leaderboard_heatmap(), and an updated create_heatmap()
- analyze_winscore.py +14 -23
- server.py +111 -0
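At a high level, the commit lets the server turn a leaderboard CSV into a Bokeh heatmap: get_leaderboard_heatmap() aggregates per-category or per-task scores into a models-by-tasks DataFrame and delegates plotting to analyze_winscore.create_heatmap(). A minimal sketch of the expected call path, assuming an already-constructed LeaderboardServer instance (its constructor is outside this diff) and local Bokeh rendering:

from bokeh.io import show

# `server` is an already-initialized LeaderboardServer (construction not shown in this commit)
fig = server.get_leaderboard_heatmap()                    # defaults to the "Overall" view: categories vs. models
show(fig)                                                 # render the returned Bokeh figure

# a single category lists its individual tasks instead; "NLI" is assumed to be one of the configured categories
fig_nli = server.get_leaderboard_heatmap(category="NLI")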
analyze_winscore.py
CHANGED
@@ -8,6 +8,7 @@ import random
 import numpy as np
 from bokeh.plotting import figure
 from bokeh.models import LabelSet, LogScale, ColumnDataSource, tickers
+from bokeh.models import LinearColorMapper, HoverTool
 from bokeh.palettes import Turbo256  # A color palette with enough colors
 
 # Function to fit a polynomial curve and return the x and y values of the fitted curve
@@ -207,15 +208,14 @@ def create_scatter_plot_with_curve_with_variances_named(category, variance_acros
 
     return p
 
-def create_heatmap(data_matrix, original_scores,
-
-
-
-
-
-
-
-
+def create_heatmap(data_matrix, original_scores,
+                   selected_rows=None,
+                   hide_scores_tasks=[],
+                   plot_width=None,
+                   plot_height=None,
+                   x_axis_label="Model",
+                   y_axis_label="Task",
+                   ):
     if selected_rows is not None:
         # Select only the specified rows (models)
         data_matrix = data_matrix[selected_rows]
@@ -223,13 +223,14 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
 
     # Set up the figure with tasks as x-axis and models as y-axis
     p = figure(
-
+        width=plot_width,
         height=plot_height,
         x_range=list(data_matrix.index),
         y_range=list(data_matrix.columns),
         toolbar_location="below",
         tools="pan,wheel_zoom,box_zoom,reset,save",
-
+        x_axis_label=x_axis_label,
+        y_axis_label=y_axis_label,
     )
 
     # Create the color mapper for the heatmap
@@ -282,16 +283,6 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
     p.rect(x='x', y='y', width=1, height=1, source=heatmap_source,
            line_color=None, fill_color={'field': 'colors', 'transform': color_mapper})
 
-    # Add color bar
-    # Add color bar with custom ticks
-    color_bar = ColorBar(
-        color_mapper=color_mapper,
-        width=8, location=(0, 0),
-        ticker=FixedTicker(ticks=[0, 0.2, 0.4, 0.6, 0.8, 1]),  # Fixed ticks at 0, 20, 40, 60, 80, 100
-        major_label_overrides={0: '0', 0.2: '20', 0.4: '40', 0.6: '60', 0.8: '80', 1: '100'}  # Custom labels for ticks
-    )
-    #p.add_layout(color_bar, 'right')
-
     # Add HoverTool for interactivity
     hover = HoverTool()
     hover.tooltips = [("Model", "@x"), ("Task", "@y"), ("DS", "@scores")]  # Updated tooltip
@@ -309,13 +300,13 @@ def create_heatmap(data_matrix, original_scores, selected_rows=None, hide_scores
     p.yaxis.major_label_text_font_size = "13pt"
     p.xaxis.major_label_text_font_size = "13pt"
 
-
+    # Set the axis label font size
     p.xaxis.axis_label_text_font_size = "18pt"  # Set font size for x-axis label
     p.yaxis.axis_label_text_font_size = "18pt"  # Set font size for y-axis label
     p.xaxis.axis_label_text_font_style = "normal"  # Set x-axis label to normal
     p.yaxis.axis_label_text_font_style = "normal"  # Set y-axis label to normal
 
-
+    p.yaxis.visible = False  # Hide the y-axis labels
 
     return p
 
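For reference, the new create_heatmap() signature can be exercised on its own roughly as below. This is a sketch, not code from the commit: the toy DataFrame, model and task names, and plot sizes are invented, and it assumes the unchanged middle of the function (color mapping and heatmap_source construction) accepts any numeric DataFrame with models on the index and tasks in the columns.

import pandas as pd
from analyze_winscore import create_heatmap

# Toy scores in [0, 1]: rows are models (x-axis), columns are tasks (y-axis); names are made up
scores = pd.DataFrame(
    {"NLI": [0.81, 0.64], "RC": [0.73, 0.58]},
    index=["model-a", "model-b"],
)

p = create_heatmap(
    scores,              # data_matrix: drives the cell colors and the axis ranges
    scores * 100,        # original_scores: surfaced by the hover tooltip on a 0-100 scale
    plot_width=600,
    plot_height=400,
    x_axis_label="Model",
    y_axis_label="Task",
)

Note that the color-bar construction (whose add_layout call was already commented out) was removed outright, and the y-axis is now hidden (p.yaxis.visible = False), so task names on the y-axis are no longer rendered and remain visible only through the hover tooltip unless a caller re-enables the axis.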
server.py
CHANGED
@@ -272,6 +272,7 @@ class LeaderboardServer:
         self.REPO_TYPE = "dataset"
         self.TASKS_METADATA = json.load(open(TASKS_METADATA_PATH))
         self.TASKS_CATEGORIES = {self.TASKS_METADATA[task]["category"] for task in self.TASKS_METADATA}
+        self.TASKS_CATEGORIES_ABBREVIATIONS = self._prepare_tasks_categories_abbreviations()
         self.TASKS_CATEGORY_OVERALL = "Overall"
         self.TASKS_CATEGORY_OVERALL_DETAILS = "Overall with details"
         self.CATEGORY_TO_TASK_ABBREVIATION_TO_DETAILS = self._prepare_category_to_task_abbr_to_details()
@@ -381,6 +382,28 @@ class LeaderboardServer:
             results = json.load(ranks_file)
         return results
 
+    def _prepare_tasks_categories_abbreviations(self):
+        name2abbreviation = {
+            'Czech Language Understanding': 'CLU',
+            'Czech Math Reasoning': 'CMR',
+            'Factual Knowledge': 'FK',
+            'Language Modeling': 'LM',
+            'NER': 'NER',
+            'NLI': 'NLI',
+            'Reading Comprehension': 'RC',
+            'Sentiment': 'S'
+        }
+        for category in self.TASKS_CATEGORIES:
+            if category not in name2abbreviation:
+                name2abbreviation[category] = category
+
+        assert all(category in name2abbreviation for category in self.TASKS_CATEGORIES)
+
+        abbreviation2name = {abbr: name for name, abbr in name2abbreviation.items()}
+        assert len(abbreviation2name) == len(name2abbreviation)
+
+        return abbreviation2name
+
     def _prepare_category_to_task_abbr_to_details(self):
         tasks_per_category = {}
         for task in self.TASKS_METADATA:
@@ -739,6 +762,94 @@ class LeaderboardServer:
 
         return fig
 
+    def get_leaderboard_heatmap(self, pre_submit=None, category=None, kind_of_p_value=None):
+        from analyze_winscore import get_ldb_records, create_heatmap
+
+        kind_of_p_value = kind_of_p_value if kind_of_p_value else self.DEFAULT_KIND_OF_P_VALUE
+
+        #tournament = self.tournament_results
+        name_map = self.submission_id_to_model_title
+
+        category = category if category else self.TASKS_CATEGORY_OVERALL
+
+        csv_file_path = self.leaderboard_dataframes_csv[kind_of_p_value][category]
+        ldb_records = get_ldb_records(name_map, csv_file_path)
+        model_names = list(ldb_records.keys())
+
+        task2model2score = {}
+        if category == self.TASKS_CATEGORY_OVERALL:
+            fig_y_axis_label = 'Category'
+            abbreviation2name = self.TASKS_CATEGORIES_ABBREVIATIONS
+
+            for abbr, name in abbreviation2name.items():
+                for model in model_names:
+                    score = float(ldb_records[model][name])
+                    task2model2score.setdefault(abbr, dict())[model] = score
+        else:
+            fig_y_axis_label = "Task"
+            abbreviation2name = self.CATEGORY_TO_TASK_ABBREVIATION_TO_DETAILS[category]
+
+            for abbr, name, url in abbreviation2name.values():
+                for model in model_names:
+                    score = float(ldb_records[model][abbr])
+                    task2model2score.setdefault(abbr, dict())[model] = score
+
+        # Convert proportions dictionary to DataFrame
+        # Transpose to have models as rows and tasks as columns
+        proportions_df = pd.DataFrame(task2model2score)
+        proportions_df /= 100
+
+        # Rename index and columns to reflect models and tasks
+        # Index is now tasks and columns are models
+        proportions_df.index.name = 'Model'
+        proportions_df.columns.name = fig_y_axis_label
+
+        # Calculate row averages
+        row_averages = proportions_df.mean(axis=1)
+
+        # Sort DataFrame by row averages
+        sorted_df = proportions_df.loc[row_averages.sort_values(ascending=False).index]
+
+        # Create task_to_category
+        task_to_category = {}
+        for task, details in self.TASKS_METADATA.items():
+            task_to_category[details['abbreviation']] = details['category']
+
+        if category != self.TASKS_CATEGORY_OVERALL:
+            # Create a Series from task_to_category with tasks as index
+            categories = pd.Series(task_to_category)
+
+            # Sort tasks by their categories
+            sorted_tasks = categories.sort_values().index
+
+            # Reorder the DataFrame columns based on sorted tasks
+            sorted_df = sorted_df[sorted_tasks]
+
+        # Remove team_name from model_title
+        sorted_df.index = sorted_df.index.str.replace(r'^[^/]+/', '', regex=True)
+
+        original_scores = sorted_df
+        size_dict = {v['Model'][v['Model'].index("/")+1:]: float(v['# θ (B)']) for v in ldb_records.values()}
+        sizes_series = pd.Series([size_dict[m] for m in original_scores.index], index=original_scores.index)
+
+        # Sort columns by category
+        sorted_columns = sorted(original_scores.columns, key=lambda x: task_to_category.get(x, ''))
+        original_scores = original_scores[sorted_columns]  # Ensure original scores match the sorted columns
+
+        # Sort rows by model size
+        sorted_indices = sizes_series.sort_values(ascending=False).index
+        original_scores = original_scores.loc[sorted_indices]  # Sort rows by model size
+
+        fig = create_heatmap(
+            original_scores,
+            original_scores*100,
+            plot_width=1200,
+            plot_height=1200,
+            y_axis_label=fig_y_axis_label,
+        )
+
+        return fig
+
     def get_leaderboard_csv(self, pre_submit=None, category=None, kind_of_p_value=None):
         if pre_submit == None:
             category = category if category else self.TASKS_CATEGORY_OVERALL
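As a quick illustration of what TASKS_CATEGORIES_ABBREVIATIONS ends up holding, the sketch below replays the logic of _prepare_tasks_categories_abbreviations() outside the class. The 'Some New Category' entry is hypothetical and only exercises the fallback branch in which an unknown category maps to itself.

# Known categories get a short form; anything else maps to itself; the result is inverted (abbreviation -> full name)
name2abbreviation = {
    'Czech Language Understanding': 'CLU',
    'Czech Math Reasoning': 'CMR',
    'Factual Knowledge': 'FK',
    'Language Modeling': 'LM',
    'NER': 'NER',
    'NLI': 'NLI',
    'Reading Comprehension': 'RC',
    'Sentiment': 'S',
}
categories = {'NLI', 'Sentiment', 'Some New Category'}  # hypothetical stand-in for self.TASKS_CATEGORIES

for category in categories:
    if category not in name2abbreviation:
        name2abbreviation[category] = category          # unknown categories keep their full name

abbreviation2name = {abbr: name for name, abbr in name2abbreviation.items()}
assert len(abbreviation2name) == len(name2abbreviation)  # abbreviations must not collide

print(abbreviation2name['CLU'])                # -> 'Czech Language Understanding'
print(abbreviation2name['Some New Category'])  # -> 'Some New Category'

In get_leaderboard_heatmap() this inverted mapping is what lets the Overall view look up scores in ldb_records by the full category name while keying the plotted matrix by the short abbreviation.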