import copy

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.graph_objs import Figure

from src.display.utils import (
    human_baseline_row as HUMAN_BASELINE,
    AutoEvalColumn,
    Tasks,
    Task,
    BENCHMARK_COLS,
    external_eval_results,
    NUMERIC_INTERVALS,
)
from src.leaderboard.filter_models import FLAGGED_MODELS
from src.leaderboard.read_evals import EvalResult

def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Generates, for every metric, a DataFrame of the maximum scores reached up to each date.

    :param raw_data: A list of EvalResult entries containing metric scores and dates.
    :return: A dict mapping each metric column name to a DataFrame of its running maximum scores.
    """
    # Step 1: Build the DataFrame and sort it by date.
    # Create the DataFrame with the EvalResult dataclass columns, even if raw_data is empty
    raw_data = copy.deepcopy(raw_data)
    for external_row in external_eval_results:
        raw_data.append(EvalResult(**external_row))
    results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())

    # Convert 'date' to datetime, then to a simple date string such as 2025-04-26
    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
    results_df["date"] = results_df["date"].dt.strftime("%Y-%m-%d")
    results_df.sort_values(by="date", inplace=True)

    # Step 2: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
        current_max = 0
        last_date = ""
        column = task.col_name
        for _, row in results_df.iterrows():
            current_model = row["full_model"]
            # We ignore models that are flagged/no longer on the hub/not finished
            to_ignore = (
                not row["still_on_hub"]
                or row["flagged"]
                or current_model in FLAGGED_MODELS
                or row["status"] != "FINISHED"
            )
            if to_ignore:
                continue

            current_date = row["date"]
            if task.benchmark == "Average":
                current_score = np.mean(list(row["results"].values()))
            else:
                if task.benchmark not in row["results"]:
                    continue
                current_score = row["results"][task.benchmark]

            if current_score > current_max:
                if current_date == last_date and len(scores[column]) > 0:
                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
                else:
                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
                current_max = current_score
                last_date = current_date

    # Step 4: Return one DataFrame per metric
    return {k: pd.DataFrame(v, columns=["model", "date", "score"]) for k, v in scores.items()}
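
# Hedged usage sketch (the data values below are made up, and how raw_data is loaded is an
# assumption -- it comes from the leaderboard's own result reader, not shown in this file).
# The return value is one DataFrame per metric column, each row recording a new running-best
# score together with the model and date that reached it:
#
#     scores = create_scores_df(raw_data)
#     scores[AutoEvalColumn.average.name]
#     #           model        date  score
#     #  0  org/model-a  2024-01-10   61.2
#     #  1  org/model-b  2024-03-02   68.9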

def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Transforms the per-metric score DataFrames into a single long-format DataFrame suitable for plotting.

    :param scores_df: A dict mapping each metric column name to a DataFrame of scores and dates.
    :return: A new DataFrame reshaped for plotting purposes.
    """
    # Initialize the list to store DataFrames
    dfs = []

    # Iterate over the columns and create a new DataFrame for each one
    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
        d = scores_df[col].reset_index(drop=True)
        d["task"] = col
        dfs.append(d)

    # Concatenate all the created DataFrames
    concat_df = pd.concat(dfs, ignore_index=True)

    # Sort values by 'date'
    concat_df.sort_values(by="date", inplace=True)
    concat_df.reset_index(drop=True, inplace=True)
    return concat_df
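
# Hedged sketch of the two-step pipeline feeding the time-series plot: create_scores_df builds
# the per-metric running maxima and create_plot_df stacks them into one long frame with a
# "task" column that px.line can color by (raw_data is again assumed to come from the
# leaderboard's result reader):
#
#     plot_df = create_plot_df(create_scores_df(raw_data))
#     # plot_df columns: "model", "date", "score", "task"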

def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Create a Plotly figure object with lines representing different metrics
    and horizontal dotted lines representing human baselines.

    :param df: The DataFrame containing the metric values, names, and dates.
    :param metrics: A list of strings representing the names of the metrics
        to be included in the plot.
    :param title: A string representing the title of the plot.
    :return: A Plotly figure object with lines representing metrics and
        horizontal dotted lines representing human baselines.
    """
    # Filter the DataFrame based on the specified metrics
    df = df[df["task"].isin(metrics)]

    # Filter the human baselines based on the specified metrics
    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics and v is not None}

    # Create a line figure using plotly express with specified markers and custom data
    fig = px.line(
        df,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    # Update the hovertemplate for a better hover interaction experience
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model Name: %{customdata[2]}",
                "Metric Name: %{customdata[0]}",
                "Date: %{x}",
                "Metric Value: %{y}",
            ]
        )
    )

    # Update the range of the y-axis
    # fig.update_layout(yaxis_range=[0, 100])

    # Map each metric name to its line color in the figure
    metric_color_mapping = {}
    for trace in fig.data:
        metric_color_mapping[trace.name] = trace.line.color

    # Iterate over the filtered human baselines and add horizontal lines to the figure
    for metric, value in filtered_human_baselines.items():
        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
        # Add a horizontal line with matched color and positioned annotation
        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig
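
# Hedged usage sketch: the metrics passed in must match the "task" values produced by
# create_plot_df (entries of BENCHMARK_COLS or the average column). Rendering through
# gr.Plot assumes the surrounding app is a Gradio Blocks layout, which this file does not show:
#
#     fig = create_metric_plot_obj(plot_df, [AutoEvalColumn.average.name], title="Average score over time")
#     # gr.Plot(value=fig)   # e.g. inside a gr.Blocks() context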

def create_lat_score_mem_plot_obj(leaderboard_df):
    copy_df = leaderboard_df.copy()
    copy_df = copy_df[~(copy_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"]))]

    # Columns exposed as hover data in the scatter plot
    SCORE_MEMORY_LATENCY_DATA = [
        AutoEvalColumn.dummy.name,
        AutoEvalColumn.average.name,
        AutoEvalColumn.params.name,
        AutoEvalColumn.architecture.name,
        "Evaluation Time (min)",
    ]

    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60

    # Bucket the parameter count into a bounded marker size for the scatter plot
    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(lambda x: 0.5 if 0 <= x < 0.8 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 0.8 if 0.8 <= x < 2 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 1.5 if 2 <= x < 5 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 2.0 if 5 <= x < 10 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 3.0 if 10 <= x < 35 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 4.0 if 35 <= x < 60 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 6.0 if 60 <= x < 90 else x)
    copy_df["size"] = copy_df["size"].apply(lambda x: 8.0 if x >= 90 else x)

    fig = px.scatter(
        copy_df,
        x="Evaluation Time (min)",
        y="LLM Average Score",
        size="size",
        color=AutoEvalColumn.architecture.name,
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
        log_x=True,
    )
    fig.update_traces(
        hovertemplate="<br>".join(
            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
        )
    )
    fig.update_layout(
        title={
            "text": "Eval Time vs. Score vs. #Params",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Time To Evaluate (min)",
        yaxis_title="LLM Average Score",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )
    return fig
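
# Hedged usage sketch: the function only needs the leaderboard DataFrame to carry the
# AutoEvalColumn columns it references (dummy, average, params, architecture, eval_time);
# building that frame is the caller's job and is not shown here:
#
#     fig = create_lat_score_mem_plot_obj(leaderboard_df)
#     fig.write_html("eval_time_vs_score.html")  # or hand the figure to the UI layer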

def create_top_n_models_comparison_plot(leaderboard_df: pd.DataFrame, top_n: int = 5, size_filter: str = None) -> Figure:
    """
    Creates a grouped bar chart comparing the performance of the top N models across all metrics.

    :param leaderboard_df: DataFrame containing the leaderboard data.
    :param top_n: The number of top models to include in the comparison (default is 5).
    :param size_filter: If provided, only include models of this specific size category.
    :return: A Plotly figure object representing the comparison plot.
    """
    # BENCHMARK_COLS holds the metric column names shown on the leaderboard
    metric_cols = BENCHMARK_COLS

    # Filter out non-model rows (like baseline or human baseline)
    models_df = leaderboard_df[~leaderboard_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"])].copy()

    # Add size group information to the DataFrame
    models_df["size_group"] = models_df[AutoEvalColumn.params.name].apply(
        lambda x: next((k for k, v in NUMERIC_INTERVALS.items() if x in v), "?")
    )

    # Filter by size category if specified
    if size_filter and size_filter != "All Sizes":
        models_df = models_df[models_df["size_group"] == size_filter]
        if models_df.empty:
            # If no models match the size filter, return an empty figure with a message
            fig = px.bar(
                x=["No Data"],
                y=[0],
                title=f"No models found in the {size_filter} size category",
            )
            fig.update_layout(
                xaxis_title="",
                yaxis_title="",
                showlegend=False,
            )
            return fig

    # Sort models by average score and select the top N
    top_models_df = models_df.nlargest(top_n, AutoEvalColumn.average.name)

    # Select only the necessary columns: model name and metric scores
    plot_data = top_models_df[[AutoEvalColumn.dummy.name] + metric_cols]

    # Melt the DataFrame to long format suitable for plotting:
    # 'id_vars' specifies the column(s) to keep as identifiers,
    # 'value_vars' specifies the columns to unpivot,
    # 'var_name' names the new column containing the original column names (metrics),
    # 'value_name' names the new column containing the values (scores)
    melted_df = pd.melt(
        plot_data,
        id_vars=[AutoEvalColumn.dummy.name],
        value_vars=metric_cols,
        var_name="Metric",
        value_name="Score",
    )
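
    # Illustrative sketch of the reshape (model and metric names are made-up examples):
    #   wide:   Model | ARC | HellaSwag          long:   Model | Metric    | Score
    #           m1    |  60 |        80     ->           m1    | ARC       |    60
    #                                                    m1    | HellaSwag |    80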

    # Validate and cap scores to ensure they're within a reasonable range (0-100)
    melted_df["Score"] = melted_df["Score"].apply(lambda x: min(max(x, 0), 100))

    # Create the grouped bar chart
    fig = px.bar(
        melted_df,
        x="Metric",
        y="Score",
        color=AutoEvalColumn.dummy.name,  # Group bars by model name
        barmode="group",  # Display bars side-by-side for each metric
        title=f"Top {top_n} Models Comparison Across Metrics",
        labels={AutoEvalColumn.dummy.name: "Model"},  # Rename legend title
        custom_data=[AutoEvalColumn.dummy.name, "Metric", "Score"],  # Data for hover
        range_y=[0, 100],  # Force y-axis range to be 0-100 (refined dynamically below)
    )

    # Update the hovertemplate
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model: %{customdata[0]}",
                "Metric: %{customdata[1]}",
                "Score: %{customdata[2]:.2f}",  # Format score to 2 decimal places
                "<extra></extra>",  # Remove the default trace info
            ]
        )
    )

    # Extend the title with size filter information if applicable
    title_text = f"Top {top_n} Models Comparison Across Metrics"
    if size_filter and size_filter != "All Sizes":
        title_text += f" ({size_filter} Models)"

    # Calculate an appropriate y-axis range based on the data
    min_score = melted_df["Score"].min()
    max_score = melted_df["Score"].max()
    # Set y-axis minimum (start at 0 unless all scores are high)
    y_min = 40 if min_score > 50 else 0
    # Set y-axis maximum (ensure there's room for annotations)
    y_max = 100 if max_score < 95 else 105

    # Adjust layout for better readability (the yaxis range set here overrides range_y above)
    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Metric",
        yaxis_title="Score (%)",
        legend_title="Model",
        yaxis=dict(
            range=[y_min, y_max],  # Set the y-axis range dynamically
            constrain="domain",  # Constrain the axis to the domain
            constraintoward="top",  # Constrain toward the top
        ),
        width=1600,
        height=450,
    )

    # Define a marker shape icon for each model position
    shape_icons = {
        0: "triangle-up",  # First model
        1: "square",  # Second model
        2: "circle",  # Third model
        3: "diamond",  # Fourth model
        4: "star",  # Fifth model
        5: "pentagon",  # Sixth model
        6: "hexagon",  # Seventh model
        7: "cross",  # Eighth model
        8: "x",  # Ninth model
        9: "hourglass",  # Tenth model
    }

    # Get the average score for each model (currently kept for reference; not used below,
    # since the annotations display each bar's own score instead)
    model_averages = {}
    for model in top_models_df[AutoEvalColumn.dummy.name].unique():
        try:
            model_averages[model] = top_models_df.loc[
                top_models_df[AutoEvalColumn.dummy.name] == model, AutoEvalColumn.average.name
            ].values[0]
        except (IndexError, KeyError):
            # If the average score is not available, use None
            model_averages[model] = None

    # Add shape symbols to the legend and annotate each bar with its score and icon
    for i, bar in enumerate(fig.data):
        model_name = bar.name
        model_index = list(top_models_df[AutoEvalColumn.dummy.name].unique()).index(model_name) % len(shape_icons)
        icon_shape = shape_icons[model_index]

        # Update the name in the legend to include the shape symbol
        shape_symbol = get_symbol_for_shape(icon_shape)
        fig.data[i].name = f"{shape_symbol} {model_name}"

        # Annotate every bar in this trace
        for j, (x, y) in enumerate(zip(bar.x, bar.y)):
            # Use the actual bar score instead of the average
            score_text = f"<b>{y:.1f}</b>"

            # Calculate the exact position for the annotation:
            # Plotly's grouped bar charts place bars at specific offsets within each group,
            # and the xshift below needs to match those offsets.
            num_models = len(top_models_df[AutoEvalColumn.dummy.name].unique())
            # The total width allocated for all bars in a group
            total_group_width = 0.8
            # Width of each individual bar
            bar_width = total_group_width / num_models
            # Offset of this specific bar within its group:
            # i is the model's position in the group (0 is the first model, etc.),
            # and the center of the group sits at x, so we adjust from there
            offset = (i - (num_models - 1) / 2) * bar_width

            # Add the score text directly above its bar
            fig.add_annotation(
                x=x,
                y=y + 2,  # Position slightly above the bar
                text=score_text,  # Display the actual bar score
                showarrow=False,
                font=dict(
                    size=10,
                    color=bar.marker.color,  # Match the bar color
                ),
                opacity=0.9,
                xshift=offset * 130,  # Adjust the multiplier to better center the annotation
            )

            # Add the shape icon just below the top of the bar, under the score text
            fig.add_annotation(
                x=x,
                y=y - 3,  # Position slightly below the bar top
                text=get_symbol_for_shape(icon_shape),  # Convert the shape name to a symbol
                showarrow=False,
                font=dict(
                    size=14,
                    color="black",  # Draw the icon in black for contrast
                ),
                opacity=0.9,
                xshift=offset * 130,  # Adjust the multiplier to better center the annotation
            )

    return fig
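
# Hedged usage sketch: top_n and size_filter are caller choices. The valid size categories are
# whatever keys NUMERIC_INTERVALS defines (plus "All Sizes"/None for no filtering), so the
# category string below is only an illustration, not a real key:
#
#     fig = create_top_n_models_comparison_plot(leaderboard_df, top_n=5)
#     # fig = create_top_n_models_comparison_plot(leaderboard_df, top_n=3, size_filter="some-size-bucket")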

def get_symbol_for_shape(shape_name):
    """Convert a shape name to a symbol character that can be used in annotations."""
    symbols = {
        "triangle-up": "▲",
        "square": "■",
        "circle": "●",
        "diamond": "◆",
        "star": "★",
        "pentagon": "⬟",
        "hexagon": "⬢",
        "cross": "✚",
        "x": "✖",
        "hourglass": "⧗",
    }
    return symbols.get(shape_name, "●")  # Default to a circle if the shape is not found
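
# Hedged sketch of how the legend entries in create_top_n_models_comparison_plot get decorated
# with their marker symbol (the model name is a made-up example):
#
#     symbol = get_symbol_for_shape("triangle-up")   # "▲"
#     legend_name = f"{symbol} org/model-a"          # "▲ org/model-a"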