from datasets import load_dataset, Dataset
import os
from datasets.utils.logging import disable_progress_bar
from constants import column_names, all_task_types
from utils_display import make_clickable_model
import random
import json

disable_progress_bar()

id_to_data = None
model_len_info = None

def estimated_win_rate(elo_a, elo_b):
    """
    Estimate the win rate of player B against player A from their Elo ratings.
    :param elo_a: Elo rating of player A (the reference model)
    :param elo_b: Elo rating of player B (the model being compared)
    :return: Estimated win rate of player B against player A, as a percentage
    """
    exponent = (elo_b - elo_a) / 400
    probability_a_wins = 1 / (1 + 10 ** exponent)
    return (1 - probability_a_wins) * 100
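
# Illustrative sanity check (not part of the original app): with a 400-point Elo
# gap the higher-rated player is expected to win about 90.9% of the time, so the
# value returned for the lower-rated challenger is ~9.09; equal ratings give 50.0.
def _demo_estimated_win_rate():
    assert round(estimated_win_rate(1400, 1000), 2) == 9.09  # B is 400 Elo below A
    assert estimated_win_rate(1200, 1200) == 50.0            # equal ratings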

# Formats the columns: strings pass through, numbers are rounded to 2 decimals.
def formatter(x):
    if isinstance(x, str):
        return x
    return round(x, 2)

def add_winrates(current_df):
    df = current_df.copy()
    elo_column = "Overall Elo"
    # Elo rating of the GPT-4 reference model (e.g., "gpt-4-0125-preview")
    model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
    # Elo rating of the GPT-3.5 reference model (e.g., "gpt-3.5-turbo-0125")
    model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]
    # Estimated win rate of every model against the two references, formatted
    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x)).apply(formatter)
    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x)).apply(formatter)
    # Move the "# battles" and "Length" columns to the end
    cols = list(df.columns)
    cols.remove("# battles"); cols.append("# battles")
    cols.remove("Length"); cols.append("Length")
    df = df[cols]
    return df
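
# Minimal usage sketch with hypothetical ratings; the real leaderboard frame has
# more columns, but add_winrates only relies on the ones shown here.
def _demo_add_winrates():
    import pandas as pd
    demo = pd.DataFrame({
        "Model": ["gpt-4-0125-preview", "gpt-3.5-turbo-0125", "other-model"],
        "Overall Elo": [1250.0, 1100.0, 1000.0],
        "# battles": [500, 500, 500],
        "Length": [220, 180, 300],
    })
    demo = add_winrates(demo)
    # The new columns hold each model's estimated win rate against the two
    # references, and "# battles" / "Length" are moved to the end.
    return demo[["Model", "Win% vs GPT-4", "Win% vs GPT-3.5T", "# battles", "Length"]]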

def add_winrates_tasks(current_df, ref="gpt-4"):
    new_df = current_df.copy()
    for t in all_task_types:
        column = column_names[t]
        # Elo of the reference model on this task
        ref_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
        # Replace the per-task Elo with the estimated win rate against the reference
        new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(ref_elo, x)).apply(formatter)
    return new_df

def post_processing(df, model_len_info):
    if model_len_info:
        # Attach the average output length of each model
        df["Length"] = df["model name "].apply(lambda x: model_len_info[x])
    for col in df.columns:
        if col == "model name ":
            # Render the model name as a clickable link
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # for numerical values
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="Overall Elo", inplace=True, ascending=False)
    # Put the "Model", "Overall Elo", and "Task-Avg Elo" columns at the front
    front_cols = ["Model", "Overall Elo", "Task-Avg Elo"]
    df = df[front_cols + [col for col in df.columns if col not in front_cols]]
    return df

def apply_length_penalty(original_df, ablation_df, length_penalty=0.2):
    """
    Replace every score x in original_df with z = x - y * length_penalty, where y is
    the value from ablation_df for the same model and column. The "Model", "# battles",
    and "Length" columns are left untouched, and rows are matched by model name rather
    than by position, so the two dataframes may be ordered differently.
    """
    original_df = original_df.copy()
    ablation_df = ablation_df.copy()
    for i, row in original_df.iterrows():
        ablation_row = ablation_df[ablation_df["Model"] == row["Model"]]
        # The model names must match between the two dataframes
        assert original_df.at[i, "Model"] == ablation_row["Model"].values[0]
        for col in original_df.columns:
            if col in ("Model", "# battles", "Length"):
                continue
            original_df[col] = original_df[col].astype(float)
            original_df.at[i, col] = original_df.at[i, col] - ablation_row[col].values[0] * length_penalty
    # Re-run post_processing so formatting, sorting, and column order stay consistent
    original_df = post_processing(original_df, model_len_info=None)
    return original_df
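
# Worked example of the penalty rule z = x - y * length_penalty (hypothetical
# numbers, applied per model and per score column): an Overall Elo of 1200 whose
# value in the length-ablation run is 1150 becomes 1200 - 1150 * 0.2 = 970 at the
# default penalty of 0.2.
def _demo_length_penalty_arithmetic():
    x, y, length_penalty = 1200.0, 1150.0, 0.2
    assert abs((x - y * length_penalty) - 970.0) < 1e-9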

def load_benchdata_dict():
    # Map each example's "idx" to its full record
    with open("data_dir/predictions_logs.jsonl", "r") as f:
        bench_data = [json.loads(line) for line in f]
    id_to_data = {}
    for item in bench_data:
        id_to_data[item["idx"]] = item
    return id_to_data


def load_eval_results():
    with open("data_dir/predictions_logs.jsonl", "r") as f:
        eval_results = [json.loads(line) for line in f]
    return eval_results
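
# Both loaders read the same JSONL log. Sketch of the record shape they expect,
# inferred from the fields accessed in sample_an_eval_result below (values are
# placeholders):
#   {"idx": 0, "model": "...", "task_type": "...", "question": "...",
#    "plan_prompts": [...], "ground_prompts": [...],
#    "image": "eval/aokvqa/images/val2017/..."}  # "image" is optional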

def sample_an_eval_result(eval_results, model_list=[]):
    global id_to_data
    eval_results = list(eval_results)
    random.shuffle(eval_results)
    result_dict = None  # returned as-is if no record matches model_list
    for eval_item in eval_results:
        model = eval_item['model']
        task_type = eval_item['task_type']  # primary task type
        if model not in model_list:
            continue
        plan_history = eval_item['plan_prompts']
        ground_history = eval_item['ground_prompts']
        task = eval_item['question']
        result_dict = {
            "session_id": eval_item['idx'],
            "task": task,
            "task_type": task_type,
            "plan_history": plan_history,
            "ground_history": ground_history,
            # Map the local evaluation image path to the path served by the app;
            # examples without an image get None
            "image": eval_item['image'].replace("eval/aokvqa/images/val2017/", "file/data_dir/test_images/") if "image" in eval_item else None,
        }
        break
    return result_dict

id_to_data = load_benchdata_dict()
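
# Illustrative call: pick a random evaluation record for one of the listed models;
# sample_an_eval_result returns None when no record matches.
def _demo_sample():
    results = load_eval_results()
    return sample_an_eval_result(results, model_list=["gpt-4-0125-preview"])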