Yeoni Rhee committed
Commit ac011bc · 1 Parent(s): 79cc507
Sotopia Task Submission
Files changed: sotopia_space/benchmark.py (+3 -37)
sotopia_space/benchmark.py
CHANGED
@@ -1,42 +1,13 @@
 import gradio as gr # type: ignore
 import pandas as pd
 from sotopia_space.constants import MODEL_OPTIONS
-from sotopia_space.utils import
+from sotopia_space.utils import apply_length_penalty
 
 LP_MODE = "v2"
 original_df, ablation_df = None, None
 LP_original_dfs = {}
 DEFAULT_LP = 0.5
 
-available_models = [] # to be filled in later
-original_df, ablation_df = None, None
-
-def slider_change_main(length_penalty):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
-    # adjusted_df = adjusted_df.drop(columns=["Length"])
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
-def slider_change_full(length_penalty, show_winrate):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    # sort the model by the "Task-Avg Elo" column
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
-    if show_winrate == "none":
-        adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-        return adjusted_df
-    elif show_winrate == "gpt-3.5":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
-    elif show_winrate == "gpt-4":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
 def benchmark_table():
     global original_df, ablation_df
     global LP_original_dfs, LP_MODE
@@ -44,7 +15,6 @@ def benchmark_table():
     gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
 
     with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
-        # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
         original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
         default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
         default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
@@ -52,10 +22,7 @@ def benchmark_table():
         default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
         with gr.Row():
             with gr.Column(scale=4):
-                gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
-            with gr.Column(scale=1):
-                length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-        # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
+                gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
         TYPES = ["number", "markdown", "number"]
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
@@ -66,5 +33,4 @@ def benchmark_table():
             interactive=False,
             visible=True,
             min_width=60,
-            )
-        #length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
+        )
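The call sites above fix the signature that sotopia_space.utils.apply_length_penalty has to expose, but its implementation is not part of this commit. The sketch below is only a signature-compatible placeholder inferred from those call sites; the module name and argument names come from the diff, while the body is an assumption, not the Space's actual penalty logic.

# Sketch only: the real apply_length_penalty lives in sotopia_space/utils.py
# and is not shown in this commit. Signature is inferred from benchmark.py;
# the body is a placeholder, not the repository's scoring logic.
import pandas as pd

def apply_length_penalty(original_df, ablation_df, length_penalty, mode="v2", LP_original_dfs=None):
    # mode and ablation_df are accepted only to match the call sites above.
    # If a precomputed leaderboard exists for this penalty value, reuse it.
    if LP_original_dfs and length_penalty in LP_original_dfs:
        return LP_original_dfs[length_penalty].copy()
    # Otherwise fall back to the unadjusted scores so the table still renders.
    return original_df.copy() if original_df is not None else pd.DataFrame()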
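Apart from the deletions, the only functional change in this commit is completing the broken import, so a quick local check is to mount benchmark_table() in a throwaway Gradio Blocks app. This is a hypothetical smoke test, not part of the repository, and it assumes data_dir/models_vs_gpt35.jsonl is present relative to the working directory, exactly as benchmark_table() expects.

# Hypothetical smoke test, not part of this commit: render the leaderboard
# tab locally to confirm that the apply_length_penalty import resolves.
import gradio as gr  # type: ignore

from sotopia_space.benchmark import benchmark_table

with gr.Blocks() as demo:
    with gr.Tabs():  # benchmark_table() opens a gr.TabItem, so give it a Tabs container
        benchmark_table()

if __name__ == "__main__":
    demo.launch()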