Spaces:
Runtime error
Runtime error
Leaderboard and Unified UI (#61)
Browse files* Update start_app.sh to use gradio instead of python app.py
* fixed action typing error
---------
Co-authored-by: Jasonqi146 <jasonqi146@gmail.com>
- README.md +13 -0
- app.py +65 -314
- data_dir/models_vs_gpt35.jsonl +4 -0
- requirements.txt +14 -14
- sotopia_space/_header.md +4 -0
- sotopia_space/benchmark.py +70 -0
- sotopia_space/chat.py +284 -0
- sotopia_space/constants.py +39 -0
- sotopia_space/utils.py +223 -0
- start_app.sh +1 -1
- ui_constants.py +191 -0
README.md
CHANGED
|
@@ -11,3 +11,16 @@ license: apache-2.0
|
|
| 11 |
---
|
| 12 |
|
| 13 |
This is a synced repository with a Huggingface Space for the Sotopia project [space](https://huggingface.co/spaces/wdplx/Sotopia-demo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
This is a synced repository with a Huggingface Space for the Sotopia project [space](https://huggingface.co/spaces/wdplx/Sotopia-demo)
|
| 14 |
+
|
| 15 |
+
## Getting Started
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
conda create -n sotopia-space python=3.11; conda activate sotopia-space
|
| 19 |
+
python -m pip install -r requirements.txt
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
To run the app, run the following command:
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
bash start_app.sh
|
| 26 |
+
```
|
app.py
CHANGED
|
@@ -1,332 +1,83 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
-
import json
|
| 4 |
from typing import Literal
|
| 5 |
|
| 6 |
-
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt
|
| 9 |
-
from functools import cache
|
| 10 |
-
from sotopia_pi_generate import prepare_model, generate_action
|
| 11 |
|
| 12 |
OPENAI_KEY_FILE="./openai_api.key"
|
| 13 |
if os.path.exists(OPENAI_KEY_FILE):
|
| 14 |
with open(OPENAI_KEY_FILE, "r") as f:
|
| 15 |
os.environ["OPENAI_API_KEY"] = f.read().strip()
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
TEMPERATURE = 0.7
|
| 20 |
-
TOP_P = 1
|
| 21 |
-
MAX_TOKENS = 1024
|
| 22 |
|
| 23 |
-
|
| 24 |
-
AGENT_PROFILES = "profiles/agent_profiles.jsonl"
|
| 25 |
-
RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl"
|
| 26 |
-
|
| 27 |
-
ACTION_TYPES = ['none', 'action', 'non-verbal communication', 'speak', 'leave']
|
| 28 |
-
|
| 29 |
-
MODEL_OPTIONS = [
|
| 30 |
-
"gpt-3.5-turbo",
|
| 31 |
-
"gpt-4",
|
| 32 |
-
"gpt-4-turbo",
|
| 33 |
-
"cmu-lti/sotopia-pi-mistral-7b-BC_SR",
|
| 34 |
-
"cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit",
|
| 35 |
-
"mistralai/Mistral-7B-Instruct-v0.1"
|
| 36 |
-
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 37 |
-
# "togethercomputer/llama-2-7b-chat",
|
| 38 |
-
# "togethercomputer/llama-2-70b-chat",
|
| 39 |
-
# "togethercomputer/mpt-30b-chat",
|
| 40 |
-
# "together_ai/togethercomputer/llama-2-7b-chat",
|
| 41 |
-
# "together_ai/togethercomputer/falcon-7b-instruct",
|
| 42 |
-
]
|
| 43 |
-
|
| 44 |
-
@cache
|
| 45 |
-
def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES):
|
| 46 |
-
with open(env_file, 'r') as f:
|
| 47 |
-
data = [json.loads(line) for line in f.readlines()]
|
| 48 |
-
|
| 49 |
-
code_names_count = defaultdict(int)
|
| 50 |
-
environments = []
|
| 51 |
-
environment_dict = {}
|
| 52 |
-
for profile in sorted(data, key=lambda x: x['codename']):
|
| 53 |
-
env_obj = Environment(profile)
|
| 54 |
-
if profile['codename'] in code_names_count:
|
| 55 |
-
environments.append((
|
| 56 |
-
"{}_{:05d}".format(profile['codename'],
|
| 57 |
-
code_names_count[profile['codename']]
|
| 58 |
-
),
|
| 59 |
-
env_obj._id
|
| 60 |
-
))
|
| 61 |
-
else:
|
| 62 |
-
environments.append((profile['codename'], env_obj._id))
|
| 63 |
-
environment_dict[env_obj._id] = env_obj
|
| 64 |
-
code_names_count[profile['codename']] += 1
|
| 65 |
-
|
| 66 |
-
with open(agent_file, 'r') as f:
|
| 67 |
-
data = [json.loads(line) for line in f.readlines()]
|
| 68 |
-
|
| 69 |
-
agent_dict = {}
|
| 70 |
-
for profile in data:
|
| 71 |
-
agent_obj = Agent(profile)
|
| 72 |
-
agent_dict[agent_obj._id] = agent_obj
|
| 73 |
-
|
| 74 |
-
with open(relationship_file, 'r') as f:
|
| 75 |
-
data = [json.loads(line) for line in f.readlines()]
|
| 76 |
-
|
| 77 |
-
relationship_dict = defaultdict(lambda : defaultdict(list))
|
| 78 |
-
for profile in data:
|
| 79 |
-
relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id'])
|
| 80 |
-
relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id'])
|
| 81 |
-
|
| 82 |
-
return environments, environment_dict, agent_dict, relationship_dict
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
def introduction():
|
| 86 |
with gr.Column(scale=2):
|
| 87 |
-
gr.
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
environment = environment_dict[environment_id]
|
| 116 |
-
|
| 117 |
-
user_agents_list = []
|
| 118 |
-
unique_agent_ids = set()
|
| 119 |
-
for x, _ in relationship_dict[environment.relationship].items():
|
| 120 |
-
unique_agent_ids.add(x)
|
| 121 |
-
|
| 122 |
-
for agent_id in unique_agent_ids:
|
| 123 |
-
user_agents_list.append((agent_dict[agent_id].name, agent_id))
|
| 124 |
-
return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection")
|
| 125 |
-
|
| 126 |
-
def create_bot_agent_dropdown(environment_id, user_agent_id):
|
| 127 |
-
_, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
|
| 128 |
-
environment, user_agent = environment_dict[environment_id], agent_dict[user_agent_id]
|
| 129 |
-
|
| 130 |
-
bot_agent_list = []
|
| 131 |
-
for neighbor_id in relationship_dict[environment.relationship][user_agent.agent_id]:
|
| 132 |
-
bot_agent_list.append((agent_dict[neighbor_id].name, neighbor_id))
|
| 133 |
-
|
| 134 |
-
return gr.Dropdown(choices=bot_agent_list, value=bot_agent_list[0][1] if bot_agent_list else None, label="Bot Agent Selection")
|
| 135 |
-
|
| 136 |
-
def create_environment_info(environment_dropdown):
|
| 137 |
-
_, environment_dict, _, _ = get_sotopia_profiles()
|
| 138 |
-
environment = environment_dict[environment_dropdown]
|
| 139 |
-
text = environment.scenario
|
| 140 |
-
return gr.Textbox(label="Scenario", lines=1, value=text)
|
| 141 |
-
|
| 142 |
-
def create_user_info(user_agent_dropdown):
|
| 143 |
-
_, _, agent_dict, _ = get_sotopia_profiles()
|
| 144 |
-
user_agent = agent_dict[user_agent_dropdown]
|
| 145 |
-
text = f"{user_agent.background} {user_agent.personality}"
|
| 146 |
-
return gr.Textbox(label="User Agent Profile", lines=4, value=text)
|
| 147 |
-
|
| 148 |
-
def create_bot_info(bot_agent_dropdown):
|
| 149 |
-
_, _, agent_dict, _ = get_sotopia_profiles()
|
| 150 |
-
bot_agent = agent_dict[bot_agent_dropdown]
|
| 151 |
-
text = f"{bot_agent.background} {bot_agent.personality}"
|
| 152 |
-
return gr.Textbox(label="Bot Agent Profile", lines=4, value=text)
|
| 153 |
-
|
| 154 |
-
def create_user_goal(environment_dropdown):
|
| 155 |
-
_, environment_dict, _, _ = get_sotopia_profiles()
|
| 156 |
-
text = environment_dict[environment_dropdown].agent_goals[0]
|
| 157 |
-
text = text.replace('(', '').replace(')', '')
|
| 158 |
-
if "<extra_info>" in text:
|
| 159 |
-
text = text.replace("<extra_info>", "\n\n")
|
| 160 |
-
text = text.replace("</extra_info>", "\n")
|
| 161 |
-
if "<strategy_hint>" in text:
|
| 162 |
-
text = text.replace("<strategy_hint>", "\n\n")
|
| 163 |
-
text = text.replace("</strategy_hint>", "\n")
|
| 164 |
-
return gr.Textbox(label="User Agent Goal", lines=4, value=text)
|
| 165 |
-
|
| 166 |
-
def create_bot_goal(environment_dropdown):
|
| 167 |
-
_, environment_dict, _, _ = get_sotopia_profiles()
|
| 168 |
-
text = environment_dict[environment_dropdown].agent_goals[1]
|
| 169 |
-
text = text.replace('(', '').replace(')', '')
|
| 170 |
-
if "<extra_info>" in text:
|
| 171 |
-
text = text.replace("<extra_info>", "\n\n")
|
| 172 |
-
text = text.replace("</extra_info>", "\n")
|
| 173 |
-
if "<strategy_hint>" in text:
|
| 174 |
-
text = text.replace("<strategy_hint>", "\n\n")
|
| 175 |
-
text = text.replace("</strategy_hint>", "\n")
|
| 176 |
-
return gr.Textbox(label="Bot Agent Goal", lines=4, value=text)
|
| 177 |
-
|
| 178 |
-
def sotopia_info_accordion(accordion_visible=True):
|
| 179 |
-
environments, _, _, _ = get_sotopia_profiles()
|
| 180 |
-
|
| 181 |
-
with gr.Accordion("Create your sotopia space!", open=accordion_visible):
|
| 182 |
-
with gr.Row():
|
| 183 |
-
environment_dropdown = gr.Dropdown(
|
| 184 |
-
choices=environments,
|
| 185 |
-
label="Scenario Selection",
|
| 186 |
-
value=environments[0][1] if environments else None,
|
| 187 |
-
interactive=True,
|
| 188 |
-
)
|
| 189 |
-
model_name_dropdown = gr.Dropdown(
|
| 190 |
-
choices=MODEL_OPTIONS,
|
| 191 |
-
value=DEFAULT_MODEL_SELECTION,
|
| 192 |
-
interactive=True,
|
| 193 |
-
label="Model Selection"
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
with gr.Row():
|
| 197 |
-
user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value)
|
| 198 |
-
bot_agent_dropdown = create_bot_agent_dropdown(environment_dropdown.value, user_agent_dropdown.value)
|
| 199 |
-
|
| 200 |
-
with gr.Accordion("Check your social task!", open=accordion_visible):
|
| 201 |
-
|
| 202 |
-
scenario_info_display = create_environment_info(environment_dropdown.value)
|
| 203 |
-
|
| 204 |
-
with gr.Row():
|
| 205 |
-
bot_goal_display = create_bot_goal(environment_dropdown.value)
|
| 206 |
-
user_goal_display = create_user_goal(environment_dropdown.value)
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
with gr.Row():
|
| 211 |
-
bot_agent_info_display = create_bot_info(bot_agent_dropdown.value)
|
| 212 |
-
user_agent_info_display = create_user_info(user_agent_dropdown.value)
|
| 213 |
-
|
| 214 |
-
# Update user dropdown when scenario changes
|
| 215 |
-
environment_dropdown.change(fn=create_user_agent_dropdown, inputs=[environment_dropdown], outputs=[user_agent_dropdown])
|
| 216 |
-
# Update bot dropdown when user or scenario changes
|
| 217 |
-
user_agent_dropdown.change(fn=create_bot_agent_dropdown, inputs=[environment_dropdown, user_agent_dropdown], outputs=[bot_agent_dropdown])
|
| 218 |
-
# Update scenario information when scenario changes
|
| 219 |
-
environment_dropdown.change(fn=create_environment_info, inputs=[environment_dropdown], outputs=[scenario_info_display])
|
| 220 |
-
# Update user agent profile when user changes
|
| 221 |
-
user_agent_dropdown.change(fn=create_user_info, inputs=[user_agent_dropdown], outputs=[user_agent_info_display])
|
| 222 |
-
# Update bot agent profile when bot changes
|
| 223 |
-
bot_agent_dropdown.change(fn=create_bot_info, inputs=[bot_agent_dropdown], outputs=[bot_agent_info_display])
|
| 224 |
-
# Update user goal when scenario changes
|
| 225 |
-
environment_dropdown.change(fn=create_user_goal, inputs=[environment_dropdown], outputs=[user_goal_display])
|
| 226 |
-
# Update bot goal when scenario changes
|
| 227 |
-
environment_dropdown.change(fn=create_bot_goal, inputs=[environment_dropdown], outputs=[bot_goal_display])
|
| 228 |
-
|
| 229 |
-
return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown
|
| 230 |
-
|
| 231 |
-
def instructions_accordion(instructions, according_visible=False):
|
| 232 |
-
with gr.Accordion("Instructions", open=False, visible=according_visible):
|
| 233 |
-
instructions = gr.Textbox(
|
| 234 |
-
lines=10,
|
| 235 |
-
value=instructions,
|
| 236 |
-
interactive=False,
|
| 237 |
-
placeholder="Instructions",
|
| 238 |
-
show_label=False,
|
| 239 |
-
max_lines=10,
|
| 240 |
-
visible=False,
|
| 241 |
-
)
|
| 242 |
-
return instructions
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
def chat_tab():
|
| 246 |
-
# history are input output pairs
|
| 247 |
-
_, environment_dict, agent_dict, _ = get_sotopia_profiles()
|
| 248 |
-
def run_chat(
|
| 249 |
-
message,
|
| 250 |
-
history,
|
| 251 |
-
environment_selection,
|
| 252 |
-
user_agent_dropdown,
|
| 253 |
-
bot_agent_dropdown,
|
| 254 |
-
model_selection:str
|
| 255 |
-
):
|
| 256 |
-
environment = environment_dict[environment_selection]
|
| 257 |
-
user_agent = agent_dict[user_agent_dropdown]
|
| 258 |
-
bot_agent = agent_dict[bot_agent_dropdown]
|
| 259 |
-
|
| 260 |
-
context = get_context_prompt(bot_agent, user_agent, environment)
|
| 261 |
-
dialogue_history, next_turn_idx = dialogue_history_prompt(message, history, user_agent, bot_agent)
|
| 262 |
-
prompt_history = f"{context}{dialogue_history}"
|
| 263 |
-
agent_action = generate_action(model_selection, prompt_history, next_turn_idx, ACTION_TYPES, bot_agent.name, TEMPERATURE)
|
| 264 |
-
return agent_action.to_natural_language()
|
| 265 |
-
|
| 266 |
-
with gr.Column():
|
| 267 |
-
with gr.Blocks():
|
| 268 |
-
model_name_dropdown, scenario_dropdown, user_agent_dropdown, bot_agent_dropdown = sotopia_info_accordion()
|
| 269 |
-
|
| 270 |
-
with gr.Column():
|
| 271 |
-
with gr.Accordion("Start the conversation to achieve your goal!", open=True):
|
| 272 |
-
gr.ChatInterface(
|
| 273 |
-
fn=run_chat,
|
| 274 |
-
chatbot=gr.Chatbot(
|
| 275 |
-
height=620,
|
| 276 |
-
render=False,
|
| 277 |
-
show_label=False,
|
| 278 |
-
rtl=False,
|
| 279 |
-
avatar_images=(
|
| 280 |
-
"images/profile1.jpg",
|
| 281 |
-
"images/profile2.jpg",
|
| 282 |
-
),
|
| 283 |
-
),
|
| 284 |
-
textbox=gr.Textbox(
|
| 285 |
-
placeholder="Write your message here...",
|
| 286 |
-
render=False,
|
| 287 |
-
scale=7,
|
| 288 |
-
rtl=False,
|
| 289 |
-
),
|
| 290 |
-
additional_inputs=[
|
| 291 |
-
scenario_dropdown,
|
| 292 |
-
user_agent_dropdown,
|
| 293 |
-
bot_agent_dropdown,
|
| 294 |
-
model_name_dropdown,
|
| 295 |
-
],
|
| 296 |
-
submit_btn="Send",
|
| 297 |
-
stop_btn="Stop",
|
| 298 |
-
retry_btn="🔄 Retry",
|
| 299 |
-
undo_btn="↩️ Delete",
|
| 300 |
-
clear_btn="🗑️ Clear",
|
| 301 |
-
)
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
def main():
|
| 305 |
-
with gr.Blocks(
|
| 306 |
-
css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;}
|
| 307 |
-
#chatbot {height: 600px; overflow: auto;}
|
| 308 |
-
#create_container {height: 750px; margin-left: 0px; margin-right: 0px;}
|
| 309 |
-
#tokenizer_renderer span {white-space: pre-wrap}
|
| 310 |
-
"""
|
| 311 |
-
) as demo:
|
| 312 |
-
with gr.Row():
|
| 313 |
-
introduction()
|
| 314 |
-
with gr.Row():
|
| 315 |
-
chat_tab()
|
| 316 |
-
|
| 317 |
-
return demo
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
def start_demo():
|
| 321 |
-
demo = main()
|
| 322 |
-
if DEPLOYED:
|
| 323 |
-
demo.queue(api_open=False).launch(show_api=False)
|
| 324 |
-
else:
|
| 325 |
-
demo.queue()
|
| 326 |
-
demo.launch(share=False, server_name="0.0.0.0")
|
| 327 |
|
| 328 |
|
| 329 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
| 330 |
get_sotopia_profiles()
|
| 331 |
# prepare_model(DEFAULT_MODEL_SELECTION)
|
| 332 |
-
|
|
|
|
| 1 |
import os
|
| 2 |
+
import argparse
|
|
|
|
| 3 |
from typing import Literal
|
| 4 |
|
| 5 |
+
import gradio as gr # type: ignore
|
| 6 |
+
from sotopia_space.chat import chat_introduction, chat_tab, get_sotopia_profiles
|
| 7 |
+
from sotopia_space import benchmark
|
| 8 |
+
from ui_constants import CITATION_TEXT, BANNER
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
OPENAI_KEY_FILE="./openai_api.key"
|
| 12 |
if os.path.exists(OPENAI_KEY_FILE):
|
| 13 |
with open(OPENAI_KEY_FILE, "r") as f:
|
| 14 |
os.environ["OPENAI_API_KEY"] = f.read().strip()
|
| 15 |
|
| 16 |
+
with open("./sotopia_space/_header.md", "r") as f:
|
| 17 |
+
HEADER_MD = f.read()
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
def navigation_bar():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
with gr.Column(scale=2):
|
| 21 |
+
toggle_dark = gr.Button(value="Toggle Dark")
|
| 22 |
+
toggle_dark.click(
|
| 23 |
+
None,
|
| 24 |
+
js="""
|
| 25 |
+
() => {
|
| 26 |
+
if (document.body.classList.contains('dark')) {
|
| 27 |
+
document.body.classList.remove('dark');
|
| 28 |
+
document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-light)';
|
| 29 |
+
} else {
|
| 30 |
+
document.body.classList.add('dark');
|
| 31 |
+
document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-dark)';
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
""",
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
with gr.Blocks(
|
| 38 |
+
css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;}
|
| 39 |
+
#chatbot {height: 600px; overflow: auto;}
|
| 40 |
+
#create_container {height: 750px; margin-left: 0px; margin-right: 0px;}
|
| 41 |
+
#tokenizer_renderer span {white-space: pre-wrap}
|
| 42 |
+
""",
|
| 43 |
+
theme="gradio/monochrome",
|
| 44 |
+
) as demo:
|
| 45 |
+
# with gr.Row():
|
| 46 |
+
# navigation_bar()
|
| 47 |
+
gr.Image(
|
| 48 |
+
"images/banner.png", elem_id="banner-image", show_label=False
|
| 49 |
)
|
| 50 |
+
gr.Markdown(HEADER_MD, elem_classes="markdown-text")
|
| 51 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 52 |
+
with gr.TabItem("🏅 Leaderboard", elem_id="benchmark-tab-table", id=0):
|
| 53 |
+
benchmark.benchmark_table()
|
| 54 |
+
with gr.TabItem("💬 Chat", elem_id="chat-tab-interface", id=1):
|
| 55 |
+
with gr.Row():
|
| 56 |
+
chat_introduction()
|
| 57 |
+
with gr.Row():
|
| 58 |
+
chat_tab()
|
| 59 |
+
with gr.Row():
|
| 60 |
+
with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
|
| 61 |
+
gr.Textbox(
|
| 62 |
+
value=CITATION_TEXT,
|
| 63 |
+
lines=7,
|
| 64 |
+
label="Copy the BibTeX snippet to cite this source",
|
| 65 |
+
elem_id="citation-button",
|
| 66 |
+
show_copy_button=True)
|
| 67 |
+
|
| 68 |
+
# def start_demo():
|
| 69 |
+
# demo = main()
|
| 70 |
+
# if DEPLOYED:
|
| 71 |
+
# demo.queue(api_open=False).launch(show_api=False)
|
| 72 |
+
# else:
|
| 73 |
+
# demo.queue()
|
| 74 |
+
# demo.launch(share=False, server_name="0.0.0.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
if __name__ == "__main__":
|
| 78 |
+
parser = argparse.ArgumentParser()
|
| 79 |
+
parser.add_argument("--result_file", help="Path to results table", default="data_dir/models_vs_gpt35.jsonl")
|
| 80 |
+
#benchmark.original_df = pd.read_json(args.result_file, lines=True)
|
| 81 |
get_sotopia_profiles()
|
| 82 |
# prepare_model(DEFAULT_MODEL_SELECTION)
|
| 83 |
+
demo.launch()
|
data_dir/models_vs_gpt35.jsonl
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28}
|
| 2 |
+
{"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15}
|
| 3 |
+
{"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1}
|
| 4 |
+
{"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17}
|
requirements.txt
CHANGED
|
@@ -8,7 +8,7 @@ annotated-types==0.6.0
|
|
| 8 |
anyio==3.7.1
|
| 9 |
attrs==23.2.0
|
| 10 |
beartype==0.14.1
|
| 11 |
-
bitsandbytes==0.
|
| 12 |
certifi==2024.2.2
|
| 13 |
cffi==1.16.0
|
| 14 |
charset-normalizer==3.3.2
|
|
@@ -68,18 +68,18 @@ mypy-extensions==1.0.0
|
|
| 68 |
names==0.3.0
|
| 69 |
networkx==3.3
|
| 70 |
numpy==1.26.4
|
| 71 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 72 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 73 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 74 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 75 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 76 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 77 |
-
nvidia-curand-cu12==10.3.2.106
|
| 78 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 79 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 80 |
-
nvidia-nccl-cu12==2.19.3
|
| 81 |
-
nvidia-nvjitlink-cu12==12.4.127
|
| 82 |
-
nvidia-nvtx-cu12==12.1.105
|
| 83 |
openai==1.22.0
|
| 84 |
orjson==3.10.1
|
| 85 |
packaging==23.2
|
|
@@ -129,7 +129,7 @@ toolz==0.12.1
|
|
| 129 |
torch==2.2.2
|
| 130 |
tqdm==4.66.2
|
| 131 |
transformers==4.40.0
|
| 132 |
-
triton==2.2.0
|
| 133 |
typer==0.12.3
|
| 134 |
types-cffi==1.16.0.20240331
|
| 135 |
types-pyOpenSSL==24.0.0.20240417
|
|
|
|
| 8 |
anyio==3.7.1
|
| 9 |
attrs==23.2.0
|
| 10 |
beartype==0.14.1
|
| 11 |
+
bitsandbytes==0.42.0
|
| 12 |
certifi==2024.2.2
|
| 13 |
cffi==1.16.0
|
| 14 |
charset-normalizer==3.3.2
|
|
|
|
| 68 |
names==0.3.0
|
| 69 |
networkx==3.3
|
| 70 |
numpy==1.26.4
|
| 71 |
+
# nvidia-cublas-cu12==12.1.3.1
|
| 72 |
+
# nvidia-cuda-cupti-cu12==12.1.105
|
| 73 |
+
# nvidia-cuda-nvrtc-cu12==12.1.105
|
| 74 |
+
# nvidia-cuda-runtime-cu12==12.1.105
|
| 75 |
+
# nvidia-cudnn-cu12==8.9.2.26
|
| 76 |
+
# nvidia-cufft-cu12==11.0.2.54
|
| 77 |
+
# nvidia-curand-cu12==10.3.2.106
|
| 78 |
+
# nvidia-cusolver-cu12==11.4.5.107
|
| 79 |
+
# nvidia-cusparse-cu12==12.1.0.106
|
| 80 |
+
# nvidia-nccl-cu12==2.19.3
|
| 81 |
+
# nvidia-nvjitlink-cu12==12.4.127
|
| 82 |
+
# nvidia-nvtx-cu12==12.1.105
|
| 83 |
openai==1.22.0
|
| 84 |
orjson==3.10.1
|
| 85 |
packaging==23.2
|
|
|
|
| 129 |
torch==2.2.2
|
| 130 |
tqdm==4.66.2
|
| 131 |
transformers==4.40.0
|
| 132 |
+
# triton==2.2.0
|
| 133 |
typer==0.12.3
|
| 134 |
types-cffi==1.16.0.20240331
|
| 135 |
types-pyOpenSSL==24.0.0.20240417
|
sotopia_space/_header.md
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<br/>
|
| 2 |
+
|
| 3 |
+
# Sotopia Space: A Huggingface Space for the Sotopia projects
|
| 4 |
+
[⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions)
|
sotopia_space/benchmark.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr # type: ignore
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sotopia_space.constants import MODEL_OPTIONS
|
| 4 |
+
from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty
|
| 5 |
+
|
| 6 |
+
LP_MODE = "v2"
|
| 7 |
+
original_df, ablation_df = None, None
|
| 8 |
+
LP_original_dfs = {}
|
| 9 |
+
DEFAULT_LP = 0.5
|
| 10 |
+
|
| 11 |
+
available_models = [] # to be filled in later
|
| 12 |
+
original_df, ablation_df = None, None
|
| 13 |
+
|
| 14 |
+
def slider_change_main(length_penalty):
|
| 15 |
+
global original_df, ablation_df, LP_MODE
|
| 16 |
+
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
|
| 17 |
+
adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
|
| 18 |
+
adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
|
| 19 |
+
# adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
|
| 20 |
+
# adjusted_df = adjusted_df.drop(columns=["Length"])
|
| 21 |
+
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
|
| 22 |
+
return adjusted_df
|
| 23 |
+
|
| 24 |
+
def slider_change_full(length_penalty, show_winrate):
|
| 25 |
+
global original_df, ablation_df, LP_MODE
|
| 26 |
+
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
|
| 27 |
+
# sort the model by the "Task-Avg Elo" column
|
| 28 |
+
adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
|
| 29 |
+
adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
|
| 30 |
+
if show_winrate == "none":
|
| 31 |
+
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
|
| 32 |
+
return adjusted_df
|
| 33 |
+
elif show_winrate == "gpt-3.5":
|
| 34 |
+
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
|
| 35 |
+
elif show_winrate == "gpt-4":
|
| 36 |
+
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
|
| 37 |
+
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
|
| 38 |
+
return adjusted_df
|
| 39 |
+
|
| 40 |
+
def benchmark_table():
|
| 41 |
+
global original_df, ablation_df
|
| 42 |
+
global LP_original_dfs, LP_MODE
|
| 43 |
+
|
| 44 |
+
gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
|
| 45 |
+
|
| 46 |
+
with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
|
| 47 |
+
# original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
|
| 48 |
+
original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
|
| 49 |
+
default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
|
| 50 |
+
default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
|
| 51 |
+
# add a Rank column to the first columnn (starting from 1)
|
| 52 |
+
default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
|
| 53 |
+
with gr.Row():
|
| 54 |
+
with gr.Column(scale=4):
|
| 55 |
+
gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
|
| 56 |
+
with gr.Column(scale=1):
|
| 57 |
+
length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
|
| 58 |
+
# checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
|
| 59 |
+
TYPES = ["number", "markdown", "number"]
|
| 60 |
+
leaderboard_table = gr.components.Dataframe(
|
| 61 |
+
value=default_main_df,
|
| 62 |
+
datatype=TYPES,
|
| 63 |
+
# max_rows=None,
|
| 64 |
+
height=1000,
|
| 65 |
+
elem_id="leaderboard-table",
|
| 66 |
+
interactive=False,
|
| 67 |
+
visible=True,
|
| 68 |
+
min_width=60,
|
| 69 |
+
)
|
| 70 |
+
#length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
|
sotopia_space/chat.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gradio as gr # type: ignore
|
| 3 |
+
# Functions for creating the chat interface
|
| 4 |
+
from functools import cache
|
| 5 |
+
from typing import Literal
|
| 6 |
+
import json
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt
|
| 9 |
+
from sotopia_pi_generate import prepare_model, generate_action
|
| 10 |
+
from sotopia_space.constants import MODEL_OPTIONS
|
| 11 |
+
|
| 12 |
+
DEPLOYED = os.getenv("DEPLOYED", "true").lower() == "true"
|
| 13 |
+
DEFAULT_MODEL_SELECTION = "gpt-3.5-turbo"
|
| 14 |
+
TEMPERATURE = 0.7
|
| 15 |
+
TOP_P = 1
|
| 16 |
+
MAX_TOKENS = 1024
|
| 17 |
+
|
| 18 |
+
ENVIRONMENT_PROFILES = "profiles/environment_profiles.jsonl"
|
| 19 |
+
AGENT_PROFILES = "profiles/agent_profiles.jsonl"
|
| 20 |
+
RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl"
|
| 21 |
+
|
| 22 |
+
Action = Literal['none', 'action', 'non-verbal communication', 'speak', 'leave']
|
| 23 |
+
ACTION_TYPES: list[Action] = ['none', 'action', 'non-verbal communication', 'speak', 'leave']
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@cache
def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES):
    """Load and index environment, agent, and relationship profiles from JSONL files.

    Memoized with @cache, so each distinct file triple is read at most once per
    process; callers share the returned (mutable) containers and must not
    modify them.

    Returns a 4-tuple:
        environments: list of (display_name, environment_id), sorted by
            codename. Duplicate codenames are disambiguated with a zero-padded
            suffix ("name_00001", ...); the first occurrence keeps the plain
            codename.
        environment_dict: environment_id -> Environment
        agent_dict: agent_id -> Agent
        relationship_dict: relationship type -> agent_id -> list of related agent_ids
    """
    with open(env_file, 'r') as f:
        env_profiles = [json.loads(line) for line in f]

    code_names_count = defaultdict(int)
    environments = []
    environment_dict = {}
    for profile in sorted(env_profiles, key=lambda x: x['codename']):
        codename = profile['codename']
        env_obj = Environment(profile)
        # First occurrence keeps the bare codename; repeats get a counter suffix.
        if codename in code_names_count:
            display_name = "{}_{:05d}".format(codename, code_names_count[codename])
        else:
            display_name = codename
        environments.append((display_name, env_obj._id))
        environment_dict[env_obj._id] = env_obj
        code_names_count[codename] += 1

    with open(agent_file, 'r') as f:
        agent_profiles = [json.loads(line) for line in f]

    agent_dict = {}
    for profile in agent_profiles:
        agent_obj = Agent(profile)
        agent_dict[agent_obj._id] = agent_obj

    with open(relationship_file, 'r') as f:
        relationship_profiles = [json.loads(line) for line in f]

    # Relationships are symmetric: record each pair in both directions.
    relationship_dict = defaultdict(lambda: defaultdict(list))
    for profile in relationship_profiles:
        relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id'])
        relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id'])

    return environments, environment_dict, agent_dict, relationship_dict
|
| 66 |
+
|
| 67 |
+
def chat_introduction():
    """Render the banner image and introductory markdown for the chat page.

    Must be called inside an open gr.Row()/gr.Blocks() context; the two
    columns split the row 2:5 between the image and the text.
    """
    with gr.Column(scale=2):
        gr.Image(
            "images/sotopia.jpg", elem_id="banner-image", show_label=False
        )
    with gr.Column(scale=5):
        gr.Markdown(
            """# Sotopia Space
            **Chat with different social agent models including [sotopia-pi](https://github.com/sotopia-lab/sotopia-pi), GPT and so on in sotopia space!**

            ➡️️ **Intended Use**: Sotopia space is intended to showcase the social intelligence ability of different social agents in interesting social scenarios.

            ✨ **Guidance**:

            Step (1) Select a social scenario that interests you in "Scenario Selection"

            Step (2) Select a social agent you want to chat with in "Model Selection"

            Step (3) Select which character you and your social agent will play in the scenario in "User Agent Selection" and "Bot Agent Selection"

            Step (4) Negotiate/debate/cooperate with the social agent to see whether your goal or their social goal can be achieved.

            ⚠️ **Limitations**: The social agent can and will produce factually incorrect information, hallucinating facts and potentially offensive actions. It can produce problematic outputs, especially if prompted to do so.

            🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
            """
        )
    # with gr.Column(scale=1):
    #     toggle_dark = gr.Button(value="Toggle Dark")
|
| 96 |
+
|
| 97 |
+
def create_user_agent_dropdown(environment_id):
    """Build the user-agent dropdown for the given scenario.

    Candidates are every agent that appears under the scenario's relationship
    type. Returns a gr.Dropdown whose value defaults to the first candidate
    (or None when there are none).
    """
    _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
    environment = environment_dict[environment_id]

    # The relationship mapping's keys are already unique agent ids; iterating
    # them directly (instead of copying into a set) keeps insertion order, so
    # the default selection is deterministic across runs.
    user_agents_list = [
        (agent_dict[agent_id].name, agent_id)
        for agent_id in relationship_dict[environment.relationship]
    ]
    return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection")
|
| 109 |
+
|
| 110 |
+
def create_bot_agent_dropdown(environment_id, user_agent_id):
    """Build the bot-agent dropdown: agents related to the chosen user agent
    under the selected scenario's relationship type."""
    _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
    environment = environment_dict[environment_id]
    user_agent = agent_dict[user_agent_id]

    neighbors = relationship_dict[environment.relationship][user_agent.agent_id]
    candidates = [(agent_dict[neighbor_id].name, neighbor_id) for neighbor_id in neighbors]

    default_choice = candidates[0][1] if candidates else None
    return gr.Dropdown(choices=candidates, value=default_choice, label="Bot Agent Selection")
|
| 119 |
+
|
| 120 |
+
def create_environment_info(environment_dropdown):
    """Show the selected scenario's description in a one-line textbox."""
    _, environment_dict, _, _ = get_sotopia_profiles()
    scenario_text = environment_dict[environment_dropdown].scenario
    return gr.Textbox(label="Scenario", lines=1, value=scenario_text)
|
| 125 |
+
|
| 126 |
+
def create_user_info(user_agent_dropdown):
    """Show the selected user agent's background and personality."""
    _, _, agent_dict, _ = get_sotopia_profiles()
    selected = agent_dict[user_agent_dropdown]
    profile_text = f"{selected.background} {selected.personality}"
    return gr.Textbox(label="User Agent Profile", lines=4, value=profile_text)
|
| 131 |
+
|
| 132 |
+
def create_bot_info(bot_agent_dropdown):
    """Show the selected bot agent's background and personality."""
    _, _, agent_dict, _ = get_sotopia_profiles()
    selected = agent_dict[bot_agent_dropdown]
    profile_text = f"{selected.background} {selected.personality}"
    return gr.Textbox(label="Bot Agent Profile", lines=4, value=profile_text)
|
| 137 |
+
|
| 138 |
+
def create_user_goal(environment_dropdown):
    """Show the user-side social goal (agent_goals[0]) for the scenario.

    Strips parentheses and turns the <extra_info>/<strategy_hint> markup tags
    into blank-line separators for readability.
    """
    _, environment_dict, _, _ = get_sotopia_profiles()
    text = environment_dict[environment_dropdown].agent_goals[0]
    text = text.replace('(', '').replace(')', '')
    # str.replace is a no-op when the tag is absent, so no membership guards
    # are needed (the original `if "<tag>" in text:` checks were redundant).
    text = text.replace("<extra_info>", "\n\n").replace("</extra_info>", "\n")
    text = text.replace("<strategy_hint>", "\n\n").replace("</strategy_hint>", "\n")
    return gr.Textbox(label="User Agent Goal", lines=4, value=text)
|
| 149 |
+
|
| 150 |
+
def create_bot_goal(environment_dropdown):
    """Show the bot-side social goal (agent_goals[1]) for the scenario.

    Strips parentheses and turns the <extra_info>/<strategy_hint> markup tags
    into blank-line separators for readability.
    """
    _, environment_dict, _, _ = get_sotopia_profiles()
    text = environment_dict[environment_dropdown].agent_goals[1]
    text = text.replace('(', '').replace(')', '')
    # str.replace is a no-op when the tag is absent, so no membership guards
    # are needed (the original `if "<tag>" in text:` checks were redundant).
    text = text.replace("<extra_info>", "\n\n").replace("</extra_info>", "\n")
    text = text.replace("<strategy_hint>", "\n\n").replace("</strategy_hint>", "\n")
    return gr.Textbox(label="Bot Agent Goal", lines=4, value=text)
|
| 161 |
+
|
| 162 |
+
def sotopia_info_accordion(accordion_visible=True):
    """Build the scenario/model/agent selection UI and wire its update events.

    Returns the four dropdown components (model, scenario, user agent, bot
    agent) so chat_tab can pass them to the ChatInterface as extra inputs.
    Creation order matters: the agent dropdowns and info displays are seeded
    from the scenario dropdown's initial value.
    """
    environments, _, _, _ = get_sotopia_profiles()

    with gr.Accordion("Create your sotopia space!", open=accordion_visible):
        with gr.Row():
            environment_dropdown = gr.Dropdown(
                choices=environments,
                label="Scenario Selection",
                value=environments[0][1] if environments else None,
                interactive=True,
            )
            model_name_dropdown = gr.Dropdown(
                choices=MODEL_OPTIONS,
                value=DEFAULT_MODEL_SELECTION,
                interactive=True,
                label="Model Selection"
            )

        with gr.Row():
            # Seeded from the default scenario so the initial choices are
            # consistent with the selection above.
            user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value)
            bot_agent_dropdown = create_bot_agent_dropdown(environment_dropdown.value, user_agent_dropdown.value)

    with gr.Accordion("Check your social task!", open=accordion_visible):

        scenario_info_display = create_environment_info(environment_dropdown.value)

        with gr.Row():
            bot_goal_display = create_bot_goal(environment_dropdown.value)
            user_goal_display = create_user_goal(environment_dropdown.value)

        with gr.Row():
            bot_agent_info_display = create_bot_info(bot_agent_dropdown.value)
            user_agent_info_display = create_user_info(user_agent_dropdown.value)

    # Update user dropdown when scenario changes
    environment_dropdown.change(fn=create_user_agent_dropdown, inputs=[environment_dropdown], outputs=[user_agent_dropdown])
    # Update bot dropdown when user or scenario changes
    user_agent_dropdown.change(fn=create_bot_agent_dropdown, inputs=[environment_dropdown, user_agent_dropdown], outputs=[bot_agent_dropdown])
    # Update scenario information when scenario changes
    environment_dropdown.change(fn=create_environment_info, inputs=[environment_dropdown], outputs=[scenario_info_display])
    # Update user agent profile when user changes
    user_agent_dropdown.change(fn=create_user_info, inputs=[user_agent_dropdown], outputs=[user_agent_info_display])
    # Update bot agent profile when bot changes
    bot_agent_dropdown.change(fn=create_bot_info, inputs=[bot_agent_dropdown], outputs=[bot_agent_info_display])
    # Update user goal when scenario changes
    environment_dropdown.change(fn=create_user_goal, inputs=[environment_dropdown], outputs=[user_goal_display])
    # Update bot goal when scenario changes
    environment_dropdown.change(fn=create_bot_goal, inputs=[environment_dropdown], outputs=[bot_goal_display])

    return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown
|
| 214 |
+
|
| 215 |
+
def instructions_accordion(instructions, according_visible=False):
    """Render *instructions* inside a collapsed (and normally hidden) accordion.

    Returns the gr.Textbox component so callers can wire events to it.
    """
    with gr.Accordion("Instructions", open=False, visible=according_visible):
        instructions_box = gr.Textbox(
            value=instructions,
            lines=10,
            max_lines=10,
            interactive=False,
            show_label=False,
            placeholder="Instructions",
            visible=False,
        )
    return instructions_box
|
| 227 |
+
|
| 228 |
+
def chat_tab():
    """Build the chat tab: scenario/model/agent pickers plus a gr.ChatInterface."""
    # history are input output pairs
    _, environment_dict, agent_dict, _ = get_sotopia_profiles()
    def run_chat(
        message,
        history,
        environment_selection,
        user_agent_dropdown,
        bot_agent_dropdown,
        model_selection:str
    ):
        # Resolve the dropdown selections (ids) into profile objects.
        environment = environment_dict[environment_selection]
        user_agent = agent_dict[user_agent_dropdown]
        bot_agent = agent_dict[bot_agent_dropdown]

        # Prompt = scenario/goal context followed by the dialogue so far.
        context = get_context_prompt(bot_agent, user_agent, environment)
        dialogue_history, next_turn_idx = dialogue_history_prompt(message, history, user_agent, bot_agent)
        prompt_history = f"{context}{dialogue_history}"
        agent_action = generate_action(model_selection, prompt_history, next_turn_idx, ACTION_TYPES, bot_agent.name, TEMPERATURE)
        return agent_action.to_natural_language()

    with gr.Column():
        with gr.Blocks():
            model_name_dropdown, scenario_dropdown, user_agent_dropdown, bot_agent_dropdown = sotopia_info_accordion()

            with gr.Column():
                with gr.Accordion("Start the conversation to achieve your goal!", open=True):
                    gr.ChatInterface(
                        fn=run_chat,
                        chatbot=gr.Chatbot(
                            height=620,
                            render=False,
                            show_label=False,
                            rtl=False,
                            avatar_images=(
                                "images/profile1.jpg",
                                "images/profile2.jpg",
                            ),
                        ),
                        textbox=gr.Textbox(
                            placeholder="Write your message here...",
                            render=False,
                            scale=7,
                            rtl=False,
                        ),
                        # Order must match run_chat's parameters after
                        # (message, history).
                        additional_inputs=[
                            scenario_dropdown,
                            user_agent_dropdown,
                            bot_agent_dropdown,
                            model_name_dropdown,
                        ],
                        submit_btn="Send",
                        stop_btn="Stop",
                        retry_btn="🔄 Retry",
                        undo_btn="↩️ Delete",
                        clear_btn="🗑️ Clear",
                    )
|
sotopia_space/constants.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Models selectable in the chat tab's "Model Selection" dropdown; entries are
# passed verbatim to sotopia_pi_generate.generate_action.
MODEL_OPTIONS = [
    "gpt-3.5-turbo",
    "gpt-4",
    "gpt-4-turbo",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit",
    "mistralai/Mistral-7B-Instruct-v0.1"
    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # "togethercomputer/llama-2-7b-chat",
    # "togethercomputer/llama-2-70b-chat",
    # "togethercomputer/mpt-30b-chat",
    # "together_ai/togethercomputer/llama-2-7b-chat",
    # "together_ai/togethercomputer/falcon-7b-instruct",
]

# Leaderboard display metadata, keyed by internal model id.
# "pretty_name" is the display label; "hf_model_id" is either a Hugging Face
# repo id (rendered as a hub link) or a full http(s) URL for closed models
# (see utils.make_clickable_model).
MODEL_INFO = {
    "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
    "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
    "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
}
|
sotopia_space/utils.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset, Dataset
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from datasets.utils.logging import disable_progress_bar # type: ignore
|
| 6 |
+
from ui_constants import column_names, all_task_types
|
| 7 |
+
import random
|
| 8 |
+
disable_progress_bar()
|
| 9 |
+
import math
|
| 10 |
+
from sotopia_space.constants import MODEL_INFO
|
| 11 |
+
|
| 12 |
+
# Module-level caches; None until populated by a caller.
# NOTE(review): load_benchdata_dict() RETURNS a dict but never assigns this
# global, and the assignment at the bottom of the module is commented out --
# sample_an_eval_result will crash on a None id_to_data unless a caller sets it.
id_to_data = None
model_len_info = None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def make_clickable_model(model_name):
    """Render *model_name* as styled HTML using MODEL_INFO metadata.

    Closed models (whose hf_model_id is already a full URL) get a lock icon and
    link to that URL; open models get a fire icon and link to their Hugging
    Face repo page. Unknown model names are returned unchanged.
    """
    # No `global` needed: MODEL_INFO is only read, never rebound.
    info = MODEL_INFO.get(model_name)
    if info is None:
        return model_name
    if info["hf_model_id"].startswith("http"):
        icon, link = "🔒", info["hf_model_id"]
    else:
        icon, link = "🔥", f"https://huggingface.co/{info['hf_model_id']}"
    return (
        f'{icon} <a target="_blank" href="{link}" '
        f'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{info["pretty_name"]}</a>'
    )
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _styled_paragraph(color, body):
    """Shared markup for the colored, centered status messages below."""
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{body}</p>"

def styled_error(error):
    """Format *error* as a red, centered HTML paragraph."""
    return _styled_paragraph("red", error)

def styled_warning(warn):
    """Format *warn* as an orange, centered HTML paragraph."""
    return _styled_paragraph("orange", warn)

def styled_message(message):
    """Format *message* as a green, centered HTML paragraph."""
    return _styled_paragraph("green", message)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def estimated_win_rate(elo_a, elo_b, LP=0):
    """Estimated win percentage of player B against player A from Elo ratings.

    Despite the historical docstring ("win rate for player A"), this returns
    ``100 * (1 - P(A wins))`` -- i.e. player B's expected win percentage --
    which is how the leaderboard callers use it: add_winrates passes the
    reference model (GPT-4 / GPT-3.5) as player A and each row's Elo as
    player B, producing that row's "Win% vs GPT-*".

    :param elo_a: Elo rating of the reference player A
    :param elo_b: Elo rating of player B
    :param LP: length-penalty exponent; scales the rating gap by 10**LP
    :return: estimated win percentage (0-100) for player B
    """
    exponent = (elo_b - elo_a) * (10**LP) / 400
    probability_a_wins = 1 / (1 + 10 ** exponent)
    return (1 - probability_a_wins) * 100
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Formats the columns
|
| 53 |
+
def formatter(x):
    """Format a leaderboard cell: strings pass through, numbers round to 1 dp."""
    # isinstance instead of `type(x) is str`; the original also had a dead
    # `x = x` branch for the string case.
    return x if isinstance(x, str) else round(x, 1)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def add_winrates(current_df, LP=0):
    """Append 'Win% vs GPT-4' and 'Win% vs GPT-3.5T' columns derived from Elo.

    The reference Elo for each column is taken from the first row whose Model
    cell contains "gpt-4" / "gpt-3.5" (substring match on the rendered name);
    raises IndexError if no such row exists. Assumes current_df has "Model",
    "Task-Avg Elo", "# battles" and "Length" columns. Returns a new dataframe;
    the input is not modified.
    """
    df = current_df.copy()
    elo_column = "Task-Avg Elo"

    # Reference Elo for the GPT-4 family row.
    model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]

    # Reference Elo for the GPT-3.5 family row.
    model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]

    # estimated_win_rate(ref, x) yields the row model's win rate against the
    # reference; formatter rounds to one decimal place.
    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
    # Move "# battles" and "Length" to the end so the win-rate columns sit
    # next to the Elo columns.
    cols = list(df.columns)
    cols.remove("# battles"); cols.append("# battles")
    cols.remove("Length"); cols.append("Length")
    df = df[cols]
    return df
|
| 81 |
+
|
| 82 |
+
def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
    """Replace each per-task Elo column with the win rate vs the *ref* model.

    NOTE(review): iterates ui_constants.all_task_types, which lists WildBench
    task categories -- confirm those columns actually exist in the sotopia
    leaderboard dataframe before relying on this helper.
    """
    new_df = current_df.copy()
    for t in all_task_types:
        column = column_names[t]
        # Elo of the reference model in this task column (first substring match).
        model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
        new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
    return new_df
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def post_processing(df, model_len_info):
    """Format a raw leaderboard dataframe for display (mutates *df* in place).

    Steps: add a "Length" column from model_len_info (if provided), render
    model names as HTML links, round numeric cells to one decimal, rename
    columns via ui_constants.column_names, sort by "Task-Avg Elo" descending,
    and move "Model" / "Task-Avg Elo" to the front.
    """
    if model_len_info:
        df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])

    for col in df.columns:
        if col == "model name ":
            # Note the trailing space in "model name " -- it is the raw key
            # before the column_names rename below.
            df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
        else:
            df[col] = df[col].apply(formatter) # For numerical values
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
    # Put the "Model" and "Task-Avg Elo" columns first; the rest keep their order.
    df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
    return df
|
| 106 |
+
|
| 107 |
+
def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
    """
    Temporarily disable the length penalty feature
    if mode == 'v2' and LP_original_dfs is not None:
        L = f"{length_penalty:.1f}"
        return LP_original_dfs[L]
    original_df = original_df.copy()
    ablation_df = ablation_df.copy()
    # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
    # except for the "Model" column and the "# battles" column
    # do not assume the order of the rows are the same in both dataframes
    for i, row in original_df.iterrows():
        for col in original_df.columns:
            if col == "Model" or col == "# battles" or col == "Length":
                continue
            # assert that the model names are the same in both dataframes
            assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
            original_df[col] = original_df[col].astype(float)
            if mode == "v1":
                original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
            elif mode == "v1.1":
                diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
                original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
    # post_processing
    original_df = post_processing(original_df, model_len_info=None)
    """
    # Length penalty is currently disabled: the whole implementation above is
    # parked inside the docstring, and the input dataframe is returned
    # unchanged regardless of the other arguments.
    return original_df
|
| 134 |
+
|
| 135 |
+
def load_benchdata():
    """Download the sotopia test split from the Hugging Face Hub (network I/O)."""
    print("Loading sotopia data...")
    bench_data = load_dataset("cmu-lti/sotopia", split="test")
    return bench_data
|
| 139 |
+
|
| 140 |
+
def load_benchdata_dict():
    """Load sotopia episodes and index them by session_id (network I/O).

    NOTE(review): this returns a fresh dict; it does NOT populate the
    module-level ``id_to_data`` global that sample_an_eval_result reads --
    callers must assign the result themselves (see the commented-out
    assignment at the bottom of this module).
    """
    print("Loading sotopia data....")
    bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
    id_to_data = {}
    for item in bench_data:
        id_to_data[item["session_id"]] = item
    return id_to_data
|
| 147 |
+
|
| 148 |
+
def load_eval_results():
    """Download pairwise evaluation records from the Hub (network I/O).

    NOTE(review): pulls from the WildEval org rather than a sotopia dataset --
    verify this is the intended source.
    """
    print("Loading sotopia Evaluation data...")
    eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
    return eval_results
|
| 152 |
+
|
| 153 |
+
def load_infer_results(model_name):
    """Download a model's inference results from the Hub (network I/O).

    NOTE(review): pulls from the WildEval org rather than a sotopia dataset --
    verify this is the intended source.
    """
    print(f"Loading sotopia Results for {model_name}...")
    infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
    return infer_results
|
| 157 |
+
|
| 158 |
+
def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
    """Pick a random evaluation record that matches the model/tag filters.

    Shuffles a copy of *eval_results* and returns a display-ready dict for the
    first item that (a) involves the requested models, (b) shares at least one
    tag with tag_list (when given), and (c) has non-empty outputs on both
    sides. Requires the module global ``id_to_data`` to be populated (see
    load_benchdata_dict). Returns None when no item matches -- the original
    raised UnboundLocalError in that case.

    NOTE(review): mutable default arguments are kept for signature
    compatibility; both lists are only read, never mutated.
    """
    global id_to_data
    result_dict = None  # fix: defined even when every item is filtered out
    eval_results = list(eval_results)
    random.shuffle(eval_results)
    for eval_item in eval_results:
        # Map the blinded A/B assignment back to concrete model names.
        assignment = eval_item['assignment']
        model_1, model_2 = eval_item['model_1'], eval_item['model_2']
        model_A = model_1 if assignment['A'] == model_1 else model_2
        model_B = model_2 if assignment['B'] == model_2 else model_1
        # Model filter: with >=2 names both sides must match; with exactly one
        # name, either side may match.
        if len(model_list) >= 2:
            if model_A not in model_list or model_B not in model_list:
                continue
        elif len(model_list) == 1:
            if model_A != model_list[0] and model_B != model_list[0]:
                continue
        if tag_list:
            if set(tag_list).isdisjoint(set(eval_item['tags'])):
                continue
        winner = eval_item['winner']
        task_type = eval_item['tags'][0]  # primary task type
        chat_history = eval_item['history']
        last_query = eval_item['last_query']

        model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
        model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']

        # Skip degenerate items where either side produced nothing.
        if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
            continue

        conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]

        result_dict = {
            "session_id": eval_item['session_id'],
            "model_A": model_A,
            "model_B": model_B,
            "winner": winner,
            "intent": id_to_data[eval_item['session_id']]["intent"],
            "task_type": task_type,
            "all_tags": eval_item['tags'],
            "chat_history": chat_history,
            "last_query": last_query,
            "conversation_input": conversation_input,
            "model_A_output": model_A_output,
            "model_B_output": model_B_output,
            "reason": eval_item['parsed_result']["reason"],
            "choice": eval_item['parsed_result']["choice"],
            "checklist": id_to_data[eval_item['session_id']]["checklist"],
        }
        break
    return result_dict
|
| 222 |
+
|
| 223 |
+
#id_to_data = load_benchdata_dict()
|
start_app.sh
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
export OPENAI_API_KEY=$(cat openai_api.key)
|
| 2 |
export HF_TOKEN=$(cat hf_token.key)
|
| 3 |
|
| 4 |
-
|
|
|
|
| 1 |
export OPENAI_API_KEY=$(cat openai_api.key)
|
| 2 |
export HF_TOKEN=$(cat hf_token.key)
|
| 3 |
|
| 4 |
+
gradio app.py
|
ui_constants.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
# Default length-penalty value for the leaderboard slider.
DEFAULT_LP = 0.5

# Banner image served from the sotopia website repo.
banner_url = "https://github.com/sotopia-lab/sotopia-website/blob/main/public/bg_xl.png" # the same repo here.
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'

# Page heading. Fix: the original markup closed the <h1> with a stray </b>
# and never emitted </h1>, leaving the heading element unterminated.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 sotopia Leaderboard </h1> </body> </html>"

WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/sotopia-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
|
| 11 |
+
|
| 12 |
+
CITATION_TEXT = """@inproceedings{
|
| 13 |
+
zhou2024sotopia,
|
| 14 |
+
title={{SOTOPIA}: Interactive Evaluation for Social Intelligence in Language Agents},
|
| 15 |
+
author={Xuhui Zhou and Hao Zhu and Leena Mathur and Ruohong Zhang and Haofei Yu and Zhengyang Qi and Louis-Philippe Morency and Yonatan Bisk and Daniel Fried and Graham Neubig and Maarten Sap},
|
| 16 |
+
booktitle={The Twelfth International Conference on Learning Representations},
|
| 17 |
+
year={2024},
|
| 18 |
+
url={https://openreview.net/forum?id=mM7VurbA4r}
|
| 19 |
+
}
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
column_names = {
|
| 24 |
+
"model name ": "Model",
|
| 25 |
+
"elo overall": "Overall Elo",
|
| 26 |
+
'Information seeking': 'InfoSek',
|
| 27 |
+
'Creative Writing': 'CrtWrt',
|
| 28 |
+
'Coding & Debugging': 'Code',
|
| 29 |
+
'Reasoning': 'Reason',
|
| 30 |
+
'Editing': 'Edit',
|
| 31 |
+
'Math': 'Math',
|
| 32 |
+
'Planning': 'Plan',
|
| 33 |
+
'Brainstorming': 'Brnstrm',
|
| 34 |
+
'Role playing': 'RolPly',
|
| 35 |
+
'Advice seeking': 'AdvSek',
|
| 36 |
+
'Data Analysis': 'DataAna',
|
| 37 |
+
'Others': 'Misc',
|
| 38 |
+
"average": "Task-Avg Elo",
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
all_task_types = [
|
| 42 |
+
'Information seeking',
|
| 43 |
+
'Creative Writing',
|
| 44 |
+
'Coding & Debugging',
|
| 45 |
+
'Reasoning',
|
| 46 |
+
'Editing',
|
| 47 |
+
'Math',
|
| 48 |
+
'Planning',
|
| 49 |
+
'Brainstorming',
|
| 50 |
+
'Role playing',
|
| 51 |
+
'Advice seeking',
|
| 52 |
+
'Data Analysis',
|
| 53 |
+
'Others'
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# JS injected into the Gradio page: force the light theme by rewriting the
# `__theme` query parameter and reloading when it is anything else.
js_light = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""
# JS injected into the Gradio page: scroll every chat bubble container back to
# the top, staggered 100ms apart so the scrolls happen sequentially.
js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100); // Delay of 100ms between each iteration
    });
}
"""
# Markdown legend explaining the short task aliases shown in the table header.
TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
# Custom stylesheet passed to the Gradio app: hides the footer, sizes the
# markdown/table text, and styles the leaderboard buttons, chat panes, and plot.
# NOTE(review): `font-color` in .markdown-text-tiny-red is not a real CSS
# property (the `color: red` above it already applies) — kept for parity, but
# it could be dropped.
css = """
code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-color: red;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
.sample_button{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    font-size: 15pt;
    font-weight: bold;
    margin: 5px;
}
.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}
#select-models span{
    font-size: 10pt;
}
#select-tasks span{
    font-size: 10pt;
}
.markdown-text-details{
    margin: 10px;
    padding: 10px;
}
button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}
.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}
"""