refine update logic
app.py CHANGED
@@ -28,9 +28,6 @@ openai_client = OpenAI(api_key=api_key, base_url=base_url)
 # Timeout in seconds for model responses
 TIMEOUT = 90
 
-# leaderboard data
-leaderboard_data = None
-
 # Hint string constant
 SHOW_HINT_STRING = True  # Set to False to hide the hint string altogether
 HINT_STRING = "Once signed in, your votes will be recorded securely."

@@ -252,10 +249,7 @@ def chat_with_models(
 
     def request_model_response():
         try:
-            request_params = {
-                "model": model_name,
-                "messages": truncated_input
-            }
+            request_params = {"model": model_name, "messages": truncated_input}
             response = openai_client.chat.completions.create(**request_params)
             model_response["content"] = response.choices[0].message.content
         except Exception as e:

@@ -366,89 +360,94 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
         raise Exception("Error loading feedback data from Hugging Face repository.")
 
 
-def get_leaderboard_data():
-    try:
-        feedback_data = load_content_from_hf()
-        feedback_df = pd.DataFrame(feedback_data)
-
-        # map vote to winner
-        feedback_df["winner"] = feedback_df["winner"].map(
-            {
-                "left": evalica.Winner.X,
-                "right": evalica.Winner.Y,
-                "tie": evalica.Winner.Draw,
-            }
-        )
-        avr_result = evalica.average_win_rate(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
-        bt_result = evalica.bradley_terry(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
-        newman_result = evalica.newman(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
-        eigen_result = evalica.eigen(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
-        elo_result = evalica.elo(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
-        pagerank_result = evalica.pagerank(
-            feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-        )
 [… the remaining deleted lines of the old body (table assembly, rounding, and ranking inside the try/except) were not recoverable from the page; only the tail of its rounding call survives below …]
-            "PageRank Score": 2,
-        }
-    )
+def get_leaderboard_data(feedback_entry=None):
+    # Load feedback data from the Hugging Face repository
+    feedback_data = load_content_from_hf()
+    feedback_df = pd.DataFrame(feedback_data)
+
+    # Concatenate the new feedback with the existing leaderboard data
+    if feedback_entry is not None:
+        feedback_df = pd.concat(
+            [feedback_df, pd.DataFrame([feedback_entry])], ignore_index=True
+        )
+
+    if feedback_df.empty:
+        return pd.DataFrame(
+            columns=[
+                "Rank",
+                "Model",
+                "Elo Score",
+                "Average Win Rate",
+                "Bradley-Terry Coefficient",
+                "Eigenvector Centrality Value",
+                "Newman Modularity Score",
+                "PageRank Score",
+            ]
+        )
+
+    # map vote to winner
+    feedback_df["winner"] = feedback_df["winner"].map(
+        {
+            "left": evalica.Winner.X,
+            "right": evalica.Winner.Y,
+            "tie": evalica.Winner.Draw,
+        }
+    )
+
+    # Calculate scores using various metrics
+    avr_result = evalica.average_win_rate(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+    bt_result = evalica.bradley_terry(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+    newman_result = evalica.newman(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+    eigen_result = evalica.eigen(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+    elo_result = evalica.elo(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+    pagerank_result = evalica.pagerank(
+        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+    )
+
+    # Combine all results into a single DataFrame
+    leaderboard_data = pd.DataFrame(
+        {
+            "Model": elo_result.scores.index,
+            "Elo Score": elo_result.scores.values,
+            "Average Win Rate": avr_result.scores.values * 100,
+            "Bradley-Terry Coefficient": bt_result.scores.values,
+            "Eigenvector Centrality Value": eigen_result.scores.values,
+            "Newman Modularity Score": newman_result.scores.values,
+            "PageRank Score": pagerank_result.scores.values,
+        }
+    )
+
+    # Round all numeric columns to two decimal places
+    leaderboard_data = leaderboard_data.round(
+        {
+            "Elo Score": 2,
+            "Average Win Rate": 2,
+            "Bradley-Terry Coefficient": 2,
+            "Eigenvector Centrality Value": 2,
+            "Newman Modularity Score": 2,
+            "PageRank Score": 2,
+        }
+    )
+
+    # Add a Rank column based on Elo scores
+    leaderboard_data["Rank"] = (
+        leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
+    )
+
+    # Place rank in the first column
+    leaderboard_data = leaderboard_data[
+        ["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
+    ]
     return leaderboard_data
 
 

@@ -536,7 +535,7 @@ with gr.Blocks() as app:
         login_button = gr.Button(
             "Sign in with Hugging Face", elem_id="oauth-button"
         )
-
+
         # NEW: Add a textbox for the repository URL above the user prompt
         repo_url = gr.Textbox(
             show_label=False,

@@ -544,7 +543,7 @@ with gr.Blocks() as app:
             lines=1,
             interactive=False,
         )
-
+
         # Components with initial non-interactive state
         shared_input = gr.Textbox(
            show_label=False,

@@ -648,7 +647,11 @@ with gr.Blocks() as app:
         repo_info, user_input, models_state, conversation_state
     ):
         # Combine repo-related information (if any) and user query into one prompt.
-        combined_user_input = […]
+        combined_user_input = (
+            f"Repo-related Information: {fetch_url_content(repo_info)}\n\n{user_input}"
+            if repo_info
+            else user_input
+        )
 
         # Dynamically select two random models
         if len(available_models) < 2:

@@ -775,7 +778,7 @@ with gr.Blocks() as app:
            print(f"Login failed: {e}")
            return (
                gr.update(visible=True),  # Keep the login button visible
-                gr.update(interactive=False),
+                gr.update(interactive=False),  # repo_url -> disable if login failed
                gr.update(interactive=False),  # Keep shared_input disabled
                gr.update(interactive=False),  # Keep send_first disabled
                gr.update(

@@ -791,7 +794,7 @@ with gr.Blocks() as app:
        inputs=[],
        outputs=[
            login_button,  # Hide the login button after successful login
-            repo_url,
+            repo_url,  # Keep this in sync with shared_input
            shared_input,  # Enable shared_input
            send_first,  # Enable send_first button
            feedback,  # Enable feedback radio buttons

@@ -923,10 +926,7 @@ with gr.Blocks() as app:
            "winner": winner_model,
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        }
-
-        # Concatenate the new feedback with the existing leaderboard data
-        leaderboard_data = pd.concat([get_leaderboard_data(), pd.DataFrame([feedback_entry])], ignore_index=True)
-
+
        # Save feedback back to the Hugging Face dataset
        save_content_to_hf(feedback_entry, "SE-Arena/votes")
 

@@ -942,9 +942,7 @@ with gr.Blocks() as app:
            gr.update(
                value="", interactive=True, visible=True
            ),  # Clear shared_input
-            gr.update(
-                value="", interactive=True, visible=True
-            ),  # Clear repo_url
+            gr.update(value="", interactive=True, visible=True),  # Clear repo_url
            gr.update(value="", visible=False),  # Hide user_prompt_md
            gr.update(value="", visible=False),  # Hide response_a_title
            gr.update(value="", visible=False),  # Hide response_b_title

@@ -958,9 +956,11 @@ with gr.Blocks() as app:
            gr.update(
                value="Can't Decide", interactive=True
            ),  # Reset feedback selection
-            […]
+            get_leaderboard_data(feedback_entry),  # Updated leaderboard data
            gr.update(visible=True),  # Show the thanks message
-            gr.update(
+            gr.update(
+                value="", interactive=True, visible=True
+            ),  # Show the repo-related url message
        )
 
        # Update the click event for the submit feedback button

@@ -969,7 +969,7 @@ with gr.Blocks() as app:
        inputs=[feedback, models_state, conversation_state],
        outputs=[
            shared_input,  # Reset shared_input
-            repo_url,
+            repo_url,  # Show the repo-related URL message
            user_prompt_md,  # Hide user_prompt_md
            response_a_title,  # Hide Model A title
            response_b_title,  # Hide Model B title
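For context on the refactor above: the leaderboard is no longer kept in a module-level leaderboard_data global that the feedback handler extended with pd.concat; instead the handler passes the freshly submitted vote into get_leaderboard_data(feedback_entry), which recomputes every ranking from the stored votes plus that pending entry. The sketch below walks through that recomputation on toy data. The model names and feedback records are assumptions made up for illustration; the evalica calls, the winner mapping, and the column handling mirror the patch.

# A minimal sketch of the new update path (toy data, not the Space's real votes).
import evalica
import pandas as pd

# Hypothetical feedback records in the per-vote shape the Space stores.
feedback_data = [
    {"left": "model-a", "right": "model-b", "winner": "left"},
    {"left": "model-b", "right": "model-c", "winner": "tie"},
    {"left": "model-c", "right": "model-a", "winner": "right"},
]
# A vote that was just submitted and has not been persisted yet.
feedback_entry = {"left": "model-a", "right": "model-b", "winner": "left"}

# Fold the pending vote into the history, as get_leaderboard_data(feedback_entry) now does.
feedback_df = pd.concat(
    [pd.DataFrame(feedback_data), pd.DataFrame([feedback_entry])], ignore_index=True
)
feedback_df["winner"] = feedback_df["winner"].map(
    {"left": evalica.Winner.X, "right": evalica.Winner.Y, "tie": evalica.Winner.Draw}
)

# Recompute two of the patch's metrics; each result exposes a pandas Series in .scores.
elo_result = evalica.elo(feedback_df["left"], feedback_df["right"], feedback_df["winner"])
avr_result = evalica.average_win_rate(
    feedback_df["left"], feedback_df["right"], feedback_df["winner"]
)

leaderboard = pd.DataFrame(
    {
        "Model": elo_result.scores.index,
        "Elo Score": elo_result.scores.values,
        "Average Win Rate": avr_result.scores.values * 100,
    }
).round({"Elo Score": 2, "Average Win Rate": 2})
leaderboard["Rank"] = leaderboard["Elo Score"].rank(ascending=False).astype(int)
print(leaderboard.sort_values("Rank").to_string(index=False))

Keeping the ranking math behind a single function that accepts the pending vote means the Gradio callback only hands over the new entry, and there is no global table that can drift out of sync with the votes stored on the Hub.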