fix the sync bug

Files changed:
- app.py (+92 -107)
- context_window.json (+2 -3)
app.py (CHANGED)

@@ -28,6 +28,9 @@ openai_client = OpenAI(api_key=api_key, base_url=base_url)
 # Timeout in seconds for model responses
 TIMEOUT = 90
 
+# Leaderboard data, cached at module level and filled lazily
+leaderboard_data = None
+
 # Hint string constant
 SHOW_HINT_STRING = True  # Set to False to hide the hint string altogether
 HINT_STRING = "Once signed in, your votes will be recorded securely."
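The new `leaderboard_data = None` global is the heart of the sync fix: the leaderboard is computed once and then reused instead of being rebuilt on every request. A minimal, self-contained sketch of the pattern, with illustrative names (`load_feedback` and `get_data` stand in for the app's real functions):

import pandas as pd

_cache = None  # module-level slot, mirrors leaderboard_data above

def load_feedback():
    # Hypothetical stand-in for the app's load_content_from_hf()
    return [{"left": "model_a", "right": "model_b", "winner": "left"}]

def get_data():
    global _cache  # without this, the assignment below would bind a local
    if _cache is None:
        _cache = pd.DataFrame(load_feedback())
    return _cache

print(get_data() is get_data())  # True: loaded once, then served from cache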
@@ -282,7 +285,7 @@ def chat_with_models(
     return formatted_response
 
 
-def save_content_to_hf(content, repo_name):
+def save_content_to_hf(feedback_data, repo_name):
     """
     Save feedback content to Hugging Face repository organized by month and year.
 
@@ -291,13 +294,8 @@ def save_content_to_hf(content, repo_name):
         month_year (str): Year and month string in the format "YYYY_MM".
        repo_name (str): Hugging Face repository name.
     """
-    # Ensure the user is authenticated with HF
-    token = HfFolder.get_token()
-    if token is None:
-        raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
-
     # Serialize the content to JSON and encode it as bytes
-    json_content = json.dumps(content, indent=4).encode("utf-8")
+    json_content = json.dumps(feedback_data, indent=4).encode("utf-8")
 
     # Create a binary file-like object
     file_like_object = io.BytesIO(json_content)

@@ -309,6 +307,11 @@ def save_content_to_hf(content, repo_name):
     # Define the path in the repository
     filename = f"{month_year}/{day_hour_minute_second}.json"
 
+    # Ensure the user is authenticated with HF
+    token = HfFolder.get_token()
+    if token is None:
+        raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.")
+
     # Upload to Hugging Face repository
     upload_file(
         path_or_fileobj=file_like_object,
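The authentication check now runs just before the upload, but the save pipeline itself is unchanged: serialize the dict to JSON bytes, wrap it in `io.BytesIO`, and pass that as `path_or_fileobj` to `upload_file`, which accepts file-like objects as well as paths. A sketch of the serialization step (the sample entry is illustrative):

import io
import json

feedback_entry = {"left": "model_a", "right": "model_b", "winner": "left"}

# JSON -> UTF-8 bytes -> in-memory file object, the shape upload_file() accepts
json_content = json.dumps(feedback_entry, indent=4).encode("utf-8")
file_like_object = io.BytesIO(json_content)

print(file_like_object.read().decode("utf-8"))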
@@ -340,15 +343,15 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
     repo_files = api.list_repo_files(repo_id=repo_name, repo_type="dataset")
 
     # Filter files by current year and month
-    feedback_files = [file for file in repo_files if year_month in file]
+    leaderboard_files = [file for file in repo_files if year_month in file]
 
-    if not feedback_files:
+    if not leaderboard_files:
         raise FileNotFoundError(
             f"No feedback files found for {year_month} in {repo_name}."
         )
 
     # Download and aggregate data
-    for file in feedback_files:
+    for file in leaderboard_files:
         local_path = hf_hub_download(
             repo_id=repo_name, filename=file, repo_type="dataset"
         )
@@ -366,100 +369,85 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
 
 
 def get_leaderboard_data():
-    # Load feedback data from the Hugging Face repository
-    try:
-        feedback_data = load_content_from_hf()
-        feedback_df = pd.DataFrame(feedback_data)
-    except:
-        # If no feedback exists, return an empty DataFrame
-        return pd.DataFrame(
-            columns=[
-                "Rank",
-                "Model",
-                "Elo Score",
-                "Average Win Rate",
-                "Bradley-Terry Coefficient",
-                "Eigenvector Centrality Value",
-                "Newman Modularity Score",
-                "PageRank Score",
-            ]
-        )
-
-    feedback_df["winner"] = feedback_df["winner"].map(
-        {
-            "left": evalica.Winner.X,
-            "right": evalica.Winner.Y,
-            "tie": evalica.Winner.Draw,
-        }
-    )
-
-    # Calculate scores using various metrics
-    avr_result = evalica.average_win_rate(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    bt_result = evalica.bradley_terry(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    newman_result = evalica.newman(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    eigen_result = evalica.eigen(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    elo_result = evalica.elo(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    pagerank_result = evalica.pagerank(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-
-    # Combine all results into a single DataFrame
-    ranking_df = pd.DataFrame(
-        {
-            "Model": elo_result.scores.index,
-            "Elo Score": elo_result.scores.values,
-            "Average Win Rate": avr_result.scores.values * 100,
-            "Bradley-Terry Coefficient": bt_result.scores.values,
-            "Eigenvector Centrality Value": eigen_result.scores.values,
-            "PageRank Score": pagerank_result.scores.values,
-            "Newman Modularity Score": newman_result.scores.values,
-        }
-    )
 
-    # Add a Rank column based on Elo scores
-    ranking_df["Rank"] = (
-        ranking_df["Elo Score"].rank(ascending=False).astype(int)
-    )
 
-    # Round all numeric columns to two decimal places
-    ranking_df = ranking_df.round(
-        {
-            "Elo Score": 2,
-            "Average Win Rate": 2,
-            "Bradley-Terry Coefficient": 2,
-            "Eigenvector Centrality Value": 2,
-            "Newman Modularity Score": 2,
-            "PageRank Score": 2,
-        }
-    )
 
-    # Sort the leaderboard by Elo score
-    ranking_df = ranking_df.sort_values("Elo Score", ascending=False)
-
-    ranking_df = ranking_df[
-        [
-            "Rank",
-            "Model",
-            "Elo Score",
-            "Average Win Rate",
-            "Bradley-Terry Coefficient",
-            "Eigenvector Centrality Value",
-            "Newman Modularity Score",
-            "PageRank Score",
-        ]
-    ]
 
-    return ranking_df
+    global leaderboard_data  # required so the assignments below update the module-level cache
+    if leaderboard_data is None:
+        # Load feedback data from the Hugging Face repository
+        try:
+            feedback_data = load_content_from_hf()
+            feedback_df = pd.DataFrame(feedback_data)
+
+            # Map vote labels to evalica winners
+            feedback_df["winner"] = feedback_df["winner"].map(
+                {
+                    "left": evalica.Winner.X,
+                    "right": evalica.Winner.Y,
+                    "tie": evalica.Winner.Draw,
+                }
+            )
+
+            # Calculate scores using various metrics
+            avr_result = evalica.average_win_rate(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            bt_result = evalica.bradley_terry(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            newman_result = evalica.newman(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            eigen_result = evalica.eigen(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            elo_result = evalica.elo(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+            pagerank_result = evalica.pagerank(
+                feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+            )
+
+            # Combine all results into a single DataFrame
+            leaderboard_data = pd.DataFrame(
+                {
+                    "Model": elo_result.scores.index,
+                    "Elo Score": elo_result.scores.values,
+                    "Average Win Rate": avr_result.scores.values * 100,
+                    "Bradley-Terry Coefficient": bt_result.scores.values,
+                    "Eigenvector Centrality Value": eigen_result.scores.values,
+                    "Newman Modularity Score": newman_result.scores.values,
+                    "PageRank Score": pagerank_result.scores.values,
+                }
+            )
+
+            # Round all numeric columns to two decimal places
+            leaderboard_data = leaderboard_data.round(
+                {
+                    "Elo Score": 2,
+                    "Average Win Rate": 2,
+                    "Bradley-Terry Coefficient": 2,
+                    "Eigenvector Centrality Value": 2,
+                    "Newman Modularity Score": 2,
+                    "PageRank Score": 2,
+                }
+            )
+
+            # Add a Rank column based on Elo scores
+            leaderboard_data["Rank"] = (
+                leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
+            )
+        except:
+            # If no feedback exists, return an empty DataFrame
+            return pd.DataFrame(
+                columns=[
+                    "Rank",
+                    "Model",
+                    "Elo Score",
+                    "Average Win Rate",
+                    "Bradley-Terry Coefficient",
+                    "Eigenvector Centrality Value",
+                    "Newman Modularity Score",
+                    "PageRank Score",
+                ]
+            )
+    return leaderboard_data
 
 
 # Function to enable or disable submit buttons based on textbox content
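For reference, the evalica calls above all share one shape: two sequences of contestants plus a sequence of `Winner` values, returning a result whose `.scores` is a pandas Series indexed by model name (this is exactly how the diff consumes them). A minimal sketch with made-up vote data:

import evalica
import pandas as pd

# Made-up head-to-head votes between two models
df = pd.DataFrame(
    {
        "left": ["model_a", "model_a", "model_b"],
        "right": ["model_b", "model_b", "model_a"],
        "winner": ["left", "right", "tie"],
    }
)
df["winner"] = df["winner"].map(
    {"left": evalica.Winner.X, "right": evalica.Winner.Y, "tie": evalica.Winner.Draw}
)

# Same call shape the app uses; .scores is a Series keyed by model
result = evalica.elo(df["left"], df["right"], df["winner"])
print(result.scores.sort_values(ascending=False))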
@@ -916,9 +904,6 @@ with gr.Blocks() as app:
     )
 
     def submit_feedback(vote, models_state, conversation_state):
-        # Get current timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
         # Map vote to actual model names
         match vote:
             case "Model A":
@@ -933,9 +918,12 @@ with gr.Blocks() as app:
             "left": models_state["Model A"],
             "right": models_state["Model B"],
             "winner": winner_model,
-            "timestamp": timestamp,
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
         }
 
+        # Concatenate the new feedback with the existing leaderboard data
+        leaderboard_data = pd.concat([get_leaderboard_data(), pd.DataFrame([feedback_entry])], ignore_index=True)
+
         # Save feedback back to the Hugging Face dataset
         save_content_to_hf(feedback_entry, "SE-Arena/votes")
@@ -946,9 +934,6 @@ with gr.Blocks() as app:
         models_state.clear()
         conversation_state.clear()
 
-        # Recalculate leaderboard
-        leaderboard_data = get_leaderboard_data()
-
         # Adjust output count to match the interface definition
         return (
             gr.update(
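Note on the last two hunks: the timestamp is now generated inline when the feedback entry is built, and the handler no longer recomputes the full leaderboard after every vote. The new `pd.concat` line binds a local `leaderboard_data`, so updating the module-level cache from here would need the same `global` declaration used in `get_leaderboard_data`. For reference, the timestamp format the entry uses:

from datetime import datetime

# Same compact format the app uses for feedback timestamps and filenames
print(datetime.now().strftime("%Y%m%d_%H%M%S"))  # e.g. 20250101_093045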
context_window.json (CHANGED)

@@ -14,9 +14,8 @@
     "llama-3.1-405b": 128000,
     "llama-3.1-70b": 128000,
     "llama-3.3-70b": 128000,
-    "o1": …,
-    "o1-mini": …,
-    "Qwen2-72B-Instruct": 131072,
+    "o1": 128000,
+    "o1-mini": 128000,
     "Qwen2.5-32B-Instruct": 131072,
     "qwen2.5-72b": 32768,
     "Qwen2.5-72B-Instruct": 131072,
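A quick sanity check of the repaired entries (file path assumed relative to the Space root):

import json

# Path assumed; adjust to wherever context_window.json lives in the Space
with open("context_window.json") as f:
    context_windows = json.load(f)

print(context_windows["o1"])       # 128000 after this commit
print(context_windows["o1-mini"])  # 128000 after this commit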