add Conversation Efficiency Index

app.py CHANGED

@@ -296,12 +296,12 @@ def format_conversation_history(conversation_history):
     return formatted_text


-def save_content_to_hf(feedback_data, repo_name, folder_name, file_name):
+def save_content_to_hf(vote_data, repo_name, folder_name, file_name):
     """
     Save feedback content to Hugging Face repository organized by quarter.
     """
     # Serialize the content to JSON and encode it as bytes
-    json_content = json.dumps(feedback_data, indent=4).encode("utf-8")
+    json_content = json.dumps(vote_data, indent=4).encode("utf-8")

     # Create a binary file-like object
     file_like_object = io.BytesIO(json_content)

@@ -334,7 +334,7 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
     Returns:
         list: Aggregated feedback data read from the repository.
     """
-    feedback_data = []
+    vote_data = []

     # Get the current year and quarter
     now = datetime.now()
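
The quarter folder consulted here is derived from `now`; a plausible shape for that key, assuming a `YYYY_QN` convention (the exact format is outside this hunk and may differ):

```python
from datetime import datetime

# Assumed quarter-folder naming; the Space's real format may differ.
now = datetime.now()
folder_name = f"{now.year}_Q{(now.month - 1) // 3 + 1}"  # e.g. "2025_Q1"
```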

@@ -354,35 +354,31 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
         )
         with open(local_path, "r") as f:
             data = json.load(f)
-            feedback_data.append(data)
-        return feedback_data
+            data["timestamp"] = file.split("/")[-1].split(".")[0]
+            vote_data.append(data)
+        return vote_data

     except:
         raise Exception("Error loading feedback data from Hugging Face repository.")


-def get_leaderboard_data(feedback_entry=None):
+def get_leaderboard_data(vote_entry=None):
     # Load feedback data from the Hugging Face repository
-    feedback_data = load_content_from_hf()
-    feedback_df = pd.DataFrame(feedback_data)
-
-    # Load conversation data from the Hugging Face repository
-    conversation_data = load_content_from_hf("SE-Arena/conversations")
-    conversation_df = pd.DataFrame(conversation_data)
+    vote_data = load_content_from_hf()
+    vote_df = pd.DataFrame(vote_data)

     # Concatenate the new feedback with the existing leaderboard data
-    if feedback_entry is not None:
-        feedback_df = pd.concat(
-            [feedback_df, pd.DataFrame([feedback_entry])], ignore_index=True
-        )
+    if vote_entry is not None:
+        vote_df = pd.concat([vote_df, pd.DataFrame([vote_entry])], ignore_index=True)

-    if feedback_df.empty:
+    if vote_df.empty:
         return pd.DataFrame(
             columns=[
                 "Rank",
                 "Model",
                 "Elo Score",
-                "…
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                 "Average Win Rate",
                 "Bradley-Terry Coefficient",
                 "Eigenvector Centrality Value",

@@ -392,7 +388,7 @@ def get_leaderboard_data(feedback_entry=None):
     )

     # map vote to winner
-    feedback_df["winner"] = feedback_df["winner"].map(
+    vote_df["winner"] = vote_df["winner"].map(
         {
             "left": evalica.Winner.X,
             "right": evalica.Winner.Y,

@@ -402,51 +398,76 @@ def get_leaderboard_data(feedback_entry=None):

     # Calculate scores using various metrics
     avr_result = evalica.average_win_rate(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
     bt_result = evalica.bradley_terry(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    newman_result = evalica.newman(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    eigen_result = evalica.eigen(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
-    )
-    elo_result = evalica.elo(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
+    newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
+    eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
+    elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
     pagerank_result = evalica.pagerank(
-        feedback_df["left"], feedback_df["right"], feedback_df["winner"]
+        vote_df["left"], vote_df["right"], vote_df["winner"]
     )
+
+    # Load conversation data from the Hugging Face repository
+    conversation_data = load_content_from_hf("SE-Arena/conversations")
+    conversation_df = pd.DataFrame(conversation_data)

-    # …
-        "…
-    )
-    # …
-    for model …
+    # Merge vote data with conversation data
+    all_df = pd.merge(
+        vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
+    )
+
+    # Calculate Conversation Efficiency Indexes more efficiently
+    # Create a dictionary to store accumulated scores and counts for each model
+    model_rcs_sum = {}
+    model_rcs_max = {}
+
+    # Process each row once and accumulate scores
+    for _, row in all_df.iterrows():
+        # Determine scores based on winner
+        match row["winner"]:
+            case evalica.Winner.X:
+                left_score = 1.0
+                right_score = -1.0
+            case evalica.Winner.Y:
+                left_score = -1.0
+                right_score = 1.0
+            case _:  # Draw
+                left_score = 0.1
+                right_score = 0.1
+
+        # Count rounds for each side
+        left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
+        right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
+
+        left_model = row["left"]
+        right_model = row["right"]
+
+        model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
+        model_rcs_max[right_model] = model_rcs_max.get(right_model, 0) + 1.0 / right_round
+
+        # Calculate per-round scores
+        model_rcs_sum[left_model] = model_rcs_sum.get(left_model, 0) + left_score / left_round
+        model_rcs_sum[right_model] = model_rcs_sum.get(right_model, 0) + right_score / right_round
+
+    cei_result = {model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum}
+    cei_result = pd.Series({model: cei_result[model] for model in elo_result.scores.index})
+
+    self_matches = vote_df[vote_df["left"] == vote_df["right"]]
+    model_matches = self_matches.groupby("left")
+    draw_counts = model_matches["winner"].apply(lambda x: (x == evalica.Winner.Draw).sum())
+    total_counts = model_matches.size()
+    mcs_result = (draw_counts / total_counts).round(2).reindex(elo_result.scores.index, fill_value="N/A")

     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
             "Elo Score": elo_result.scores.values,
-            "…
+            "Conversation Efficiency Index": cei_result.values,
+            "Model Consistency Score": mcs_result.values,
             "Average Win Rate": avr_result.scores.values,
             "Bradley-Terry Coefficient": bt_result.scores.values,
             "Eigenvector Centrality Value": eigen_result.scores.values,

@@ -459,6 +480,7 @@ def get_leaderboard_data(feedback_entry=None):
     leaderboard_data = leaderboard_data.round(
         {
             "Elo Score": 2,
+            "Conversation Efficiency Index": 2,
             "Average Win Rate": 2,
             "Bradley-Terry Coefficient": 2,
             "Eigenvector Centrality Value": 2,

@@ -509,12 +531,14 @@ with gr.Blocks() as app:
                 "Rank",
                 "Model",
                 "Elo Score",
-                "…
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
             ],
             search_columns=["Model"],
             filter_columns=[
                 "Elo Score",
-                "…
+                "Conversation Efficiency Index",
+                "Model Consistency Score",
                 "Average Win Rate",
                 "Bradley-Terry Coefficient",
                 "Eigenvector Centrality Value",

@@ -1117,7 +1141,7 @@ with gr.Blocks() as app:
         winner_model = "tie"

         # Create feedback entry
-        feedback_entry = {
+        vote_entry = {
             "left": models_state["left"],
             "right": models_state["right"],
             "winner": winner_model,

@@ -1130,7 +1154,7 @@ with gr.Blocks() as app:
         file_name = now.strftime("%Y%m%d_%H%M%S")

         # Save feedback back to the Hugging Face dataset
-        save_content_to_hf(feedback_entry, "SE-Arena/votes", folder_name, file_name)
+        save_content_to_hf(vote_entry, "SE-Arena/votes", folder_name, file_name)

         conversation_state["right_chat"][0]["content"] = conversation_state[
             "right_chat"

@@ -1175,7 +1199,7 @@ with gr.Blocks() as app:
             gr.update(
                 value="Can't Decide", interactive=True
             ), # [10] Reset feedback radio selection
-            get_leaderboard_data(feedback_entry), # [11] Updated leaderboard data
+            get_leaderboard_data(vote_entry), # [11] Updated leaderboard data
             gr.update(
                 visible=True
             ), # [12] Show the thanks_message markdown component