Update app.py
app.py
CHANGED
@@ -419,63 +419,71 @@ def get_leaderboard_data(vote_entry=None):
         vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
     )
 
-    #
-
-    model_rcs_sum = {}
-    model_rcs_max = {}
+    # Create dictionaries to track scores and match counts
+    model_stats = {}
 
     # Process each row once and accumulate scores
     for _, row in all_df.iterrows():
-
+        left_model = row["left"]
+        right_model = row["right"]
+        is_self_match = left_model == right_model
+
+        # Initialize dictionaries for models if they don't exist yet
+        for model in [left_model, right_model]:
+            if model not in model_stats:
+                model_stats[model] = {
+                    "cei_sum": 0,  # Sum of per-round scores
+                    "cei_max": 0,  # Sum of per-round maximums
+                    "self_matches": 0,  # Count of self-matches
+                    "self_draws": 0  # Count of draws in self-matches
+                }
+
+        # Handle self-matches (same model on both sides)
+        if is_self_match:
+            model_stats[left_model]["self_matches"] += 1
+            if row["winner"] == evalica.Winner.Draw:
+                model_stats[left_model]["self_draws"] += 1
+            continue
+
+        # Determine scores based on winner for competitive matches
         match row["winner"]:
             case evalica.Winner.X:
-                left_score = 1
-                right_score = -1
+                left_score = 1
+                right_score = -1
             case evalica.Winner.Y:
-                left_score = -1
-                right_score = 1
+                left_score = -1
+                right_score = 1
             case _:  # Draw
                 left_score = 0.1
                 right_score = 0.1
-
+
         # Count rounds for each side
         left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
         right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
-    # ... 24 removed lines (old lines 444-467) not legible in this capture ...
-    self_matches = vote_df[vote_df["left"] == vote_df["right"]]
-    model_matches = self_matches.groupby("left")
-    draw_counts = model_matches["winner"].apply(
-        lambda x: (x == evalica.Winner.Draw).sum()
-    )
-    total_counts = model_matches.size()
-    mcs_result = (
-        (draw_counts / total_counts)
-        .round(2)
-        .reindex(elo_result.scores.index, fill_value="N/A")
-    )
+
+        # Update CEI metrics
+        model_stats[left_model]["cei_max"] += 1 / left_round
+        model_stats[right_model]["cei_max"] += 1 / right_round
+        model_stats[left_model]["cei_sum"] += left_score / left_round
+        model_stats[right_model]["cei_sum"] += right_score / right_round
+
+    # Calculate CEI results
+    cei_result = {}
+    for model in elo_result.scores.index:
+        if model in model_stats and model_stats[model]["cei_max"] > 0:
+            cei_result[model] = round(model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2)
+        else:
+            cei_result[model] = "N/A"
+    cei_result = pd.Series(cei_result)
+
+    # Calculate MCS results
+    mcs_result = {}
+    for model in elo_result.scores.index:
+        if model in model_stats and model_stats[model]["self_matches"] > 0:
+            mcs_result[model] = round(model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2)
+        else:
+            mcs_result[model] = "N/A"
+    mcs_result = pd.Series(mcs_result)
 
     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
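
This rewrite folds the CEI and self-match bookkeeping into a single pass over all_df, replacing the old model_rcs_sum/model_rcs_max dictionaries and the separate pandas groupby over vote_df. CEI as computed here is a rounds-weighted average score, so wins in short conversations count for more, and it tops out at 1.0 for a model that wins every competitive match. A minimal runnable sketch of the arithmetic the loop accumulates (toy data; the Winner stub and model names are illustrative stand-ins, not part of the app):

from enum import Enum

class Winner(Enum):  # stand-in for evalica.Winner
    X = "x"
    Y = "y"
    Draw = "draw"

# Toy matches: (winner, left model, right model, left rounds, right rounds)
matches = [
    (Winner.X, "model-a", "model-b", 2, 3),     # model-a wins in 2 rounds
    (Winner.Y, "model-a", "model-b", 4, 1),     # model-b wins in 1 round
    (Winner.Draw, "model-a", "model-b", 3, 3),  # draw: both sides get 0.1
]

stats = {m: {"cei_sum": 0.0, "cei_max": 0.0} for m in ("model-a", "model-b")}

for winner, left, right, left_round, right_round in matches:
    if winner is Winner.X:
        left_score, right_score = 1, -1
    elif winner is Winner.Y:
        left_score, right_score = -1, 1
    else:  # draw
        left_score = right_score = 0.1

    # Best achievable contribution per match is 1/rounds; actual is score/rounds.
    stats[left]["cei_max"] += 1 / left_round
    stats[right]["cei_max"] += 1 / right_round
    stats[left]["cei_sum"] += left_score / left_round
    stats[right]["cei_sum"] += right_score / right_round

for model, s in stats.items():
    print(model, round(s["cei_sum"] / s["cei_max"], 2))  # model-a: 0.26, model-b: 0.42
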
@@ -496,7 +504,6 @@ def get_leaderboard_data(vote_entry=None):
     leaderboard_data = leaderboard_data.round(
         {
             "Elo Score": 2,
-            "Conversation Efficiency Index": 2,
             "Average Win Rate": 2,
             "Bradley-Terry Coefficient": 2,
             "Eigenvector Centrality Value": 2,
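
The second hunk drops "Conversation Efficiency Index" from the round() mapping, presumably because cei_result now arrives pre-rounded and can contain the string "N/A", so a numeric rounding rule no longer applies to that column. The MCS half of the new bookkeeping is simply the fraction of self-matches (same model on both sides) that end in a draw; a toy sketch with illustrative names and plain strings standing in for evalica.Winner:

# Toy records: (left, right, winner); left == right marks a self-match.
records = [
    ("model-a", "model-a", "draw"),
    ("model-a", "model-a", "x"),
    ("model-a", "model-a", "draw"),
    ("model-a", "model-b", "x"),  # competitive match, ignored for MCS
]

stats = {}
for left, right, winner in records:
    if left != right:
        continue  # MCS only counts self-matches
    s = stats.setdefault(left, {"self_matches": 0, "self_draws": 0})
    s["self_matches"] += 1
    if winner == "draw":
        s["self_draws"] += 1

for model, s in stats.items():
    # Draw rate over self-matches: 2 of 3 here -> 0.67
    print(model, round(s["self_draws"] / s["self_matches"], 2))
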