Spaces:

bigcode
/

arena

Running

File size: 12,051 Bytes

ba99c06

"""
Elo Rating Calculation Module for BigCodeArena
Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation
"""

import math
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import yaml
import os


def load_model_metadata():
    """Read organization and license details for each model from api_config.yaml.

    The file is looked up in the same directory as this module. Every
    top-level entry whose value is a mapping is registered under two keys:
    its ``model`` field (defaulting to the entry key) and the entry key
    itself. Missing ``organization`` / ``license`` fields become "Unknown".

    Returns:
        dict: ``{name: {"organization": ..., "license": ...}}``; an empty
        dict (after printing a warning) if anything goes wrong while
        reading or parsing the file.
    """
    try:
        module_dir = os.path.dirname(__file__)
        yaml_path = os.path.join(module_dir, "api_config.yaml")
        with open(yaml_path, "r", encoding="utf-8") as handle:
            entries = yaml.safe_load(handle)

        lookup = {}
        for entry_key, entry in entries.items():
            # Non-mapping entries carry no per-model settings
            if not isinstance(entry, dict):
                continue

            # Display name first, then the config key, so both resolve;
            # each alias gets its own dict so they are never shared.
            for alias in (entry.get("model", entry_key), entry_key):
                lookup[alias] = {
                    "organization": entry.get("organization", "Unknown"),
                    "license": entry.get("license", "Unknown"),
                }

        return lookup
    except Exception as e:
        print(f"Warning: Could not load model metadata: {e}")
        return {}


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
    """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation.

    Battles are aggregated per ordered (model_a, model_b) pair and fed to an
    unregularised, intercept-free logistic regression whose coefficients are
    the model strengths. Features are +/- ln(BASE) so that a rating gap of
    SCALE points corresponds to BASE:1 odds, the same convention used by the
    expected-score formula in compute_online_elo.

    Args:
        df (pd.DataFrame): Battles with columns 'model_a', 'model_b', 'winner'.
            Recognised winner values are "model_a", "model_b", "tie" and
            "tie (bothbad)"; rows with any other value are not counted.
        SCALE (float): Multiplier applied to the fitted coefficients.
        BASE (float): Logarithm base of the rating scale.
        INIT_RATING (float): Offset added to every rating; also the rating
            returned for every model when no fit is possible.
        sample_weight: Accepted for signature compatibility; not used by
            this implementation.

    Returns:
        pd.Series: Rating per model, sorted from highest to lowest. If there
        is no usable data, or all recorded outcomes point the same way, every
        model gets INIT_RATING.
    """
    # Get all unique models
    all_models = sorted(set(df["model_a"].tolist() + df["model_b"].tolist()))

    # One float table per outcome type, indexed [model_a, model_b]
    ptbl_a_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
    ptbl_b_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
    ptbl_tie = pd.DataFrame(0.0, index=all_models, columns=all_models)

    # Count decisive results, keeping the (model_a, model_b) orientation
    for outcome, table in (("model_a", ptbl_a_win), ("model_b", ptbl_b_win)):
        decided = df[df["winner"] == outcome]
        if decided.empty:
            continue
        pair_counts = decided.groupby(["model_a", "model_b"]).size()
        for (model_a, model_b), count in pair_counts.items():
            table.at[model_a, model_b] += count

    # Count ties: half a tie is credited to each orientation of the pair.
    # Accumulate (+=) rather than assign, otherwise ties recorded as (a, b)
    # are overwritten by ties recorded as (b, a) for the same two models.
    tied = df[df["winner"].isin(["tie", "tie (bothbad)"])]
    if not tied.empty:
        tie_counts = tied.groupby(["model_a", "model_b"]).size()
        for (model_a, model_b), count in tie_counts.items():
            ptbl_tie.at[model_a, model_b] += count * 0.5
            ptbl_tie.at[model_b, model_a] += count * 0.5

    models = pd.Series(np.arange(len(all_models)), index=all_models)
    p = len(models)
    log_base = math.log(BASE)

    # Create training data for logistic regression
    X = []
    Y = []
    sample_weights = []

    for model_a in all_models:
        for model_b in all_models:
            if model_a == model_b:
                continue

            a_wins = ptbl_a_win.at[model_a, model_b]
            b_wins = ptbl_b_win.at[model_a, model_b]
            tie_weight = ptbl_tie.at[model_a, model_b]

            if a_wins + b_wins + tie_weight == 0:
                continue

            # Feature vector: +ln(BASE) for model_a, -ln(BASE) for model_b
            x = np.zeros(p)
            x[models[model_a]] = +log_base
            x[models[model_b]] = -log_base

            # One weighted row per outcome; a tie is split evenly between
            # "model_a wins" (1) and "model_a loses" (0).
            weighted_outcomes = (
                (1, a_wins),
                (0, b_wins),
                (1, tie_weight / 2),
                (0, tie_weight / 2),
            )
            for label, weight in weighted_outcomes:
                if weight > 0:
                    X.append(x)
                    Y.append(label)
                    sample_weights.append(weight)

    if len(X) == 0 or len(set(Y)) < 2:
        # Not enough data or no variation in outcomes
        return pd.Series({model: INIT_RATING for model in all_models}).sort_values(ascending=False)

    X = np.array(X)
    Y = np.array(Y)
    sample_weights = np.array(sample_weights)

    # Fit logistic regression (no intercept, no regularisation)
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6, max_iter=1000)
    lr.fit(X, Y, sample_weight=sample_weights)

    # Convert coefficients to Elo ratings
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
    """Get bootstrap results for confidence interval calculation.

    Each round resamples ``battles`` with replacement (same number of rows)
    and applies ``func_compute_elo`` to the sample. Rounds whose computation
    raises are skipped; if any were skipped, a single warning with the count
    and the most recent error is printed so failures are not invisible.

    Args:
        battles (pd.DataFrame): Battle records to resample.
        func_compute_elo (callable): Called with one resampled DataFrame;
            its result becomes one row of the output.
        num_round (int): Number of bootstrap rounds.

    Returns:
        pd.DataFrame: One row per successful round, columns ordered by
        descending median value. Empty DataFrame if no round succeeded.
    """
    rows = []
    failed_rounds = 0
    last_error = None

    for _ in tqdm(range(num_round), desc="bootstrap"):
        # Bootstrap sample with replacement
        bootstrap_sample = battles.sample(frac=1.0, replace=True)
        try:
            rows.append(func_compute_elo(bootstrap_sample))
        except Exception as e:
            # Best effort: skip this round but remember that it failed
            failed_rounds += 1
            last_error = e

    if failed_rounds:
        print(
            f"Warning: {failed_rounds}/{num_round} bootstrap rounds failed "
            f"and were skipped (last error: {last_error})"
        )

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    # Sort columns by median Elo score (descending)
    return df[df.median().sort_values(ascending=False).index]


def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Compute Elo ratings with sequential (online) updates, one battle at a time.

    Legacy function kept for compatibility. Battles are processed in row
    order, so the result depends on that order.

    Args:
        battles (pd.DataFrame): Battles with columns 'model_a', 'model_b',
            'winner'. Winner must be "model_a", "model_b", "tie" or
            "tie (bothbad)".
        K (float): Update step size.
        SCALE (float): Rating gap that corresponds to BASE:1 expected odds.
        BASE (float): Base of the expected-score formula.
        INIT_RATING (float): Rating assigned to a model on first appearance.

    Returns:
        defaultdict: Rating per model. If "llama-13b" was rated, all ratings
        are shifted so that it sits at exactly 800.

    Raises:
        ValueError: If a row has an unrecognised winner value.
    """
    rating = defaultdict(lambda: INIT_RATING)
    # Score obtained by model_a for each recognised vote
    score_for_a = {"model_a": 1, "model_b": 0, "tie": 0.5, "tie (bothbad)": 0.5}

    records = battles[["model_a", "model_b", "winner"]].itertuples(index=False, name=None)
    for model_a, model_b, winner in records:
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
        if winner not in score_for_a:
            raise ValueError(f"unexpected vote {winner}")
        sa = score_for_a[winner]
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    # calibrate llama-13b to 800 if it exists
    if "llama-13b" in rating:
        delta = 800 - rating["llama-13b"]
        # Shift every rated model, including those only ever seen as
        # model_b, so relative gaps are preserved for all of them.
        for model in list(rating):
            rating[model] += delta

    return rating


def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
    """
    Compute Elo ratings and bootstrap confidence-interval margins.

    With fewer than 2 battles every model gets a rating of 1000 and a margin
    of 0. Otherwise ratings come from the Bradley-Terry fit; with at least 10
    battles, 100 bootstrap rounds provide a margin equal to half the width of
    the 2.5%-97.5% quantile range. If the Bradley-Terry path raises, the
    online Elo calculation is used instead and all margins are 0.

    Args:
        battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
        vote_counts (dict): Vote counts per model (not used by this function)

    Returns:
        tuple: (elo_ratings, confidence_intervals)
    """
    battle_total = len(battles_df)

    # Too little data for a fit: hand back neutral defaults straight away
    if battle_total < 2:
        participants = set(
            battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
        )
        default_ratings = pd.Series({name: 1000 for name in participants})
        return default_ratings, {name: 0 for name in participants}

    margins = {}

    def _zero_out(names):
        # Record "no interval available" for each given model
        for name in names:
            margins[name] = 0

    try:
        # Bradley-Terry ratings are the primary result
        ratings = compute_mle_elo(battles_df)

        if battle_total < 10:
            # Not enough data for bootstrap, set CI to 0
            _zero_out(ratings.index)
        else:
            try:
                samples = get_bootstrap_result(
                    battles_df, compute_mle_elo, num_round=100
                )

                if samples.empty:
                    # Fallback: no confidence intervals
                    _zero_out(ratings.index)
                else:
                    # 95% interval expressed as a symmetric +/- margin
                    for name in samples.columns:
                        observed = samples[name].dropna()
                        if len(observed) > 0:
                            low = observed.quantile(0.025)
                            high = observed.quantile(0.975)
                            margins[name] = (high - low) / 2
                        else:
                            margins[name] = 0
            except Exception as bootstrap_error:
                print(
                    f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
                )
                _zero_out(ratings.index)
    except Exception as e:
        # Fallback to old method if Bradley-Terry fails
        print(
            f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
        )
        ratings = pd.Series(compute_online_elo(battles_df))
        margins = {name: 0 for name in ratings.index}

    return ratings, margins


def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
    """
    Create ranking DataFrame with all necessary columns

    Args:
        elo_ratings (pd.Series): Elo ratings for each model
        confidence_intervals (dict): Confidence interval margins for each model;
            models without an entry are treated as having no interval
        vote_counts (dict): Vote counts for each model; models without an
            entry are shown with 0 votes

    Returns:
        pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License].
        When elo_ratings is empty the table has these columns and no rows.
    """
    # Load model metadata
    metadata = load_model_metadata()

    data_columns = ["Model", "Score", "95% CI (±)", "Votes", "Organization", "License"]

    # Create ranking list with Elo ratings and confidence intervals
    ranking_list = []
    for model in elo_ratings.index:
        ci_margin = confidence_intervals.get(model, 0)

        # Get metadata for this model
        model_metadata = metadata.get(model, {})

        ranking_list.append(
            {
                "Model": model,
                "Score": round(elo_ratings[model], 1),
                # A non-positive margin means no interval was computed
                "95% CI (±)": round(ci_margin, 1) if ci_margin > 0 else "-",
                # Tolerate models with no recorded votes
                "Votes": vote_counts.get(model, 0),
                "Organization": model_metadata.get("organization", "Unknown"),
                "License": model_metadata.get("license", "Unknown"),
            }
        )

    # Explicit columns keep the frame well-formed (and sortable by "Score")
    # even when there are no models to rank yet.
    ranking_df = pd.DataFrame(ranking_list, columns=data_columns)

    # Sort by Elo rating (highest first)
    ranking_df = ranking_df.sort_values("Score", ascending=False)
    ranking_df["Rank"] = range(1, len(ranking_df) + 1)

    # Reorder columns
    return ranking_df[["Rank"] + data_columns]