from datetime import datetime, timezone, timedelta
import hashlib
import os
from typing import Iterable, Union

from datasets import load_dataset
import gradio as gr
import pandas as pd

from constants import (
    RESULTS_REPO,
    ASSAY_RENAME,
    LEADERBOARD_RESULTS_COLUMNS,
    BASELINE_USERNAMES,
)

pd.set_option("display.max_columns", None)


def get_time(tz_name="EST") -> str:
    """Return the current time as 'YYYY-MM-DD HH:MM:SS (TZ)' for a fixed-offset timezone."""
    # Fixed UTC offsets; note "EST" here never switches to daylight saving time.
    offsets = {"EST": -5, "UTC": 0}
    if tz_name not in offsets:
        print(f"Invalid timezone {tz_name!r}, using EST")
        tz_name = "EST"
    offset = offsets[tz_name]
    return (
        datetime.now(timezone(timedelta(hours=offset))).strftime("%Y-%m-%d %H:%M:%S")
        + f" ({tz_name})"
    )


def show_output_box(message):
    # Reveal the (initially hidden) output component and set its text
    return gr.update(value=message, visible=True)


def anonymize_user(username: str) -> str:
    # Anonymize as the first 8 hex chars of a SHA-256 of the username.
    # (fetch_hf_results below uses readable_hash instead, for friendlier names.)
    return hashlib.sha256(username.encode()).hexdigest()[:8]


def fetch_hf_results():
    # load_dataset caches by default (unless download_mode="force_redownload" is passed)
    df = load_dataset(
        RESULTS_REPO,
        data_files="auto_submissions/metrics_all.csv",
    )["train"].to_pandas()
    missing_cols = set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)
    assert (
        not missing_cols
    ), f"Missing expected columns {missing_cols}; found {list(df.columns)}"

    df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
    df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
    # Keep the latest submission only.
    # For baselines: keep the latest row per unique model name.
    df_baseline = df_baseline.sort_values(
        "submission_time", ascending=False
    ).drop_duplicates(subset=["model", "assay", "dataset", "user"], keep="first")
    # For users: keep only their single latest submission per (assay, dataset).
    df_non_baseline = df_non_baseline.sort_values(
        "submission_time", ascending=False
    ).drop_duplicates(subset=["assay", "dataset", "user"], keep="first")
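    # Illustration: if the same user submitted twice for (assay="a1", dataset="d1"),
    # the sort on "submission_time" puts the newer row first and drop_duplicates
    # keeps it, discarding the older one.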
    df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
    df["property"] = df["assay"].map(ASSAY_RENAME)

    # Rename baseline usernames to just "Baseline"
    df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
    # Note: if the table gets crowded, an "is_baseline" column could mark baseline models instead.
    # Anonymize the user column at this point. Users can submit anonymously or
    # non-anonymously, and we show their latest submission either way.
    # (`!= False` treats missing values in the "anonymous" column as anonymous.)
    df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[
        df["anonymous"] != False, "user"
    ].apply(readable_hash)
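    # Illustration (name and output are hypothetical): a user "alice" with
    # anonymous=True would be shown as something like "anon-sunny-otter-3k".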
    
    # Compare against the previous snapshot to log when new results arrive
    if os.path.exists("debug-current-results.csv"):
        old_df = pd.read_csv("debug-current-results.csv")
    else:
        old_df = df  # first run: nothing to compare against
    if len(df) != len(old_df):
        print(f"New results: Length {len(old_df)} -> {len(df)} ({get_time()})")

    df.to_csv("debug-current-results.csv", index=False)
    return df


# Word lists and readable hashing function, similar to coolname or codenamize
ADJECTIVES = [
    "ancient",
    "brave",
    "calm",
    "clever",
    "crimson",
    "curious",
    "dapper",
    "eager",
    "fuzzy",
    "gentle",
    "glowing",
    "golden",
    "happy",
    "icy",
    "jolly",
    "lucky",
    "magical",
    "mellow",
    "nimble",
    "peachy",
    "quick",
    "royal",
    "shiny",
    "silent",
    "sly",
    "sparkly",
    "spicy",
    "spry",
    "sturdy",
    "sunny",
    "swift",
    "tiny",
    "vivid",
    "witty",
]

ANIMALS = [
    "ant",
    "bat",
    "bear",
    "bee",
    "bison",
    "boar",
    "bug",
    "cat",
    "crab",
    "crow",
    "deer",
    "dog",
    "duck",
    "eel",
    "elk",
    "fox",
    "frog",
    "goat",
    "gull",
    "hare",
    "hawk",
    "hen",
    "horse",
    "ibis",
    "kid",
    "kiwi",
    "koala",
    "lamb",
    "lark",
    "lemur",
    "lion",
    "llama",
    "loon",
    "lynx",
    "mole",
    "moose",
    "mouse",
    "newt",
    "otter",
    "owl",
    "ox",
    "panda",
    "pig",
    "prawn",
    "puma",
    "quail",
    "quokka",
    "rabbit",
    "rat",
    "ray",
    "robin",
    "seal",
    "shark",
    "sheep",
    "shrew",
    "skunk",
    "slug",
    "snail",
    "snake",
    "swan",
    "toad",
    "trout",
    "turtle",
    "vole",
    "walrus",
    "wasp",
    "whale",
    "wolf",
    "worm",
    "yak",
    "zebra",
]
NOUNS = [
    "rock",
    "sand",
    "star",
    "tree",
    "leaf",
    "seed",
    "stone",
    "cloud",
    "rain",
    "snow",
    "wind",
    "fire",
    "ash",
    "dirt",
    "mud",
    "ice",
    "wave",
    "shell",
    "dust",
    "sun",
    "moon",
    "hill",
    "lake",
    "pond",
    "reef",
    "root",
    "twig",
    "wood",
]


def readable_hash(
    data: Union[str, bytes, Iterable[int]],
    *,
    salt: Union[str, bytes, None] = None,
    words: tuple[list[str], list[str]] = (ADJECTIVES, ANIMALS + NOUNS),
    sep: str = "-",
    checksum_len: int = 2,  # 0 to disable; 2–3 is plenty
    case: str = "lower",  # "lower" | "title" | "upper"
) -> str:
    """
    Deterministically map input data to 'adjective-animal[-checksum]'. Generated using ChatGPT.

    Examples
    --------
    >>> readable_hash("hello world")
    'magical-panda-6h'

    >>> readable_hash("hello world", salt="my-app-v1", checksum_len=3)
    'royal-otter-1pz'

    >>> readable_hash(b"\x00\x01\x02\x03", case="title", checksum_len=0)
    'Fuzzy-Tiger'

    Vocabulary
    ----------
    ADJECTIVES: ~160 safe, descriptive words (e.g. "ancient", "brave", "silent", "swift")
    ANIMALS: ~80 short, common animals (e.g. "dog", "owl", "whale", "tiger")
    NOUNS: optional set of ~30 neutral nouns (e.g. "rock", "star", "tree", "cloud")

    Combinations
    ------------
    - adjective + animal: ~13,000 unique names
    - adjective + noun: ~5,000 unique names
    - adjective + animal + noun: ~390,000 unique names

    Checksum
    --------
    An optional short base-36 suffix (e.g. "-6h" or "-1pz"). The checksum
    acts as a disambiguator in case two different inputs map to the same
    word combination. With 2-3 characters, collisions become vanishingly rare.
    If you only need fun, human-readable names, you can disable it by setting
    ``checksum_len=0``. If you need unique, stable identifiers, keep it enabled.
    """
    if isinstance(data, str):
        data = data.encode()
    elif isinstance(data, Iterable) and not isinstance(data, (bytes, bytearray)):
        data = bytes(data)

    h = hashlib.blake2b(digest_size=8)  # fast, stable, short digest
    if salt:
        h.update(salt.encode() if isinstance(salt, str) else salt)
        h.update(b"\x00")  # domain-separate salt from data
    h.update(data)
    digest = h.digest()

    # Use the first 6 bytes to index words; last bytes for checksum
    n1 = int.from_bytes(digest[0:3], "big")
    n2 = int.from_bytes(digest[3:6], "big")

    adj = words[0][n1 % len(words[0])]
    noun = words[1][n2 % len(words[1])]
    phrase = f"{adj}{sep}{noun}"

    if checksum_len > 0:
        # Short base36 checksum for collision visibility
        cs = int.from_bytes(digest[6:], "big")
        base36 = ""
        alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
        while cs:
            cs, r = divmod(cs, 36)
            base36 = alphabet[r] + base36
        base36 = (base36 or "0")[:checksum_len]
        phrase = f"{phrase}{sep}{base36}"

    if case == "title":
        phrase = sep.join(p.capitalize() for p in phrase.split(sep))
    elif case == "upper":
        phrase = phrase.upper()

    return phrase
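

# A minimal smoke test, not part of the Gradio app: it demonstrates that
# readable_hash is deterministic, that salting and formatting options change
# the output, and shows the two anonymizers side by side. The inputs are
# hypothetical usernames.
if __name__ == "__main__":
    name = readable_hash("alice")
    assert name == readable_hash("alice")  # same input, same pseudonym
    print("default:     ", name)
    print("salted:      ", readable_hash("alice", salt="my-app-v1"))
    print("no checksum: ", readable_hash("alice", checksum_len=0, case="title"))
    print("hex fallback:", anonymize_user("alice"))  # 8 hex chars
    print("generated at:", get_time("UTC"))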