Spaces:
Running
Running
| import os | |
| import re | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| from datasets import load_dataset | |
# Models flagged as contaminated in the leaderboard currently in use (none).
CONTAMINATED_MODELS = []

# Version 1 list of models whose results were not verified.
UNVERIFIED_MODELS_V1 = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "google/flame-24b-july-2024",
    "Cohere March 2024",
    "facebook/Self-taught-Llama-3-70B",
    "facebook/Self-taught-evaluator-llama3.1-70B",
    "google/flame-1.0-24B-july-2024",
    "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
    "Salesforce/SFR-nemo-12B-Judge-r",
    "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
    "SF-Foundation/TextEval-OffsetBias-12B",
    "SF-Foundation/TextEval-Llama3.1-70B",
    "nvidia/Llama-3.1-Nemotron-70B-Reward",
]

# No longer used
CONTAMINATED_MODELS_V1 = [
    "Skywork/Skywork-Reward-Gemma-2-27B",
    "Skywork/Skywork-Critic-Llama-3.1-70B",
    "LxzGordon/URM-LLaMa-3.1-8B",
    "Skywork/Skywork-Reward-Llama-3.1-8B",
    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
    "nicolinho/QRM-Llama3.1-8B",
    "nicolinho/QRM-Llama3-8B",
    "general-preference/GPM-Llama-3.1-8B",
    "SF-Foundation/TextEval-Llama3.1-70B",
    "ZiyiYe/Con-J-Qwen2-7B",
    "Ray2333/Gemma-2B-rewardmodel-ft",
    "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
]

# Version 2 list of models whose results were not verified.
UNVERIFIED_MODELS_V2 = [
    "ContextualAI/LMUnit-llama3.1-70b",
    "ContextualAI/LMUnit-qwen2.5-72b",
    "Databricks-Mosaic-Research/PGRM",
]

# The active list is the V2 list itself (same object, not a copy).
UNVERIFIED_MODELS = UNVERIFIED_MODELS_V2
# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Return the HTML snippet used to display ``model_name``.

    Names longer than 50 characters are shortened to their first 47
    characters followed by "...". The anchor target depends on the name:

    * ``"random"`` and names whose first ``/``-separated part is ``"PoLL"``
      are returned as plain text, without a link.
    * ``"Cohere March 2024"`` and names whose first part is ``"openai"``,
      ``"Anthropic"`` or ``"google"`` link to the matching organisation page
      on huggingface.co.
    * every other name links to ``link``.

    A `` *`` suffix is appended when the name is in ``UNVERIFIED_MODELS`` and
    a `` ⚠️`` suffix when it is in ``CONTAMINATED_MODELS``.

    Args:
        link: URL used for models without a special case.
        model_name: Model identifier, usually ``"org/name"``.

    Returns:
        The display string (HTML anchor or plain text) plus any markers.
    """
    # Keep the untruncated name: the marker lists hold full names.
    full_name = model_name

    # if model_name is above 50 characters, return first 47 characters and "..."
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    org = model_name.split("/")[0]

    if model_name == "random" or org == "PoLL":
        # Entries with no page to link to are shown as plain text.
        output = model_name
    else:
        if model_name == "Cohere March 2024":
            href = "https://huggingface.co/Cohere"
        elif org in ("openai", "Anthropic", "google"):
            href = f"https://huggingface.co/{org}"
        else:
            # Fall-through branch. Previously this assignment was not guarded
            # by an ``else`` and clobbered every special case above.
            href = link
        output = f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

    if full_name in UNVERIFIED_MODELS:
        output += " *"
    if full_name in CONTAMINATED_MODELS:
        output += " ⚠️"
    return output
def undo_hyperlink(html_string):
    """Extract the first run of text enclosed between ``>`` and ``<``.

    Args:
        html_string: String to search, e.g. an ``<a ...>name</a>`` snippet.

    Returns:
        The characters between the first ``>`` and the following ``<`` when
        at least one non-``<`` character sits between them; otherwise the
        literal string ``"No text found"``.
    """
    found = re.search(r">[^<]+<", html_string)
    if found is None:
        return "No text found"
    # Skip the leading '>' and the trailing '<' of the matched span.
    return html_string[found.start() + 1 : found.end() - 1]
# Columns that are never shown: bookkeeping fields and outdated/removed subsets.
_DROPPED_COLUMNS = (
    "chat_template",
    "model_beaker",
    "ref_model",
    "xstest",  # outdated data
    "anthropic",  # outdated data
    "summarize_prompted",  # outdated data
    "pku_better",  # removed from the leaderboard
    "pku_safer",  # removed from the leaderboard
)


def _list_result_files(data_dir):
    """Return the paths of all ``<org>/<name>.json`` files directly under ``data_dir``."""
    found = []
    for org in os.listdir(data_dir):
        org_dir = data_dir / org
        if not org_dir.is_dir():
            continue
        for name in os.listdir(org_dir):
            if name.endswith(".json") and (org_dir / name).is_file():
                found.append(org_dir / name)
    return found


# Define a function to fetch and process data
def load_all_data(data_repo, subdir: str, subsubsets=False) -> pd.DataFrame:
    """Load every per-model JSON result under ``data_repo/subdir`` into one table.

    The directory is expected to contain one sub-directory per organisation,
    each holding ``.json`` result files. Each file is read with
    ``load_dataset("json", ...)``, so ``data_repo`` must be a local path
    (presumably a checkout of the results repo -- confirm against the caller).

    Processing steps:

    * drop the columns listed in ``_DROPPED_COLUMNS`` when present;
    * multiply every remaining column except ``model`` and ``model_type``
      by 100 and add their row-wise NaN-ignoring mean as ``average``;
    * replace ``model`` with the output of ``model_hyperlink``;
    * order columns as ``model``, ``model_type`` (if present), ``average``,
      then the remaining columns alphabetically;
    * drop rows whose ``model_type`` matches ``"DPO Ref. Free"``.

    Args:
        data_repo: Root directory of the results (``str`` or path-like).
        subdir: Sub-directory of ``data_repo`` to read.
        subsubsets: Accepted for interface compatibility; not used here.

    Returns:
        The combined DataFrame. When no ``.json`` file is found, an empty
        DataFrame with the columns ``model`` and ``average``.

    Raises:
        ValueError: If result files were loaded but none has a ``model`` column.
    """
    data_dir = Path(data_repo) / subdir

    # Load files one by one because they may not all have the same entries.
    # The path is built with pathlib so it matches the directory that was
    # listed, whether or not ``data_repo`` ends with a separator.
    frames = [
        pd.DataFrame(load_dataset("json", data_files=str(path), split="train"))
        for path in _list_result_files(data_dir)
    ]
    if not frames:
        return pd.DataFrame(columns=["model", "average"])

    # Newer frames go first, matching the order produced by prepending each
    # frame to the running table.
    df = pd.concat(frames[::-1])

    unwanted = [name for name in _DROPPED_COLUMNS if name in df.columns]
    df = df.drop(columns=unwanted)

    # sort columns alphabetically
    df = df.reindex(sorted(df.columns), axis=1)
    if "model" not in df.columns:
        raise ValueError("'model' is not in list")

    # model_type may be missing (pref tests may not have it).
    has_model_type = "model_type" in df.columns
    score_cols = [name for name in df.columns if name not in ("model", "model_type")]

    # convert to score and add the average column
    df[score_cols] = df[score_cols] * 100
    df["average"] = np.nanmean(df[score_cols].values, axis=1)

    # apply model_hyperlink function to column "model"
    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))

    # model first, then model_type (when available), then average
    leading = ["model", "model_type", "average"] if has_model_type else ["model", "average"]
    df = df.loc[:, leading + score_cols]

    # remove models with DPO Ref. Free as type (future work)
    if has_model_type:
        df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
    return df