Spaces:
Running
Running
| import json | |
| import os | |
| from typing import Any, Dict | |
| import pandas as pd | |
| from huggingface_hub import HfApi, hf_hub_download | |
| from required_categories import required_mmlu_categories, required_unified_exam_categories | |
class ModelHandler:
    """Cache ArmBench-LLM results from the Hugging Face Hub and tabulate them.

    Results are kept in ``self.model_infos``, a list of
    ``{"model_name": <repo id>, "results": <parsed results.json>}`` dicts,
    and mirrored to the JSON file at ``model_infos_path``.
    """

    def __init__(self, model_infos_path="model_results.json"):
        """Create the Hub client and load any previously cached results.

        Args:
            model_infos_path: Path of the local JSON cache file.
        """
        self.api = HfApi()
        self.model_infos_path = model_infos_path
        self.model_infos = self._load_model_infos()

    def _load_model_infos(self) -> list:
        """Return the cached model entries, or an empty list if there are none.

        Returns:
            The list stored in the cache file. An empty list is returned when
            the file does not exist or when it holds something other than a
            list.
        """
        if not os.path.exists(self.model_infos_path):
            # Must be a list: callers append to it and iterate it as a
            # sequence of dict entries.
            return []
        with open(self.model_infos_path, encoding="utf-8") as f:
            loaded = json.load(f)
        # Earlier revisions started from ``{}`` and could persist that to
        # disk; a non-list cache cannot be appended to, so treat it as empty.
        return loaded if isinstance(loaded, list) else []

    def _save_model_infos(self):
        """Write ``self.model_infos`` to the cache file as indented JSON."""
        print("Saving model infos")
        with open(self.model_infos_path, "w", encoding="utf-8") as f:
            json.dump(self.model_infos, f, indent=4)

    @staticmethod
    def _build_score_rows(model_infos, results_key, required_categories) -> list:
        """Build one ``{"Model": name, category: score, ...}`` row per model.

        Args:
            model_infos: Iterable of cached model entries.
            results_key: Key inside each entry's ``"results"`` dict that holds
                a list of ``{"category": ..., "score": ...}`` dicts.
            required_categories: Categories that must all be present for a
                model to get a row.

        Returns:
            The rows for every model that reports all required categories.
            Models with no results under ``results_key`` are skipped.
        """
        rows = []
        for model in model_infos:
            model_name = model["model_name"]
            entries = model.get("results", {}).get(results_key, [])
            if not entries:
                continue
            present = {entry["category"] for entry in entries}
            if not all(category in present for category in required_categories):
                continue
            row: Dict[str, Any] = {"Model": model_name}
            for entry in entries:
                row[entry["category"]] = entry["score"]
            rows.append(row)
        return rows

    def get_arm_bench_data(self):
        """Fetch results for new ArmBench-LLM models and tabulate all results.

        Every Hub model returned by the ``"ArmBench-LLM"`` filter that is not
        yet cached and has a top-level ``results.json`` is downloaded and
        appended to the cache, which is then saved. A model whose download or
        parsing fails is reported and skipped.

        Returns:
            A ``(mmlu_df, unified_exam_df)`` tuple of DataFrames with a
            ``"Model"`` column plus one column per category.
        """
        models = self.api.list_models(filter="ArmBench-LLM")
        known_names = {model["model_name"] for model in self.model_infos}
        repositories = [model.modelId for model in models]

        for repo_id in repositories:
            # Check the cache first so already-known repositories do not cost
            # a network round trip for their file listing.
            if repo_id in known_names:
                continue
            if "results.json" not in self.api.list_repo_files(repo_id):
                continue
            try:
                result_path = hf_hub_download(repo_id, filename="results.json")
                with open(result_path, encoding="utf-8") as f:
                    results = json.load(f)
            except Exception as e:
                # Best effort: one broken repository must not block the rest.
                print(f"Error loading {repo_id} - {e}")
                continue
            self.model_infos.append({
                "model_name": repo_id,
                "results": results,
            })
            known_names.add(repo_id)

        self._save_model_infos()

        mmlu_data = self._build_score_rows(
            self.model_infos, "mmlu_results", required_mmlu_categories
        )
        unified_exam_data = self._build_score_rows(
            self.model_infos, "unified_exam_results", required_unified_exam_categories
        )
        mmlu_df = pd.DataFrame(mmlu_data)
        unified_exam_df = pd.DataFrame(unified_exam_data)
        return mmlu_df, unified_exam_df