update
Browse files
app.py
CHANGED
|
@@ -3,8 +3,10 @@ import json
|
|
| 3 |
import pandas as pd
|
| 4 |
from collections import defaultdict
|
| 5 |
import copy as cp
|
| 6 |
-
from urllib.request import urlopen
|
| 7 |
import re
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Constants
|
| 10 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
@@ -22,11 +24,37 @@ GITHUB_REPO = 'https://github.com/open-compass/opencompass'
|
|
| 22 |
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
|
| 23 |
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'
|
| 24 |
|
| 25 |
-
# URL for the JSON data
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Markdown content
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
|
| 31 |
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
|
| 32 |
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
|
|
@@ -34,7 +62,6 @@ The CompassAcademic currently focuses on the comprehensive reasoning abilities o
|
|
| 34 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)π.
|
| 35 |
"""
|
| 36 |
|
| 37 |
-
|
| 38 |
def fix_image_urls(content):
|
| 39 |
"""Fix image URLs in markdown content."""
|
| 40 |
# Handle the specific logo.svg path
|
|
@@ -57,8 +84,8 @@ MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
|
|
| 57 |
MODEL_TYPE = ['API', 'OpenSource']
|
| 58 |
|
| 59 |
|
| 60 |
-
def load_data():
|
| 61 |
-
response = urlopen(
|
| 62 |
data = json.loads(response.read().decode('utf-8'))
|
| 63 |
return data
|
| 64 |
|
|
@@ -141,7 +168,6 @@ def filter_table(df, size_ranges, model_types):
|
|
| 141 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
| 142 |
filtered_df = filtered_df[type_mask]
|
| 143 |
|
| 144 |
-
# Return the filtered DataFrame directly
|
| 145 |
return filtered_df
|
| 146 |
|
| 147 |
|
|
@@ -172,11 +198,13 @@ def calculate_column_widths(df):
|
|
| 172 |
|
| 173 |
|
| 174 |
def create_interface():
|
| 175 |
-
|
|
|
|
| 176 |
df = build_main_table(data)
|
|
|
|
| 177 |
|
| 178 |
with gr.Blocks() as demo:
|
| 179 |
-
gr.Markdown(
|
| 180 |
|
| 181 |
with gr.Tabs() as tabs:
|
| 182 |
with gr.TabItem("π
Main Leaderboard", elem_id='main'):
|
|
@@ -206,6 +234,22 @@ def create_interface():
|
|
| 206 |
column_widths=calculate_column_widths(df),
|
| 207 |
)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
def update_table(size_ranges, model_types):
|
| 210 |
filtered_df = filter_table(df, size_ranges, model_types)
|
| 211 |
return filtered_df.sort_values(
|
|
@@ -224,10 +268,8 @@ def create_interface():
|
|
| 224 |
outputs=table,
|
| 225 |
)
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
# fixed_content = fix_image_urls(readme_content)
|
| 230 |
-
# gr.Markdown(fixed_content)
|
| 231 |
|
| 232 |
with gr.Row():
|
| 233 |
with gr.Accordion("Citation", open=False):
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from collections import defaultdict
|
| 5 |
import copy as cp
|
| 6 |
+
from urllib.request import urlopen, URLError
|
| 7 |
import re
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import time
|
| 10 |
|
| 11 |
# Constants
|
| 12 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
|
|
| 24 |
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
|
| 25 |
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'
|
| 26 |
|
| 27 |
+
# Base URL for the JSON data. This is only a prefix: find_latest_data_url()
# appends a "YYYYMMDD" date string and ".json" to form the full URL.
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
|
| 29 |
+
|
| 30 |
+
def find_latest_data_url(today=None, max_days=365, opener=None, base_url=None):
    """Find the latest available data URL by probing dated file names.

    Starting at *today* and stepping back one calendar day at a time, build
    ``{base_url}{YYYYMMDD}.json`` and return the first URL that can be opened.

    Args:
        today: ``datetime`` to start from. Defaults to ``datetime.now()``.
        max_days: Number of consecutive days (including *today*) to try.
        opener: Callable taking a URL and raising ``URLError`` when the URL
            is unavailable. Defaults to ``urlopen``.
        base_url: URL prefix placed before the date. Defaults to
            ``DATA_URL_BASE``.

    Returns:
        A ``(url, date_str)`` tuple for the first URL that opened, where
        ``date_str`` is formatted ``YYYYMMDD``; ``(None, None)`` if none of
        the probed days had data.
    """
    # Local import: the file imports only `datetime` from the datetime module.
    from datetime import timedelta

    if today is None:
        today = datetime.now()
    if opener is None:
        opener = urlopen
    if base_url is None:
        base_url = DATA_URL_BASE

    for offset in range(max_days):
        # Subtract a timedelta so the search rolls over month and year
        # boundaries; replace(day=today.day - offset) raised ValueError as
        # soon as the day number dropped below 1.
        date_str = (today - timedelta(days=offset)).strftime("%Y%m%d")
        url = f"{base_url}{date_str}.json"
        try:
            response = opener(url)
        except URLError:
            continue
        # The probe only checks availability, so release the connection.
        close = getattr(response, "close", None)
        if callable(close):
            close()
        return url, date_str

    # If no valid URL found, return None
    return None, None
|
| 45 |
+
|
| 46 |
+
def get_latest_data():
    """Get the latest data URL and its update date.

    Returns:
        A ``(data_url, update_time)`` tuple, where ``update_time`` is the
        date taken from the data file name, reformatted from ``YYYYMMDD``
        to ``YYYY-MM-DD``.

    Raises:
        RuntimeError: If find_latest_data_url() found no usable URL.
    """
    data_url, update_time = find_latest_data_url()
    if not data_url:
        # RuntimeError rather than a bare Exception; callers that catch
        # Exception still catch it.
        raise RuntimeError("Could not find valid data URL")
    formatted_update_time = datetime.strptime(update_time, "%Y%m%d").strftime("%Y-%m-%d")
    return data_url, formatted_update_time
|
| 53 |
|
| 54 |
# Markdown content
|
| 55 |
+
def get_leaderboard_title(update_time):
    """Build the Markdown heading for the leaderboard, including *update_time*."""
    heading = f"# CompassAcademic Leaderboard (Last Updated: {update_time})"
    return heading
|
| 57 |
+
|
| 58 |
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
|
| 59 |
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
|
| 60 |
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
|
|
|
|
| 62 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)π.
|
| 63 |
"""
|
| 64 |
|
|
|
|
| 65 |
def fix_image_urls(content):
|
| 66 |
"""Fix image URLs in markdown content."""
|
| 67 |
# Handle the specific logo.svg path
|
|
|
|
| 84 |
MODEL_TYPE = ['API', 'OpenSource']
|
| 85 |
|
| 86 |
|
| 87 |
+
def load_data(data_url):
    """Download the document at *data_url* and parse it as JSON.

    Args:
        data_url: URL of a UTF-8 encoded JSON document.

    Returns:
        The decoded JSON value.
    """
    # The context manager closes the response even if reading or decoding
    # fails; previously the response was never closed.
    with urlopen(data_url) as response:
        payload = response.read().decode('utf-8')
    return json.loads(payload)
|
| 91 |
|
|
|
|
| 168 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
| 169 |
filtered_df = filtered_df[type_mask]
|
| 170 |
|
|
|
|
| 171 |
return filtered_df
|
| 172 |
|
| 173 |
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
def create_interface():
|
| 201 |
+
data_url, update_time = get_latest_data()
|
| 202 |
+
data = load_data(data_url)
|
| 203 |
df = build_main_table(data)
|
| 204 |
+
title = gr.Markdown(get_leaderboard_title(update_time))
|
| 205 |
|
| 206 |
with gr.Blocks() as demo:
|
| 207 |
+
title_comp = gr.Markdown(get_leaderboard_title(update_time))
|
| 208 |
|
| 209 |
with gr.Tabs() as tabs:
|
| 210 |
with gr.TabItem("π
Main Leaderboard", elem_id='main'):
|
|
|
|
| 234 |
column_widths=calculate_column_widths(df),
|
| 235 |
)
|
| 236 |
|
| 237 |
+
def update_data():
    """Periodically check for new data and update the interface.

    Loops forever: sleeps, asks get_latest_data() for the newest data URL,
    and when that URL differs from the enclosing scope's ``data_url``,
    reloads the data, rebuilds the main table, applies the current filter
    selections, and assigns the results to ``title_comp`` and ``table``.
    Any exception raised during a cycle is printed and the loop continues.
    """
    while True:
        time.sleep(300)  # Check every 5 minutes
        try:
            new_data_url, new_update_time = get_latest_data()
            # NOTE(review): data_url is not reassigned in this function, so
            # once a newer URL appears this comparison looks like it stays
            # true on every later cycle -- confirm that is intended.
            if new_data_url != data_url:
                new_data = load_data(new_data_url)
                new_df = build_main_table(new_data)
                filtered_df = filter_table(new_df, size_filter.value, type_filter.value)
                # NOTE(review): these assign the components' .value attributes
                # directly; confirm this refreshes what connected clients see
                # in the Gradio version in use.
                title_comp.value = get_leaderboard_title(new_update_time)
                table.value = filtered_df.sort_values("Average Score", ascending=False)
        except Exception as e:
            # Best-effort refresh: report the failure and try again next cycle.
            print(f"Error updating data: {e}")
            continue
|
| 252 |
+
|
| 253 |
def update_table(size_ranges, model_types):
|
| 254 |
filtered_df = filter_table(df, size_ranges, model_types)
|
| 255 |
return filtered_df.sort_values(
|
|
|
|
| 268 |
outputs=table,
|
| 269 |
)
|
| 270 |
|
| 271 |
+
# Set up periodic data update
|
| 272 |
+
demo.load(update_data)
|
|
|
|
|
|
|
| 273 |
|
| 274 |
with gr.Row():
|
| 275 |
with gr.Accordion("Citation", open=False):
|