Spaces:
Runtime error
Runtime error
Commit
·
ec01232
1
Parent(s):
6bd3956
fix due to vision leaderboard addition upstream
Browse files
- app.py +3 -0
- release_date_mapping.json +75 -0
- utils.py +15 -0
app.py
CHANGED
|
@@ -48,6 +48,9 @@ latest_elo_file_local = download_latest_data_from_space(
|
|
| 48 |
with open(latest_elo_file_local, "rb") as fin:
|
| 49 |
elo_results = pickle.load(fin)
|
| 50 |
|
|
|
|
|
|
|
|
|
|
| 51 |
arena_dfs = {}
|
| 52 |
for k in KEY_TO_CATEGORY_NAME.keys():
|
| 53 |
if k not in elo_results:
|
|
|
|
| 48 |
with open(latest_elo_file_local, "rb") as fin:
|
| 49 |
elo_results = pickle.load(fin)
|
| 50 |
|
| 51 |
+
# TO-DO: need to also include vision
|
| 52 |
+
elo_results = elo_results["text"]
|
| 53 |
+
|
| 54 |
arena_dfs = {}
|
| 55 |
for k in KEY_TO_CATEGORY_NAME.keys():
|
| 56 |
if k not in elo_results:
|
release_date_mapping.json
CHANGED
|
@@ -493,5 +493,80 @@
|
|
| 493 |
"key": "yi-large-preview",
|
| 494 |
"Model": "Yi-Large-preview",
|
| 495 |
"Release Date": "2024-05-23"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
}
|
| 497 |
]
|
|
|
|
| 493 |
"key": "yi-large-preview",
|
| 494 |
"Model": "Yi-Large-preview",
|
| 495 |
"Release Date": "2024-05-23"
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"key": "claude-3-5-sonnet-20240620",
|
| 499 |
+
"Model": "Claude 3.5 Sonnet",
|
| 500 |
+
"Release Date": "2024-07-01"
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"key": "deepseek-coder-v2",
|
| 504 |
+
"Model": "DeepSeek-Coder-V2-Instruct",
|
| 505 |
+
"Release Date": "2024-07-01"
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"key": "gemini-1.5-flash-api-0514",
|
| 509 |
+
"Model": "Gemini-1.5-Flash-API-0514",
|
| 510 |
+
"Release Date": "2024-07-01"
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"key": "gemini-1.5-pro-api-0514",
|
| 514 |
+
"Model": "Gemini-1.5-Pro-API-0514",
|
| 515 |
+
"Release Date": "2024-07-01"
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"key": "gemini-advanced-0514",
|
| 519 |
+
"Model": "Gemini-Advanced-0514",
|
| 520 |
+
"Release Date": "2024-07-01"
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"key": "gemma-2-27b-it",
|
| 524 |
+
"Model": "Gemma-2-27B-it",
|
| 525 |
+
"Release Date": "2024-07-01"
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"key": "gemma-2-9b-it",
|
| 529 |
+
"Model": "Gemma-2-9B-it",
|
| 530 |
+
"Release Date": "2024-07-01"
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"key": "glm-4-0520",
|
| 534 |
+
"Model": "GLM-4-0520",
|
| 535 |
+
"Release Date": "2024-07-01"
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"key": "nemotron-4-340b-instruct",
|
| 539 |
+
"Model": "Nemotron-4-340B-Instruct",
|
| 540 |
+
"Release Date": "2024-07-01"
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"key": "phi-3-medium-4k-instruct",
|
| 544 |
+
"Model": "Phi-3-Medium-4k-Instruct",
|
| 545 |
+
"Release Date": "2024-07-01"
|
| 546 |
+
},
|
| 547 |
+
{
|
| 548 |
+
"key": "phi-3-small-8k-instruct",
|
| 549 |
+
"Model": "Phi-3-Small-8k-Instruct",
|
| 550 |
+
"Release Date": "2024-07-01"
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"key": "qwen2-72b-instruct",
|
| 554 |
+
"Model": "Qwen2-72B-Instruct",
|
| 555 |
+
"Release Date": "2024-07-01"
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"key": "reka-flash-preview-20240611",
|
| 559 |
+
"Model": "Reka-Flash-Preview-20240611",
|
| 560 |
+
"Release Date": "2024-07-01"
|
| 561 |
+
},
|
| 562 |
+
{
|
| 563 |
+
"key": "yi-1.5-34b-chat",
|
| 564 |
+
"Model": "Yi-1.5-34B-Chat",
|
| 565 |
+
"Release Date": "2024-07-01"
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"key": "yi-large",
|
| 569 |
+
"Model": "Yi-Large",
|
| 570 |
+
"Release Date": "2024-07-01"
|
| 571 |
}
|
| 572 |
]
|
utils.py
CHANGED
|
@@ -11,6 +11,7 @@ from huggingface_hub import HfFileSystem, hf_hub_download
|
|
| 11 |
KEY_TO_CATEGORY_NAME = {
|
| 12 |
"full": "Overall",
|
| 13 |
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
|
|
|
|
| 14 |
"coding": "Coding",
|
| 15 |
"hard_6": "Hard Prompts (Overall)",
|
| 16 |
"hard_english_6": "Hard Prompts (English)",
|
|
@@ -18,14 +19,22 @@ KEY_TO_CATEGORY_NAME = {
|
|
| 18 |
"english": "English",
|
| 19 |
"chinese": "Chinese",
|
| 20 |
"french": "French",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"no_tie": "Exclude Ties",
|
| 22 |
"no_short": "Exclude Short Query (< 5 tokens)",
|
| 23 |
"no_refusal": "Exclude Refusal",
|
| 24 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
|
|
|
| 25 |
}
|
|
|
|
| 26 |
CAT_NAME_TO_EXPLANATION = {
|
| 27 |
"Overall": "Overall Questions",
|
| 28 |
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
|
|
|
| 29 |
"Coding": "Coding: whether conversation contains code snippets",
|
| 30 |
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
| 31 |
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
|
@@ -33,10 +42,16 @@ CAT_NAME_TO_EXPLANATION = {
|
|
| 33 |
"English": "English Prompts",
|
| 34 |
"Chinese": "Chinese Prompts",
|
| 35 |
"French": "French Prompts",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
"Exclude Ties": "Exclude Ties and Bothbad",
|
| 37 |
"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
|
| 38 |
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
|
| 39 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
|
|
|
| 40 |
}
|
| 41 |
|
| 42 |
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
|
|
|
|
| 11 |
KEY_TO_CATEGORY_NAME = {
|
| 12 |
"full": "Overall",
|
| 13 |
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
|
| 14 |
+
"multiturn": "Multi-Turn",
|
| 15 |
"coding": "Coding",
|
| 16 |
"hard_6": "Hard Prompts (Overall)",
|
| 17 |
"hard_english_6": "Hard Prompts (English)",
|
|
|
|
| 19 |
"english": "English",
|
| 20 |
"chinese": "Chinese",
|
| 21 |
"french": "French",
|
| 22 |
+
"german": "German",
|
| 23 |
+
"spanish": "Spanish",
|
| 24 |
+
"russian": "Russian",
|
| 25 |
+
"japanese": "Japanese",
|
| 26 |
+
"korean": "Korean",
|
| 27 |
"no_tie": "Exclude Ties",
|
| 28 |
"no_short": "Exclude Short Query (< 5 tokens)",
|
| 29 |
"no_refusal": "Exclude Refusal",
|
| 30 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
| 31 |
+
"full_old": "Overall (Deprecated)",
|
| 32 |
}
|
| 33 |
+
|
| 34 |
CAT_NAME_TO_EXPLANATION = {
|
| 35 |
"Overall": "Overall Questions",
|
| 36 |
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
| 37 |
+
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
|
| 38 |
"Coding": "Coding: whether conversation contains code snippets",
|
| 39 |
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
| 40 |
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
|
|
|
|
| 42 |
"English": "English Prompts",
|
| 43 |
"Chinese": "Chinese Prompts",
|
| 44 |
"French": "French Prompts",
|
| 45 |
+
"German": "German Prompts",
|
| 46 |
+
"Spanish": "Spanish Prompts",
|
| 47 |
+
"Russian": "Russian Prompts",
|
| 48 |
+
"Japanese": "Japanese Prompts",
|
| 49 |
+
"Korean": "Korean Prompts",
|
| 50 |
"Exclude Ties": "Exclude Ties and Bothbad",
|
| 51 |
"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
|
| 52 |
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
|
| 53 |
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
|
| 54 |
+
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
|
| 55 |
}
|
| 56 |
|
| 57 |
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
|