Spaces:
Running
Running
| from pathlib import Path | |
| from collections import OrderedDict | |
# Default value for K; the alternative "1500" is currently disabled.
DEFAULT_K = "∞"

# Banner image location (hosted in the same repo).
banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true"

# HTML snippet: a left-aligned flex container holding the banner image.
BANNER = (
    '<div style="display: flex; justify-content: flex-start;">'
    f'<img src="{banner_url}" alt="Banner" '
    'style="width: 50vw; min-width: 300px; max-width: 800px;'
    'border: 3px solid gray; border-color: gray black;">'
    ' </div>'
)
# Page title as a self-contained HTML document with a centered <h1>.
# The heading is closed with </h1>; it was previously closed with a stray </b>.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </h1> </body> </html>"
# HTML snippet that shows the pairwise win-fractions image at full container width.
WINRATE_HEATMAP = (
    "<div>"
    "<img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true'"
    " style='width:100%;'>"
    "</div>"
)
| CITATION_TEXT = """@article{lin2024wildbench, | |
| title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild}, | |
| author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi}, | |
| year={2024}, | |
| eprint={2406.04770}, | |
| archivePrefix={arXiv}, | |
| primaryClass={cs.CL}, | |
| url={https://arxiv.org/abs/2406.04770} | |
| } | |
| """ | |
# Ordered mapping of column key -> display label; every column maps to itself.
# "Total Puzzles" and "Reason Lens" are currently left out.
column_names = OrderedDict(
    (label, label)
    for label in (
        "Model",
        "Mode",
        "Puzzle Acc",
        "Cell Acc",
        "No answer",
        "Easy Puzzle Acc",
        "Hard Puzzle Acc",
    )
)
| LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**. | |
| """ | |
| # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**. | |
| # The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three. | |
| # **WB Score** individually scores each model based on checklists. | |
| # Evaluator is GPT-4-Turbo. | |
| LEADERBOARD_REMARKS_MAIN = """ | |
| """ | |
# Name of the ranking column.
RANKING_COLUMN = "Puzzle Acc"

# Column order: identity columns, then puzzle-level accuracies, then the rest.
ORDERED_COLUMN_NAMES = (
    ["Model", "Mode"]
    + ["Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc"]
    + ["Cell Acc", "No answer"]
)
# JavaScript source (kept as a string) defining `refresh()`. The function:
#   1. forces the `__theme=light` query parameter, reloading the page if it is missing;
#   2. prefixes the `rank-column-radio` fieldset with a bold "Decoding Mode: " label,
#      laid out in a flex row together with the fieldset's existing children.
# It exits early when a redirect has just been started or when the fieldset is
# not in the DOM, so it never dereferences a null element.
js_light = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
        // A navigation has been started; DOM edits made now would be thrown away.
        return;
    }
    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");
    if (!fieldset) {
        // getElementById returns null when the radio group is absent; nothing to decorate.
        return;
    }
    // Create a new span element with the text "Decoding Mode:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Decoding Mode: ";
    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
    rankBySpan.style.fontSize = "19px"; // Larger font size
    rankBySpan.style.paddingRight = "18px"; // Add padding on the right
    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";
    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);
    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }
    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""
| js_code = """ | |
| function scroll_top() { | |
| console.log("Hello from Gradio!"); | |
| const bubbles = document.querySelectorAll('.bubble-wrap'); | |
| bubbles.forEach((bubble, index) => { | |
| setTimeout(() => { | |
| bubble.scrollTop = 0; | |
| }, index * 100); // Delay of 100ms between each iteration | |
| }); | |
| } | |
| """ | |
# Markdown legend listing each task category with its bolded abbreviation.
TASK_TYPE_STR = "**Tasks**: " + ", ".join(
    [
        "Info seeking (**InfoSek**)",
        "Creative Writing (**CrtWrt**)",
        "Coding&Debugging (**Code**)",
        "Reasoning (**Reason**)",
        "Editing (**Edit**)",
        "**Math**",
        "Planning (**Plan**)",
        "Brainstorming (**Brnstrm**)",
        "Role playing (**RolPly**)",
        "Advice seeking (**AdvSek**)",
        "Data Analysis (**DataAna**)",
    ]
)
# Stylesheet (kept as a string) for the leaderboard UI.
# Fixes relative to the previous version:
#   * `.markdown-text-tiny` was declared twice (10pt, then 12pt); only the later
#     12pt rule ever applied, so the dead 10pt rule is removed.
#   * `font-color` is not a CSS property; `color: red` already sets the text color.
#   * `font-weight: italic` is invalid; italics are set with `font-style: italic`.
#   * `font-decoration: bold` is not a CSS property; bold is `font-weight: bold`.
#   * `#`-prefixed lines are not CSS comments; the disabled declarations now use `/* */`.
css = """
code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}
.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}
.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-style: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}
#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}
#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}
#select-models span{
    font-size: 10pt;
}
#select-tasks span{
    font-size: 10pt;
}
.markdown-text-details{
    margin: 10px;
    padding: 10px;
}
button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}
.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}
#length-margin-radio{
    font-size: 10pt;
    /* padding: 0px; */
    /* margin: 1px; */
}
#show-task-categorized{
    font-size: 12pt;
    font-weight: bold;
}
#show-open-source-models{
    font-size: 12pt;
    font-weight: bold;
}
"""