	Merge branch 'main' of hf.co:spaces/lmsys/chatbot-arena-leaderboard into main
app.py CHANGED
```diff
@@ -8,7 +8,8 @@ import gradio as gr
 import numpy as np
 
 
-notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+# notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
 
 
 basic_component_values = [None] * 6
@@ -21,7 +22,7 @@ def make_leaderboard_md(elo_results):
 | [Vote](https://chat.lmsys.org/?arena) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 🏆 This leaderboard is based on the following three benchmarks.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 130K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
@@ -227,7 +228,7 @@ Please note that you may see different orders from different ranking methods. Th
     with gr.Row():
         with gr.Column():
             gr.Markdown(
-                "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
+                "#### Figure 3: Bootstrap of MLE Elo Estimates (1000 Rounds of Random Sampling)"
             )
             plot_3 = gr.Plot(p3, show_label=False)
         with gr.Column():
```