Spaces · Running

terryyz committed
Commit ba99c06 · 1 Parent(s): bc55571
update

Browse files:
- .gitignore +3 -0
- api_config.yaml +49 -1
- app.py +254 -472
- elo_calculation.py +315 -0
- ranking.py +199 -0
- requirements.txt +2 -1
- sandbox/sandbox_manager.py +1 -1
- voting.py +329 -0
.gitignore CHANGED

@@ -33,6 +33,9 @@ logs/
 *.manifest
 *.spec
 
+e2b_sandbox_template/
+build.sh
+
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
api_config.yaml CHANGED

@@ -5,6 +5,8 @@ gpt-4o-mini-2024-07-18:
   parallel: 32
   max_tokens: 8192
   temperature: 0.0
+  organization: OpenAI
+  license: Proprietary
 
 gpt-4.1-mini-2025-04-14:
   model: gpt-4.1-mini-2025-04-14
@@ -13,6 +15,8 @@ gpt-4.1-mini-2025-04-14:
   parallel: 32
   max_tokens: 8192
   temperature: 0.0
+  organization: OpenAI
+  license: Proprietary
 
 # o1-2024-12-17:
 # model: o1-2024-12-17
@@ -21,6 +25,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: OpenAI
+# license: Proprietary
 
 # o4-mini-2025-04-16:
 # model: o4-mini-2025-04-16
@@ -29,6 +35,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 1.0
+# organization: OpenAI
+# license: Proprietary
 
 # o3-mini-2025-01-31:
 # model: o3-mini-2025-01-31
@@ -37,6 +45,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: OpenAI
+# license: Proprietary
 
 # gemini-2.0-flash-001:
 # model: google/gemini-2.0-flash-001
@@ -47,6 +57,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 81920
 # temperature: 0.0
+# organization: Google
+# license: Proprietary
 
 # gemini-2.5-pro:
 # model: google/gemini-2.5-pro
@@ -57,6 +69,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Google
+# license: Proprietary
 
 # gemini-2.5-flash:
 # model: google/gemini-2.5-flash
@@ -67,6 +81,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Google
+# license: Proprietary
 
 # claude35_haiku:
 # model: bedrock/anthropic.claude-3-5-haiku-20241022-v1:0
@@ -75,6 +91,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Anthropic
+# license: Proprietary
 
 # claude35_sonnet:
 # model: bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
@@ -83,6 +101,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Anthropic
+# license: Proprietary
 
 # claude37_sonnet:
 # model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
@@ -91,6 +111,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Anthropic
+# license: Proprietary
 
 # qwen3-coder:
 # model: qwen/qwen3-coder
@@ -101,6 +123,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Alibaba
+# license: Apache 2.0
 
 # kimi-k2:
 # model: moonshotai/kimi-k2
@@ -111,6 +135,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Moonshot
+# license: Modified MIT
 
 # claude-4-sonnet:
 # model: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0
@@ -119,6 +145,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 16
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Anthropic
+# license: Proprietary
 
 # claude-4-opus:
 # model: bedrock/us.anthropic.claude-opus-4-20250514-v1:0
@@ -127,6 +155,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 16
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Anthropic
+# license: Proprietary
 
 # gpt-oss-120b:
 # model: openai/gpt-oss-120b
@@ -137,6 +167,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 1.0
+# organization: OpenAI
+# license: MIT
 
 # gpt-oss-20b:
 # model: openai/gpt-oss-20b
@@ -147,6 +179,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 1.0
+# organization: OpenAI
+# license: MIT
 
 # deepseek-chat-v3-0324:
 # model: deepseek/deepseek-chat-v3-0324
@@ -157,6 +191,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: DeepSeek
+# license: MIT
 
 # deepseek-chat-v3.1:
 # model: deepseek-chat
@@ -167,6 +203,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: DeepSeek
+# license: MIT
 
 # glm-4.5:
 # model: z-ai/glm-4.5
@@ -177,6 +215,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: Zhipu AI
+# license: Custom
 
 # gpt-4.1-2025-04-14:
 # model: gpt-4.1-2025-04-14
@@ -185,6 +225,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 0.0
+# organization: OpenAI
+# license: Proprietary
 
 
 # deepseek-r1-0528:
@@ -196,6 +238,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 81920
 # temperature: 1.0
+# organization: DeepSeek
+# license: MIT
 
 # gpt-5-2025-08-07:
 # model: gpt-5-2025-08-07
@@ -204,6 +248,8 @@ gpt-4.1-mini-2025-04-14:
 # parallel: 32
 # max_tokens: 8192
 # temperature: 1.0
+# organization: OpenAI
+# license: Proprietary
 
 # grok-code:
 # model: x-ai/grok-code-fast-1
@@ -213,4 +259,6 @@ gpt-4.1-mini-2025-04-14:
 # api_type: openai_thinking
 # parallel: 32
 # max_tokens: 8192
-# temperature: 1.0
+# temperature: 1.0
+# organization: xAI
+# license: Proprietary
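The new `organization` and `license` fields annotate each model entry, and app.py's new `get_sample_weight` (below) reads an optional per-model `weight` key from this same config. A minimal sketch of how such entries might be consumed — illustrative only, not part of the commit; it assumes PyYAML, while the app itself loads the config through `make_config` from `completion`, whose internals are not shown here:

```python
# Hypothetical consumer of api_config.yaml (not part of this commit).
import yaml

with open("api_config.yaml") as f:
    api_config = yaml.safe_load(f)

for name, cfg in (api_config or {}).items():
    org = cfg.get("organization", "Unknown")  # field added in this commit
    lic = cfg.get("license", "Unknown")       # field added in this commit
    weight = cfg.get("weight", 1.0)           # optional key read by get_sample_weight
    print(f"{name}: org={org}, license={lic}, sampling weight={weight}")
```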
app.py CHANGED

@@ -10,8 +10,37 @@ import datetime
 import os
 import asyncio
 import concurrent.futures
+import random
 import time
+import numpy as np
+from collections import defaultdict
 from datasets import Dataset, load_dataset
+
+# Import Elo calculation utilities
+from elo_calculation import (
+    calculate_elo_with_confidence_intervals,
+    create_ranking_dataframe,
+)
+
+# Import ranking functionality
+from ranking import (
+    load_ranking_data,
+    update_ranking_display,
+    force_update_ranking_display,
+    create_ranking_tab,
+    setup_ranking_handlers,
+)
+
+# Import voting functionality
+from voting import (
+    handle_vote,
+    save_vote_to_hf,
+    serialize_interactions,
+    create_vote_ui,
+    should_show_vote_buttons,
+    get_vote_ui_updates,
+    setup_vote_handlers,
+)
+
 # Import completion utilities
 from completion import make_config, registered_api_completion
 from sandbox.prompts import GENERAL_SANDBOX_INSTRUCTION
@@ -95,18 +124,130 @@ available_models = list(api_config.keys()) if api_config else []
 HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# Global ranking data cache
-ranking_data = None
-ranking_last_updated = None
 
 def get_random_models():
-    """Get two random models from available models"""
+    """Get two random models from available models using weighted sampling"""
     if len(available_models) < 2:
         return available_models[0] if available_models else None, available_models[0] if available_models else None
 
-
-
-
+    # Use get_battle_pair for weighted sampling
+    return get_battle_pair(available_models, {}, [], {}, [])
+
+# Configuration for battle sampling
+ANON_MODELS = []  # Models that should not battle against each other in anonymous mode
+BATTLE_STRICT_TARGETS = {}  # Strict battle targets for specific models
+
+def get_sample_weight(model, outage_models, sampling_weights, sampling_boost_models=None):
+    """Get the sampling weight for a model"""
+    # Check if model is in outage
+    if model in outage_models:
+        return 0
+
+    # Get base weight from API config
+    model_config = api_config.get(model, {})
+    base_weight = model_config.get('weight', 1.0)  # Default weight is 1.0
+
+    # Apply custom sampling weights if provided
+    if model in sampling_weights:
+        base_weight *= sampling_weights[model]
+
+    # Apply boost if model is in boost list
+    if sampling_boost_models and model in sampling_boost_models:
+        base_weight *= 2.0  # Example boost factor
+
+    return base_weight
+
+def is_model_match_pattern(model, pattern):
+    """Check if model matches a pattern (for battle strict targets)"""
+    # Simple pattern matching - can be extended for more complex patterns
+    if isinstance(pattern, str):
+        return pattern in model
+    elif isinstance(pattern, list):
+        return any(p in model for p in pattern)
+    return False
+
+def get_battle_pair(
+    models, battle_targets, outage_models, sampling_weights, sampling_boost_models
+):
+    """
+    Sample a pair of models for battle using weighted sampling.
+
+    Args:
+        models: List of available model names
+        battle_targets: Dict mapping models to their preferred battle targets
+        outage_models: List of models currently in outage
+        sampling_weights: Dict of custom sampling weights per model
+        sampling_boost_models: List of models to boost in sampling
+
+    Returns:
+        Tuple of (model_a, model_b) for battle
+    """
+    if len(models) == 1:
+        return models[0], models[0]
+
+    # Calculate weights for all models
+    model_weights = []
+    for model in models:
+        weight = get_sample_weight(
+            model, outage_models, sampling_weights, sampling_boost_models
+        )
+        model_weights.append(weight)
+    total_weight = np.sum(model_weights)
+
+    if total_weight == 0:
+        # Fallback to uniform sampling if all weights are 0
+        return random.sample(models, 2)
+
+    model_weights = np.array(model_weights) / total_weight
+
+    # Sample first model
+    chosen_idx = np.random.choice(len(models), p=model_weights)
+    chosen_model = models[chosen_idx]
+
+    # Find eligible rival models
+    rival_models = []
+    rival_weights = []
+    for model in models:
+        if model == chosen_model:
+            continue
+        if model in ANON_MODELS and chosen_model in ANON_MODELS:
+            continue
+        if chosen_model in BATTLE_STRICT_TARGETS:
+            if not is_model_match_pattern(model, BATTLE_STRICT_TARGETS[chosen_model]):
+                continue
+        if model in BATTLE_STRICT_TARGETS:
+            if not is_model_match_pattern(chosen_model, BATTLE_STRICT_TARGETS[model]):
+                continue
+
+        weight = get_sample_weight(model, outage_models, sampling_weights)
+        if (
+            weight != 0
+            and chosen_model in battle_targets
+            and model in battle_targets[chosen_model]
+        ):
+            # boost to higher chance for targeted battles
+            weight = 0.5 * total_weight / len(battle_targets[chosen_model])
+        rival_models.append(model)
+        rival_weights.append(weight)
+
+    if not rival_models:
+        # Fallback: if no eligible rivals, pick any other model
+        rival_models = [m for m in models if m != chosen_model]
+        if rival_models:
+            rival_model = random.choice(rival_models)
+        else:
+            rival_model = chosen_model
+    else:
+        rival_weights = np.array(rival_weights) / np.sum(rival_weights)
+        rival_idx = np.random.choice(len(rival_models), p=rival_weights)
+        rival_model = rival_models[rival_idx]
+
+    # Randomly swap order
+    swap = np.random.randint(2)
+    if swap == 0:
+        return chosen_model, rival_model
+    else:
+        return rival_model, chosen_model
 
 def create_chat_state(model_name: str) -> dict:
     """Create a new chat state for a model"""
@@ -488,7 +629,7 @@ def clear_chat(state0, state1):
 
     # Get current model names for display
    model_a, model_b = get_random_models()
-
+    print(f"Model A: {model_a}, Model B: {model_b}")
    return (
        None,  # state0
        None,  # state1
@@ -524,52 +665,49 @@ def retry_last_message(state0, state1, model_a, model_b):
     """Retry the last user message"""
     if not state0 or not state1:
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Get the last user message
     last_user_message = ""
     for msg in reversed(state0["messages"]):
         if msg["role"] == "user":
             last_user_message = msg["content"]
             break
-
+
     if not last_user_message:
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Remove the last user message and assistant responses from both states
     if state0["messages"] and state0["messages"][-1]["role"] == "assistant":
         state0["messages"].pop()  # Remove last assistant response
     if state0["messages"] and state0["messages"][-1]["role"] == "user":
         state0["messages"].pop()  # Remove last user message
-
+
     if state1["messages"] and state1["messages"][-1]["role"] == "assistant":
         state1["messages"].pop()  # Remove last assistant response
     if state1["messages"] and state1["messages"][-1]["role"] == "user":
         state1["messages"].pop()  # Remove last user message
-
+
     # Generate new responses with the same message
     result = add_text_and_generate(state0, state1, last_user_message, 0.4, 8192, model_a, model_b)
-
+
     # Extract the state from the result
     new_state0, new_state1 = result[0], result[1]
-
+
     # Check if both models have output and are not generating to show vote buttons
-    show_vote_buttons = (
-        new_state0
-        and new_state0.get("has_output", False)
-        and not new_state0.get("generating", False)
-        and new_state1
-        and new_state1.get("has_output", False)
-        and not new_state1.get("generating", False)
-    )
-
+    show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)
+
     # Return all the original outputs plus the updated state for run buttons
     return (
         new_state0,  # state0
         new_state1,  # state1
         result[2],  # chatbot_a (chat0)
         result[3],  # chatbot_b (chat1)
-
-
+        (
+            result[4]["content"] if isinstance(result[4], dict) else result[4]
+        ),  # response_a (response0)
+        (
+            result[5]["content"] if isinstance(result[5], dict) else result[5]
+        ),  # response_b (response1)
         result[6],  # code_a (code0)
         result[7],  # code_b (code1)
         result[10] if len(result) > 10 else "",  # sandbox_state0
@@ -608,37 +746,37 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     """Send message to left model (Model A) only"""
     if not text.strip():
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Initialize states if needed
     if state0 is None:
         state0 = create_chat_state(model_a)
     if state1 is None:
         state1 = create_chat_state(model_b)
-
+
     # Add user message to left state only
     state0["messages"].append({"role": "user", "content": text})
     state0["generating"] = True
-
+
     # Generate response for left model only
     state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
     state0["messages"].append({"role": "assistant", "content": response0["content"]})
     state0["has_output"] = True
     state0["generating"] = False
-
+
     # Format chat history for display
     chat0 = format_chat_history(state0["messages"])
     chat1 = format_chat_history(state1["messages"]) if state1 else []
-
+
     # Extract code from response for sandbox
     sandbox_state0 = state0.get("sandbox_state", create_sandbox_state())
     sandbox_state0, code0, env0 = extract_and_execute_code(response0["content"], sandbox_state0)
     state0["sandbox_state"] = sandbox_state0
-
+
     # Clear previous sandbox outputs
     sandbox_output0 = ""
     sandbox_component_update0 = gr.update(value=("", False, []), visible=False)
     sandbox_view_a = ""
-
+
     # Run sandbox execution if there's code
     if code0.strip():
         install_command0 = sandbox_state0.get('install_command', "")
@@ -653,28 +791,33 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
             sandbox_view_a += f"# Output\n{sandbox_output0}"
         if sandbox_error0:
             sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a
-
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
     turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]]) if state1 else 0
-
+
     chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages'])}"
     chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages']) if state1 else 0}"
-
+
     # Don't show vote buttons since only one model responded
     show_vote_buttons = False
-
+
     return (
         state0,  # state0
         state1,  # state1
         chat0,  # chatbot_a
         chat1,  # chatbot_b
-
+        (
+            response0["content"] if isinstance(response0, dict) else response0
+        ),  # response_a
         "",  # response_b (empty)
         code0,  # code_a
         "",  # code_b (empty)
         sandbox_state0,  # sandbox_state0
-
+        (
+            state1.get("sandbox_state", create_sandbox_state())
+            if state1
+            else create_sandbox_state()
+        ),  # sandbox_state1
         sandbox_output0,  # sandbox_output0
         "",  # sandbox_output1 (empty)
         sandbox_component_update0,  # sandbox_component_update0
@@ -701,37 +844,37 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     """Send message to right model (Model B) only"""
     if not text.strip():
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Initialize states if needed
     if state0 is None:
         state0 = create_chat_state(model_a)
     if state1 is None:
         state1 = create_chat_state(model_b)
-
+
     # Add user message to right state only
     state1["messages"].append({"role": "user", "content": text})
     state1["generating"] = True
-
+
     # Generate response for right model only
     state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)
     state1["messages"].append({"role": "assistant", "content": response1["content"]})
     state1["has_output"] = True
     state1["generating"] = False
-
+
     # Format chat history for display
     chat0 = format_chat_history(state0["messages"]) if state0 else []
     chat1 = format_chat_history(state1["messages"])
-
+
     # Extract code from response for sandbox
     sandbox_state1 = state1.get("sandbox_state", create_sandbox_state())
     sandbox_state1, code1, env1 = extract_and_execute_code(response1["content"], sandbox_state1)
     state1["sandbox_state"] = sandbox_state1
-
+
     # Clear previous sandbox outputs
     sandbox_output1 = ""
     sandbox_component_update1 = gr.update(value=("", False, []), visible=False)
     sandbox_view_b = ""
-
+
     # Run sandbox execution if there's code
     if code1.strip():
         install_command1 = sandbox_state1.get('install_command', "")
@@ -746,27 +889,32 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
             sandbox_view_b += f"# Output\n{sandbox_output1}"
         if sandbox_error1:
             sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
-
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
     turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]])
-
     chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages']) if state0 else 0}"
     chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages'])}"
-
     # Don't show vote buttons since only one model responded
     show_vote_buttons = False
-
     return (
         state0,  # state0
         state1,  # state1
         chat0,  # chatbot_a
         chat1,  # chatbot_b
         "",  # response_a (empty)
-
         "",  # code_a (empty)
         code1,  # code_b
-
         sandbox_state1,  # sandbox_state1
         "",  # sandbox_output0 (empty)
         sandbox_output1,  # sandbox_output1
@@ -791,90 +939,6 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     )
 
 
-def handle_vote(state0, state1, vote_type):
-    """Handle vote submission"""
-    if (
-        not state0
-        or not state1
-        or not state0.get("has_output")
-        or not state1.get("has_output")
-    ):
-        return (
-            "No output to vote on!",
-            gr.update(),
-            "**Last Updated:** No data available",
-        )
-
-    # Get all user messages and the last responses
-    user_messages = []
-    response_a = ""
-    response_b = ""
-
-    # Collect all user messages from the conversation
-    for msg in state0["messages"]:
-        if msg["role"] == "user":
-            user_messages.append(msg["content"])
-
-    for msg in reversed(state0["messages"]):
-        if msg["role"] == "assistant":
-            response_a = msg["content"]
-            break
-
-    for msg in reversed(state1["messages"]):
-        if msg["role"] == "assistant":
-            response_b = msg["content"]
-            break
-
-    # Get interactions and full conversation history for remote dataset saving
-    interactions_a = state0.get("interactions", [])
-    interactions_b = state1.get("interactions", [])
-
-    # Get full conversation history for both models
-    conversation_a = state0.get("messages", [])
-    conversation_b = state1.get("messages", [])
-
-    # Save vote with full conversation history to remote dataset in background (async)
-    import threading
-    def save_vote_background():
-        try:
-            success, message = save_vote_to_hf(
-                state0["model_name"],
-                state1["model_name"],
-                user_messages[0],
-                response_a,
-                response_b,
-                vote_type,
-                interactions_a,
-                interactions_b,
-                conversation_a,
-                conversation_b,
-            )
-
-        except Exception as e:
-            print(f"Error saving vote: {str(e)}")
-            pass
-
-    print("Saving vote in background...")
-    # Start background upload thread
-    upload_thread = threading.Thread(target=save_vote_background)
-    upload_thread.daemon = True
-    upload_thread.start()
-
-    # Return immediately without waiting for upload
-    success = True  # Assume success for immediate UI response
-    message = "Vote recorded! Uploading data in background..."
-
-    if success:
-        # Return immediately without waiting for ranking refresh
-        return (
-            message + " Clearing conversation...",
-            gr.update(),  # Keep existing ranking table
-            "**Last Updated:** Processing in background...",
-        )
-    else:
-        return message, gr.update(), "**Last Updated:** Error occurred"
-
-
 def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
     """Run code in the appropriate sandbox environment"""
     if not code.strip():
@@ -886,7 +950,6 @@ def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
 
     # Determine environment
     env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
-    print(f"DEBUG: env: {env}")
     try:
         if env == SandboxEnvironment.HTML:
             sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
@@ -979,227 +1042,28 @@ async def run_sandbox_code_async(sandbox_state: dict, code: str, install_command):
 async def run_sandboxes_parallel(sandbox_state0, code0, install_command0, sandbox_state1, code1, install_command1):
     """Run both sandbox executions in parallel with error handling"""
     loop = asyncio.get_event_loop()
-
     # Create tasks for both sandbox executions
     task0 = loop.run_in_executor(None, run_sandbox_code, sandbox_state0, code0, install_command0)
     task1 = loop.run_in_executor(None, run_sandbox_code, sandbox_state1, code1, install_command1)
-
     # Wait for both to complete with error handling
     try:
         result0, result1 = await asyncio.gather(task0, task1, return_exceptions=True)
-
         # Handle exceptions
         if isinstance(result0, Exception):
             result0 = ("", "", f"Sandbox execution error: {str(result0)}")
-
         if isinstance(result1, Exception):
             result1 = ("", "", f"Sandbox execution error: {str(result1)}")
-
     except Exception as e:
         # Fallback to sequential processing
         result0 = run_sandbox_code(sandbox_state0, code0, install_command0)
         result1 = run_sandbox_code(sandbox_state1, code1, install_command1)
-
-    return result0, result1
-
-
-def serialize_interactions(interactions):
-    """Convert datetime objects in interactions to ISO format strings"""
-    if not interactions:
-        return interactions
-
-    serialized = []
-    for interaction in interactions:
-        # Handle case where interaction might be a list instead of a dict
-        if isinstance(interaction, list):
-            # If it's a list, recursively serialize each item
-            serialized.append(serialize_interactions(interaction))
-        elif isinstance(interaction, dict):
-            # If it's a dict, serialize it normally
-            serialized_interaction = {}
-            for key, value in interaction.items():
-                if isinstance(value, datetime.datetime):
-                    serialized_interaction[key] = value.isoformat()
-                else:
-                    serialized_interaction[key] = value
-            serialized.append(serialized_interaction)
-        else:
-            # If it's neither list nor dict, just add it as is
-            serialized.append(interaction)
-    return serialized
-
-
-def save_vote_to_hf(
-    model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
-):
-    """Save vote result to HuggingFace dataset with full conversation history"""
-    try:
-        # Use global token if not provided
-        token = hf_token or HF_TOKEN
-        if not token:
-            return False, "HuggingFace token not found in environment (HF_TOKEN)"
-
-        if not HF_DATASET_NAME:
-            return False, "HuggingFace dataset name not found in environment (HF_DATASET_NAME)"
-
-
-        # Serialize conversations for JSON compatibility
-        serialized_conversation_a = serialize_interactions(conversation_a or [])
-        serialized_conversation_b = serialize_interactions(conversation_b or [])
-
-        # Organize interactions by turns - each turn contains a list of interactions
-        def organize_interactions_by_turns(interactions, conversation):
-            """Organize interactions by conversation turns"""
-            if not interactions:
-                return []
-
-            # For now, put all interactions in a single turn
-            # This can be enhanced later to properly group by conversation turns
-            # when we have more context about how interactions are timestamped
-            return interactions if interactions else []
-
-        # Organize interactions by turns for both models
-        action_a = organize_interactions_by_turns(interactions_a or [], conversation_a or [])
-        action_b = organize_interactions_by_turns(interactions_b or [], conversation_b or [])
-
-        # Serialize actions for JSON compatibility
-        serialized_action_a = serialize_interactions(action_a)
-        serialized_action_b = serialize_interactions(action_b)
-
-        # Create vote data with full conversation history and actions organized by turns
-        # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
-        # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
-        vote_data = {
-            "timestamp": datetime.datetime.now().isoformat(),
-            "model_a": model_a,
-            "model_b": model_b,
-            "initial_prompt": prompt,  # Convert list to single string
-            "action_a": serialized_action_a,  # Actions organized by turns for model A
-            "action_b": serialized_action_b,  # Actions organized by turns for model B
-            "conversation_a": serialized_conversation_a,  # Full conversation history for model A
-            "conversation_b": serialized_conversation_b,  # Full conversation history for model B
-            "vote": vote_result,  # "left", "right", "tie", "both_bad"
-        }
-
-        # Try to load existing dataset or create new one
-        try:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
-            # Convert to pandas DataFrame - handle both Dataset and DatasetDict
-            if hasattr(dataset, "to_pandas"):
-                df = dataset.to_pandas()
-            else:
-                df = pd.DataFrame(dataset)
-            # Add new vote
-            new_df = pd.concat([df, pd.DataFrame([vote_data])], ignore_index=True)
-        except Exception as load_error:
-            # Create new dataset if it doesn't exist
-            new_df = pd.DataFrame([vote_data])
-
-
-        # Convert back to dataset and push
-        new_dataset = Dataset.from_pandas(new_df)
-        try:
-            new_dataset.push_to_hub(HF_DATASET_NAME, token=token)
-            return True, "Vote saved successfully!"
-        except Exception as upload_error:
-            return False, f"Error uploading to HuggingFace: {str(upload_error)}"
-    except Exception as e:
-        return False, f"Error saving vote: {str(e)}"
-
 
-
-    """Load and calculate ranking data from HuggingFace dataset"""
-    global ranking_data, ranking_last_updated
-
-    try:
-        # Use global token if not provided
-        token = hf_token or HF_TOKEN
-        if not token:
-            return pd.DataFrame()
-
-        # Load dataset - force download if requested
-        if force_reload:
-            # Force download from remote, ignore cache
-            dataset = load_dataset(
-                HF_DATASET_NAME,
-                split="train",
-                token=token,
-                download_mode="force_redownload",
-            )
-        else:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
-        # Convert to pandas DataFrame - handle both Dataset and DatasetDict
-        if hasattr(dataset, "to_pandas"):
-            df = dataset.to_pandas()
-        else:
-            df = pd.DataFrame(dataset)
-
-        if df.empty:
-            return pd.DataFrame()
-
-        # Calculate rankings
-        model_stats = {}
-
-        for _, row in df.iterrows():
-            model_a = row["model_a"]
-            model_b = row["model_b"]
-            vote = row["vote"]
-
-            # Initialize models if not exists
-            if model_a not in model_stats:
-                model_stats[model_a] = {"wins": 0, "losses": 0, "ties": 0, "total": 0}
-            if model_b not in model_stats:
-                model_stats[model_b] = {"wins": 0, "losses": 0, "ties": 0, "total": 0}
-
-            # Update stats based on vote
-            if vote == "left":  # Model A wins
-                model_stats[model_a]["wins"] += 1
-                model_stats[model_b]["losses"] += 1
-            elif vote == "right":  # Model B wins
-                model_stats[model_b]["wins"] += 1
-                model_stats[model_a]["losses"] += 1
-            elif vote == "tie":
-                model_stats[model_a]["ties"] += 1
-                model_stats[model_b]["ties"] += 1
-            # both_bad doesn't count as win/loss for either
-
-            model_stats[model_a]["total"] += 1
-            model_stats[model_b]["total"] += 1
-
-        # Convert to DataFrame and calculate win rate
-        ranking_list = []
-        for model, stats in model_stats.items():
-            win_rate = (
-                (stats["wins"] + stats["ties"]) / max(stats["total"], 1) * 100
-            )
-            ranking_list.append(
-                {
-                    "Model": model,
-                    "Win Rate (%)": round(win_rate, 1),
-                    "Wins": stats["wins"],
-                    "Losses": stats["losses"],
-                    "Ties": stats["ties"],
-                    "Total Battles": stats["total"],
-                }
-            )
-
-        # Sort by win rate
-        ranking_df = pd.DataFrame(ranking_list).sort_values(
-            "Win Rate (%)", ascending=False
-        )
-        ranking_df["Rank"] = range(1, len(ranking_df) + 1)
-
-        # Reorder columns
-        ranking_df = ranking_df[
-            ["Rank", "Model", "Win Rate (%)", "Wins", "Losses", "Ties", "Total Battles"]
-        ]
-
-        ranking_data = ranking_df
-        ranking_last_updated = datetime.datetime.now()
-
-        return ranking_df
-    except Exception as e:
-        return pd.DataFrame()
 
 
 def instantiate_send_button():
@@ -1262,7 +1126,7 @@ def build_ui():
 
     # Get random models for this session
     model_a, model_b = get_random_models()
-
+
     with gr.Blocks(title="BigCodeArena", theme=gr.themes.Soft()) as demo:
         # Add custom CSS for centering and button styling
         demo.css = """
@@ -1296,7 +1160,7 @@ def build_ui():
             min-width: 60px;
         }
         """
-
+
         gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
 
         # Main tabs
@@ -1361,25 +1225,15 @@ def build_ui():
                 interactive=False,
             )
 
-            # Vote
-
-
-
-
-
-
-
-
-            )
-            vote_both_bad_btn = gr.Button(
-                "👎 Both are Bad", variant="secondary", size="lg"
-            )
-            vote_right_btn = gr.Button(
-                "👉 B is Better", variant="primary", size="lg"
-            )
-
-            # Vote status message
-            vote_status = gr.Markdown("", visible=False)
 
             # Main chat interface - Collapsible and hidden by default
             with gr.Accordion("💬 Chat Interface", open=False):
@@ -1419,7 +1273,7 @@ def build_ui():
                 with gr.Row():
                     send_left_btn = instantiate_send_left_button()
                     send_right_btn = instantiate_send_right_button()
-
+
                 # Additional control buttons
                 with gr.Row():
                     clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
@@ -1568,38 +1422,7 @@ def build_ui():
                 inputs=[text_input],
             )
             # Ranking Tab
-
-            gr.Markdown("## 🏆 Model Leaderboard")
-            gr.Markdown("*Rankings auto-refresh every 10 minutes*")
-
-            ranking_table = gr.Dataframe(
-                headers=[
-                    "Rank",
-                    "Model",
-                    "Win Rate (%)",
-                    "Wins",
-                    "Losses",
-                    "Ties",
-                    "Total Battles",
-                ],
-                datatype=[
-                    "number",
-                    "str",
-                    "number",
-                    "number",
-                    "number",
-                    "number",
-                    "number",
-                ],
-                label="Model Rankings",
-                interactive=False,
-                wrap=True,
-            )
-
-            ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
-
-            # Timer for auto-refresh every 10 minutes
-            ranking_timer = gr.Timer(value=600.0, active=True)
 
         # Event handlers
         # Create state variables for the run buttons
@@ -1620,7 +1443,7 @@ def build_ui():
                 state0["interactions"].extend(interactions)
                 return log_sandbox_telemetry_gradio_fn(state0["sandbox_state"], sandbox_ui)
             return None
-
+
         def log_telemetry_b(state1, sandbox_ui):
             if state1 and "sandbox_state" in state1:
                 # Print user interactions for debugging
@@ -1633,7 +1456,7 @@ def build_ui():
                 state1["interactions"].extend(interactions)
                 return log_sandbox_telemetry_gradio_fn(state1["sandbox_state"], sandbox_ui)
            return None
-
+
        sandbox_component_a.change(
            fn=log_telemetry_a,
            inputs=[state0_var, sandbox_component_a],
@@ -1649,24 +1472,17 @@ def build_ui():
 
         # Create a wrapper function that handles both the main execution and state update
         def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
-
             # Hide vote buttons immediately when generation starts
             initial_vote_visibility = False
-
             # Call the main function
             result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
             # Extract the state from the result
             new_state0, new_state1 = result[0], result[1]
 
             # Check if both models have output and are not generating to show vote buttons
-            show_vote_buttons = (
-                new_state0
-                and new_state0.get("has_output", False)
-                and not new_state0.get("generating", False)
-                and new_state1
-                and new_state1.get("has_output", False)
-                and not new_state1.get("generating", False)
-            )
 
             # Return all the original outputs plus the updated state for run buttons
             # Make sure all outputs are properly formatted for their expected types
@@ -1675,8 +1491,12 @@ def build_ui():
                 new_state1,  # state1
                 result[2],  # chatbot_a (chat0)
                 result[3],  # chatbot_b (chat1)
-
-
                 result[6],  # code_a (code0)
                 result[7],  # code_b (code1)
                 result[10] if len(result) > 10 else "",  # sandbox_state0
@@ -2035,13 +1855,12 @@ def build_ui():
             ],
         )
 
-        #
-        def
-        #
             message, ranking_update, last_update = handle_vote(
                 state0, state1, vote_type
             )
-
             # Get the model names from the current session
             model_a = state0["model_name"] if state0 else "Unknown"
             model_b = state1["model_name"] if state1 else "Unknown"
@@ -2057,23 +1876,23 @@ def build_ui():
 
             # Clear everything and start fresh immediately, but preserve examples
             return (
-
-
-
-
-
-
-
-
-
-
-
-                gr.update(
-                gr.update(
-
-
-
-
                 gr.update(visible=False),  # Hide vote_section
                 gr.update(visible=False),  # Hide vote_buttons_row
                 None,  # Reset state0_var
@@ -2095,8 +1914,8 @@ def build_ui():
             (vote_both_bad_btn, "both_bad"),
         ]:
             vote_btn.click(
-                fn=
-                inputs=[state0_var, state1_var, gr.State(vote_type)],
                 outputs=[
                     vote_status,  # vote status message
                     state0_var,  # state0
@@ -2129,45 +1948,8 @@ def build_ui():
             ],
         )
 
-        #
-
-            df = load_ranking_data()
-            if df.empty:
-                return gr.update(value=df), "**Last Updated:** No data available"
-
-            last_update = (
-                ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
-                if ranking_last_updated
-                else "Unknown"
-            )
-            return gr.update(value=df), f"**Last Updated:** {last_update}"
-
-        def force_update_ranking_display():
-            """Force update ranking data from HuggingFace (for timer)"""
-            df = load_ranking_data(force_reload=True)
-            if df.empty:
-                return gr.update(value=df), "**Last Updated:** No data available"
-
-            last_update = (
-                ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
-                if ranking_last_updated
-                else "Unknown"
-            )
-            return gr.update(value=df), f"**Last Updated:** {last_update}"
-
-        # Timer tick handler for auto-refresh with force reload
-        ranking_timer.tick(
-            fn=force_update_ranking_display,
-            inputs=[],
-            outputs=[ranking_table, ranking_last_update],
-        )
-
-        # Auto-load ranking on startup
-        demo.load(
-            fn=update_ranking_display,
-            inputs=[],
-            outputs=[ranking_table, ranking_last_update],
-        )
 
     return demo
|
| 891 |
sandbox_view_b = f"<details closed><summary><strong>π¨ Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
|
|
|
|
| 892 |
# Calculate conversation statistics
|
| 893 |
turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
|
| 894 |
turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]])
|
| 895 |
+
|
| 896 |
chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages']) if state0 else 0}"
|
| 897 |
chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages'])}"
|
| 898 |
+
|
| 899 |
# Don't show vote buttons since only one model responded
|
| 900 |
show_vote_buttons = False
|
| 901 |
+
|
| 902 |
return (
|
| 903 |
state0, # state0
|
| 904 |
state1, # state1
|
| 905 |
chat0, # chatbot_a
|
| 906 |
chat1, # chatbot_b
|
| 907 |
"", # response_a (empty)
|
| 908 |
+
(
|
| 909 |
+
response1["content"] if isinstance(response1, dict) else response1
|
| 910 |
+
), # response_b
|
| 911 |
"", # code_a (empty)
|
| 912 |
code1, # code_b
|
| 913 |
+
(
|
| 914 |
+
state0.get("sandbox_state", create_sandbox_state())
|
| 915 |
+
if state0
|
| 916 |
+
else create_sandbox_state()
|
| 917 |
+
), # sandbox_state0
|
| 918 |
sandbox_state1, # sandbox_state1
|
| 919 |
"", # sandbox_output0 (empty)
|
| 920 |
sandbox_output1, # sandbox_output1
|
|
|
|
| 939 |
)
|
| 940 |
|
| 941 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
|
| 943 |
"""Run code in the appropriate sandbox environment"""
|
| 944 |
if not code.strip():
|
|
|
|
| 950 |
|
| 951 |
# Determine environment
|
| 952 |
env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
|
|
|
|
| 953 |
try:
|
| 954 |
if env == SandboxEnvironment.HTML:
|
| 955 |
sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
|
|
|
|
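The regenerate path above works by popping the trailing assistant/user pair before re-sending the same prompt. A minimal standalone sketch of that pattern follows; the helper name pop_last_turn is hypothetical, and the only assumption is the message-list shape used throughout the diff:

import copy

def pop_last_turn(messages: list[dict]) -> list[dict]:
    # Drop the trailing assistant reply and the user message that produced it,
    # so the same prompt can be re-sent for a fresh generation.
    if messages and messages[-1]["role"] == "assistant":
        messages.pop()
    if messages and messages[-1]["role"] == "user":
        messages.pop()
    return messages

history = [
    {"role": "user", "content": "write fizzbuzz"},
    {"role": "assistant", "content": "def fizzbuzz(): ..."},
]
print(pop_last_turn(copy.deepcopy(history)))  # -> []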
@@ ... @@
async def run_sandboxes_parallel(sandbox_state0, code0, install_command0, sandbox_state1, code1, install_command1):
    """Run both sandbox executions in parallel with error handling"""
    loop = asyncio.get_event_loop()
+
    # Create tasks for both sandbox executions
    task0 = loop.run_in_executor(None, run_sandbox_code, sandbox_state0, code0, install_command0)
    task1 = loop.run_in_executor(None, run_sandbox_code, sandbox_state1, code1, install_command1)
+
    # Wait for both to complete with error handling
    try:
        result0, result1 = await asyncio.gather(task0, task1, return_exceptions=True)
+
        # Handle exceptions
        if isinstance(result0, Exception):
            result0 = ("", "", f"Sandbox execution error: {str(result0)}")
+
        if isinstance(result1, Exception):
            result1 = ("", "", f"Sandbox execution error: {str(result1)}")
+
    except Exception as e:
        # Fallback to sequential processing
        result0 = run_sandbox_code(sandbox_state0, code0, install_command0)
        result1 = run_sandbox_code(sandbox_state1, code1, install_command1)

+   return result0, result1
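The function above offloads two blocking sandbox runs onto the default thread pool and gathers them, with return_exceptions=True so one failure cannot cancel the other. A self-contained sketch of the same pattern (the worker body is a stand-in; get_running_loop is used here instead of the diff's get_event_loop, which behaves the same inside a running coroutine):

import asyncio
import time

def run_one(name: str) -> tuple[str, str, str]:
    # Stand-in for a blocking sandbox execution
    time.sleep(0.1)
    return (f"{name}-url", f"{name}-id", "")

async def run_both():
    loop = asyncio.get_running_loop()
    t0 = loop.run_in_executor(None, run_one, "a")
    t1 = loop.run_in_executor(None, run_one, "b")
    # return_exceptions=True keeps one failure from taking down the other task
    r0, r1 = await asyncio.gather(t0, t1, return_exceptions=True)
    return r0, r1

print(asyncio.run(run_both()))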


def instantiate_send_button():
@@ ... @@

    # Get random models for this session
    model_a, model_b = get_random_models()
+   print(f"Model A: {model_a}, Model B: {model_b}")
    with gr.Blocks(title="BigCodeArena", theme=gr.themes.Soft()) as demo:
        # Add custom CSS for centering and button styling
        demo.css = """
@@ ... @@
            min-width: 60px;
        }
        """
+
        gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")

        # Main tabs
@@ ... @@
                interactive=False,
            )

+           # Vote UI components
+           vote_components = create_vote_ui()
+           vote_section = vote_components["vote_section"]
+           vote_buttons_row = vote_components["vote_buttons_row"]
+           vote_left_btn = vote_components["vote_left_btn"]
+           vote_right_btn = vote_components["vote_right_btn"]
+           vote_tie_btn = vote_components["vote_tie_btn"]
+           vote_both_bad_btn = vote_components["vote_both_bad_btn"]
+           vote_status = vote_components["vote_status"]

            # Main chat interface - Collapsible and hidden by default
            with gr.Accordion("💬 Chat Interface", open=False):
@@ ... @@
                with gr.Row():
                    send_left_btn = instantiate_send_left_button()
                    send_right_btn = instantiate_send_right_button()
+
                # Additional control buttons
                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
@@ ... @@
            inputs=[text_input],
        )
        # Ranking Tab
+       ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()

        # Event handlers
        # Create state variables for the run buttons
@@ ... @@
                state0["interactions"].extend(interactions)
            return log_sandbox_telemetry_gradio_fn(state0["sandbox_state"], sandbox_ui)
        return None
+
    def log_telemetry_b(state1, sandbox_ui):
        if state1 and "sandbox_state" in state1:
            # Print user interactions for debugging
@@ ... @@
                state1["interactions"].extend(interactions)
            return log_sandbox_telemetry_gradio_fn(state1["sandbox_state"], sandbox_ui)
        return None
+
    sandbox_component_a.change(
        fn=log_telemetry_a,
        inputs=[state0_var, sandbox_component_a],
@@ ... @@

    # Create a wrapper function that handles both the main execution and state update
    def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
+
        # Hide vote buttons immediately when generation starts
        initial_vote_visibility = False
+
        # Call the main function
        result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
        # Extract the state from the result
        new_state0, new_state1 = result[0], result[1]

        # Check if both models have output and are not generating to show vote buttons
+       show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)

        # Return all the original outputs plus the updated state for run buttons
        # Make sure all outputs are properly formatted for their expected types
@@ ... @@
            new_state1,  # state1
            result[2],  # chatbot_a (chat0)
            result[3],  # chatbot_b (chat1)
+           (
+               result[4]["content"] if isinstance(result[4], dict) else result[4]
+           ),  # response_a (response0)
+           (
+               result[5]["content"] if isinstance(result[5], dict) else result[5]
+           ),  # response_b (response1)
            result[6],  # code_a (code0)
            result[7],  # code_b (code1)
            result[10] if len(result) > 10 else "",  # sandbox_state0
@@ ... @@
        ],
    )

+   # Setup vote handlers
+   def process_vote(state0, state1, vote_type, current_text):
+       # Save the vote and get updates
        message, ranking_update, last_update = handle_vote(
            state0, state1, vote_type
        )
        # Get the model names from the current session
        model_a = state0["model_name"] if state0 else "Unknown"
        model_b = state1["model_name"] if state1 else "Unknown"
@@ ... @@

        # Clear everything and start fresh immediately, but preserve examples
        return (
+           message,  # vote status message
+           gr.update(),  # Keep state0 unchanged
+           gr.update(),  # Keep state1 unchanged
+           gr.update(),  # Keep chatbot_a unchanged
+           gr.update(),  # Keep chatbot_b unchanged
+           gr.update(),  # Keep response_a unchanged
+           gr.update(),  # Keep response_b unchanged
+           gr.update(),  # Keep code_a unchanged
+           gr.update(),  # Keep code_b unchanged
+           gr.update(),  # Keep sandbox_view_a unchanged
+           gr.update(),  # Keep sandbox_view_b unchanged
+           gr.update(),  # Keep sandbox_component_a unchanged
+           gr.update(),  # Keep sandbox_component_b unchanged
+           gr.update(),  # Keep chat_stats_a unchanged
+           gr.update(),  # Keep chat_stats_b unchanged
+           gr.update(),  # Keep model_display_a unchanged
+           gr.update(),  # Keep model_display_b unchanged
            gr.update(visible=False),  # Hide vote_section
            gr.update(visible=False),  # Hide vote_buttons_row
            None,  # Reset state0_var
@@ ... @@
        (vote_both_bad_btn, "both_bad"),
    ]:
        vote_btn.click(
+           fn=process_vote,
+           inputs=[state0_var, state1_var, gr.State(vote_type), text_input],
            outputs=[
                vote_status,  # vote status message
                state0_var,  # state0
@@ ... @@
        ],
    )

+   # Setup ranking handlers
+   setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer)

    return demo

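The vote wiring in app.py follows the usual Gradio Blocks pattern: per-session data in gr.State, a constant passed to the handler via an extra gr.State input, and gr.update() returned for components the handler leaves untouched. A minimal runnable sketch of that pattern (component names here are illustrative, not the app's actual wiring):

import gradio as gr

def vote(state, choice):
    # Record the choice, update the status text, and hide the button row
    state = (state or []) + [choice]
    return state, f"Recorded: {choice}", gr.update(visible=False)

with gr.Blocks() as demo:
    state = gr.State([])
    status = gr.Markdown("")
    with gr.Row() as row:
        left = gr.Button("A is Better")
        right = gr.Button("B is Better")
    for btn, tag in [(left, "left"), (right, "right")]:
        # gr.State(tag) passes a per-button constant into the shared handler
        btn.click(fn=vote, inputs=[state, gr.State(tag)], outputs=[state, status, row])

if __name__ == "__main__":
    demo.launch()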
elo_calculation.py
ADDED
@@ -0,0 +1,315 @@
+"""
+Elo Rating Calculation Module for BigCodeArena
+Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation
+"""
+
+import math
+import numpy as np
+import pandas as pd
+from collections import defaultdict
+from tqdm import tqdm
+from sklearn.linear_model import LogisticRegression
+import yaml
+import os
+
+
+def load_model_metadata():
+    """Load model metadata from api_config.yaml"""
+    try:
+        config_path = os.path.join(os.path.dirname(__file__), "api_config.yaml")
+        with open(config_path, "r", encoding="utf-8") as file:
+            config = yaml.safe_load(file)
+
+        metadata = {}
+        for model_key, model_config in config.items():
+            if isinstance(model_config, dict):
+                model_name = model_config.get("model", model_key)
+                metadata[model_name] = {
+                    "organization": model_config.get("organization", "Unknown"),
+                    "license": model_config.get("license", "Unknown"),
+                }
+                # Also store with the key name for lookup
+                metadata[model_key] = {
+                    "organization": model_config.get("organization", "Unknown"),
+                    "license": model_config.get("license", "Unknown"),
+                }
+
+        return metadata
+    except Exception as e:
+        print(f"Warning: Could not load model metadata: {e}")
+        return {}
+
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
+    """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
+
+    # Get all unique models
+    all_models = sorted(list(set(df["model_a"].tolist() + df["model_b"].tolist())))
+
+    # Create win matrices for each outcome type
+    # Initialize empty matrices with float dtype to avoid warnings
+    ptbl_a_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
+    ptbl_b_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
+    ptbl_tie = pd.DataFrame(0.0, index=all_models, columns=all_models)
+
+    # Count wins for model_a
+    model_a_wins = df[df["winner"] == "model_a"]
+    if not model_a_wins.empty:
+        a_win_counts = model_a_wins.groupby(["model_a", "model_b"]).size()
+        for (model_a, model_b), count in a_win_counts.items():
+            ptbl_a_win.loc[model_a, model_b] = count
+
+    # Count wins for model_b
+    model_b_wins = df[df["winner"] == "model_b"]
+    if not model_b_wins.empty:
+        b_win_counts = model_b_wins.groupby(["model_a", "model_b"]).size()
+        for (model_a, model_b), count in b_win_counts.items():
+            ptbl_b_win.loc[model_a, model_b] = count
+
+    # Count ties
+    ties = df[df["winner"].isin(["tie", "tie (bothbad)"])]
+    if not ties.empty:
+        tie_counts = ties.groupby(["model_a", "model_b"]).size()
+        for (model_a, model_b), count in tie_counts.items():
+            # For ties, we count 0.5 win for each model
+            ptbl_tie.loc[model_a, model_b] = count * 0.5
+            ptbl_tie.loc[model_b, model_a] = count * 0.5
+
+    models = pd.Series(np.arange(len(all_models)), index=all_models)
+    p = len(models)
+
+    # Create training data for logistic regression
+    X = []
+    Y = []
+    sample_weights = []
+
+    for model_a in all_models:
+        for model_b in all_models:
+            if model_a == model_b:
+                continue
+
+            # Count total games between these models
+            a_wins = ptbl_a_win.loc[model_a, model_b]
+            b_wins = ptbl_b_win.loc[model_a, model_b]
+            ties = ptbl_tie.loc[model_a, model_b]
+
+            total_games = a_wins + b_wins + ties
+            if total_games == 0:
+                continue
+
+            # Create feature vector: difference in model strengths
+            x = np.zeros(p)
+            x[models[model_a]] = 1.0
+            x[models[model_b]] = -1.0
+
+            # Add data points for model_a wins
+            if a_wins > 0:
+                X.append(x)
+                Y.append(1)  # model_a wins
+                sample_weights.append(a_wins)
+
+            # Add data points for model_b wins (model_a loses)
+            if b_wins > 0:
+                X.append(x)  # same feature vector
+                Y.append(0)  # model_a loses
+                sample_weights.append(b_wins)
+
+            # Add data points for ties - treat as half wins for model_a
+            if ties > 0:
+                # Add ties as both wins and losses with half weight each
+                X.append(x)
+                Y.append(1)  # model_a wins (tie counted as win)
+                sample_weights.append(ties / 2)
+
+                X.append(x)
+                Y.append(0)  # model_a loses (tie counted as loss)
+                sample_weights.append(ties / 2)
+
+    if len(X) == 0 or len(set(Y)) < 2:
+        # Not enough data or no variation in outcomes
+        return pd.Series({model: INIT_RATING for model in all_models}).sort_values(ascending=False)
+
+    X = np.array(X)
+    Y = np.array(Y)
+    sample_weights = np.array(sample_weights)
+
+    # Fit logistic regression
+    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6, max_iter=1000)
+    lr.fit(X, Y, sample_weight=sample_weights)
+
+    # Convert coefficients to Elo ratings
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
+    """Get bootstrap results for confidence interval calculation"""
+
+    rows = []
+    for i in tqdm(range(num_round), desc="bootstrap"):
+        # Bootstrap sample with replacement
+        bootstrap_sample = battles.sample(frac=1.0, replace=True)
+        try:
+            elo_result = func_compute_elo(bootstrap_sample)
+            rows.append(elo_result)
+        except Exception as e:
+            # Skip failed bootstrap samples
+            continue
+
+    if not rows:
+        return pd.DataFrame()
+
+    df = pd.DataFrame(rows)
+    # Sort columns by median Elo score (descending)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
+    """Compute Elo ratings for models based on battle results (legacy function for compatibility)"""
+    rating = defaultdict(lambda: INIT_RATING)
+
+    for rd, model_a, model_b, winner in battles[
+        ["model_a", "model_b", "winner"]
+    ].itertuples():
+        ra = rating[model_a]
+        rb = rating[model_b]
+        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
+        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
+        if winner == "model_a":
+            sa = 1
+        elif winner == "model_b":
+            sa = 0
+        elif winner == "tie" or winner == "tie (bothbad)":
+            sa = 0.5
+        else:
+            raise Exception(f"unexpected vote {winner}")
+        rating[model_a] += K * (sa - ea)
+        rating[model_b] += K * (1 - sa - eb)
+
+    # calibrate llama-13b to 800 if it exists
+    if "llama-13b" in rating:
+        delta = 800 - rating["llama-13b"]
+        for model in battles["model_a"].unique():
+            rating[model] += delta
+
+    return rating
+
+
+def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
+    """
+    Main function to calculate Elo ratings with confidence intervals
+
+    Args:
+        battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
+        vote_counts (dict): Dictionary with vote counts for each model
+
+    Returns:
+        tuple: (elo_ratings, confidence_intervals)
+    """
+    confidence_intervals = {}  # Initialize to avoid uninitialized variable error
+
+    # Check if we have sufficient data for Bradley-Terry model
+    if len(battles_df) < 2:
+        # Not enough battles, use default ratings
+        all_models = set(
+            battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
+        )
+        elo_ratings = pd.Series({model: 1000 for model in all_models})
+        confidence_intervals = {model: 0 for model in all_models}
+    else:
+        try:
+            # Use the new Bradley-Terry Model
+            elo_ratings = compute_mle_elo(battles_df)
+
+            # Calculate confidence intervals using bootstrap
+            if len(battles_df) >= 10:  # Only calculate CI if we have enough data
+                try:
+                    bootstrap_df = get_bootstrap_result(
+                        battles_df, compute_mle_elo, num_round=100
+                    )
+
+                    # Calculate 95% confidence intervals
+                    if not bootstrap_df.empty:
+                        for model in bootstrap_df.columns:
+                            scores = bootstrap_df[model].dropna()
+                            if len(scores) > 0:
+                                lower = scores.quantile(0.025)
+                                upper = scores.quantile(0.975)
+                                median_score = scores.median()
+                                ci_margin = (upper - lower) / 2
+                                confidence_intervals[model] = ci_margin
+                            else:
+                                confidence_intervals[model] = 0
+                    else:
+                        # Fallback: no confidence intervals
+                        for model in elo_ratings.index:
+                            confidence_intervals[model] = 0
+                except Exception as bootstrap_error:
+                    print(
+                        f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
+                    )
+                    for model in elo_ratings.index:
+                        confidence_intervals[model] = 0
+            else:
+                # Not enough data for bootstrap, set CI to 0
+                for model in elo_ratings.index:
+                    confidence_intervals[model] = 0
+        except Exception as e:
+            # Fallback to old method if Bradley-Terry fails
+            print(
+                f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
+            )
+            old_elo_ratings = compute_online_elo(battles_df)
+            elo_ratings = pd.Series(old_elo_ratings)
+            confidence_intervals = {model: 0 for model in elo_ratings.index}
+    return elo_ratings, confidence_intervals
+
+
+def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
+    """
+    Create ranking DataFrame with all necessary columns
+
+    Args:
+        elo_ratings (pd.Series): Elo ratings for each model
+        confidence_intervals (dict): Confidence interval margins for each model
+        vote_counts (dict): Vote counts for each model
+
+    Returns:
+        pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
+    """
+    # Load model metadata
+    metadata = load_model_metadata()
+
+    # Create ranking list with Elo ratings and confidence intervals
+    ranking_list = []
+    for model in elo_ratings.index:
+        ci_margin = confidence_intervals.get(model, 0)
+
+        # Get metadata for this model
+        model_metadata = metadata.get(model, {})
+        organization = model_metadata.get("organization", "Unknown")
+        license_type = model_metadata.get("license", "Unknown")
+
+        ranking_list.append(
+            {
+                "Model": model,
+                "Score": round(elo_ratings[model], 1),
+                "95% CI (±)": round(ci_margin, 1) if ci_margin > 0 else "-",
+                "Votes": vote_counts[model],
+                "Organization": organization,
+                "License": license_type,
+            }
+        )
+
+    # Sort by Elo rating (highest first)
+    ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
+    ranking_df["Rank"] = range(1, len(ranking_df) + 1)
+
+    # Reorder columns
+    ranking_df = ranking_df[
+        ["Rank", "Model", "Score", "95% CI (±)", "Votes", "Organization", "License"]
+    ]
+
+    return ranking_df
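For a concrete sense of what compute_mle_elo returns, here is a small sketch on an invented battle log (model names and outcomes are made up; it assumes the module above is importable as elo_calculation):

import pandas as pd
from elo_calculation import compute_mle_elo

# Toy battle log: model-x beats model-y twice, loses once, ties once
battles = pd.DataFrame(
    [
        {"model_a": "model-x", "model_b": "model-y", "winner": "model_a"},
        {"model_a": "model-x", "model_b": "model-y", "winner": "model_a"},
        {"model_a": "model-y", "model_b": "model-x", "winner": "model_a"},
        {"model_a": "model-x", "model_b": "model-y", "winner": "tie"},
    ]
)

ratings = compute_mle_elo(battles)
print(ratings)  # model-x should land above the 1000 baseline, model-y below

Note that penalty=None in the fit requires scikit-learn 1.2 or newer, which is presumably why scikit-learn is added to requirements.txt below.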
ranking.py
ADDED
@@ -0,0 +1,199 @@
+"""
+Ranking module for BigCodeArena
+Handles model leaderboard functionality and data management
+"""
+
+import gradio as gr
+import pandas as pd
+import datetime
+import os
+from collections import defaultdict
+from datasets import Dataset, load_dataset
+
+# Import Elo calculation utilities
+from elo_calculation import (
+    calculate_elo_with_confidence_intervals,
+    create_ranking_dataframe,
+)
+
+# HuggingFace dataset configuration
+HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
+HF_TOKEN = os.getenv("HF_TOKEN")
+REFRESH_TIME = os.getenv("REFRESH_TIME") or 60*60*12  # 12 hours by default
+
+# Global ranking data cache
+ranking_data = None
+ranking_last_updated = None
+
+
+def load_ranking_data(hf_token=None, force_reload=False):
+    """Load and calculate ranking data from HuggingFace dataset"""
+    global ranking_data, ranking_last_updated
+
+    try:
+        # Use global token if not provided
+        token = hf_token or HF_TOKEN
+
+        if not token:
+            return pd.DataFrame()
+
+        if not HF_DATASET_NAME:
+            return pd.DataFrame()
+
+        # Load dataset - force download if requested
+        if force_reload:
+            # Force download from remote, ignore cache
+            dataset = load_dataset(
+                HF_DATASET_NAME,
+                split="train",
+                token=token,
+                download_mode="force_redownload",
+            )
+        else:
+            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+        # Convert to pandas DataFrame - handle both Dataset and DatasetDict
+        if hasattr(dataset, "to_pandas"):
+            df = dataset.to_pandas()
+        else:
+            df = pd.DataFrame(dataset)
+
+        if df.empty:
+            return pd.DataFrame()
+
+        # Convert vote format for Elo calculation and count votes
+        battle_data = []
+        vote_counts = defaultdict(int)
+
+        for _, row in df.iterrows():
+            model_a = row["model_a"]
+            model_b = row["model_b"]
+            vote = row["vote"]
+
+            # Convert vote to winner format for Elo
+            if vote == "left":  # Model A wins
+                winner = "model_a"
+            elif vote == "right":  # Model B wins
+                winner = "model_b"
+            elif vote == "tie":
+                winner = "tie"
+            elif vote == "both_bad":
+                winner = "tie (bothbad)"
+            else:
+                continue  # Skip invalid votes
+
+            battle_data.append(
+                {"model_a": model_a, "model_b": model_b, "winner": winner}
+            )
+
+            # Count votes for each model
+            vote_counts[model_a] += 1
+            vote_counts[model_b] += 1
+
+        # Create DataFrame for Elo calculation
+        battles_df = pd.DataFrame(battle_data)
+
+        if battles_df.empty:
+            return pd.DataFrame()
+
+
+        # Calculate Elo ratings using Bradley-Terry Model with confidence intervals
+        elo_ratings, confidence_intervals = calculate_elo_with_confidence_intervals(
+            battles_df, vote_counts
+        )
+
+        # Create ranking DataFrame
+        ranking_df = create_ranking_dataframe(
+            elo_ratings, confidence_intervals, vote_counts
+        )
+
+        ranking_data = ranking_df
+        ranking_last_updated = datetime.datetime.now()
+
+        return ranking_df
+    except Exception as e:
+        return pd.DataFrame()
+
+
+def update_ranking_display():
+    """Update ranking display with current data"""
+    df = load_ranking_data()
+    if df.empty:
+        return gr.update(value=df), "**Last Updated:** No data available"
+
+    last_update = (
+        ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
+        if ranking_last_updated
+        else "Unknown"
+    )
+    return gr.update(value=df), f"**Last Updated:** {last_update}"
+
+
+def force_update_ranking_display():
+    """Force update ranking data from HuggingFace (for timer)"""
+    df = load_ranking_data(force_reload=True)
+    if df.empty:
+        return gr.update(value=df), "**Last Updated:** No data available"
+
+    last_update = (
+        ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
+        if ranking_last_updated
+        else "Unknown"
+    )
+    return gr.update(value=df), f"**Last Updated:** {last_update}"
+
+
+def create_ranking_tab():
+    """Create the ranking tab UI component"""
+    with gr.Tab("🏆 Ranking", id="ranking"):
+        gr.Markdown("## 🏆 Model Leaderboard")
+
+        ranking_table = gr.Dataframe(
+            headers=[
+                "Rank",
+                "Model",
+                "Score",
+                "95% CI (±)",
+                "Votes",
+                "Organization",
+                "License",
+            ],
+            datatype=[
+                "number",
+                "str",
+                "number",
+                "str",
+                "number",
+                "str",
+                "str",
+            ],
+            label="Model Rankings",
+            interactive=False,
+            wrap=True,
+        )
+
+        ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
+
+        # Timer for auto-refresh every REFRESH_TIME seconds
+        ranking_timer = gr.Timer(value=REFRESH_TIME, active=True)
+
+    return ranking_table, ranking_last_update, ranking_timer
+
+
+def setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer):
+    """Setup event handlers for ranking functionality"""
+
+    # Timer tick handler for auto-refresh with force reload
+    ranking_timer.tick(
+        fn=force_update_ranking_display,
+        inputs=[],
+        outputs=[ranking_table, ranking_last_update],
+    )
+
+    # Auto-load ranking on startup
+    demo.load(
+        fn=update_ranking_display,
+        inputs=[],
+        outputs=[ranking_table, ranking_last_update],
+    )
+
+    return ranking_table, ranking_last_update
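The vote-to-battle conversion inside load_ranking_data is easy to sanity-check outside of Gradio. A hedged standalone snippet (the vote labels match the schema used in this commit; the sample rows are invented):

import pandas as pd

VOTE_TO_WINNER = {
    "left": "model_a",
    "right": "model_b",
    "tie": "tie",
    "both_bad": "tie (bothbad)",
}

raw = pd.DataFrame([
    {"model_a": "m1", "model_b": "m2", "vote": "left"},
    {"model_a": "m2", "model_b": "m1", "vote": "both_bad"},
    {"model_a": "m1", "model_b": "m2", "vote": "bogus"},  # invalid, dropped
])

# Map votes to the winner format expected by the Elo code; unknown votes become NaN and are dropped
battles = raw.assign(winner=raw["vote"].map(VOTE_TO_WINNER)).dropna(subset=["winner"])
print(battles[["model_a", "model_b", "winner"]])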
requirements.txt
CHANGED
@@ -23,4 +23,5 @@ tree-sitter-c
e2b-code-interpreter==1.5.2
azure-storage-blob
huggingface_hub
-datasets
+datasets
+scikit-learn
sandbox/sandbox_manager.py
CHANGED
@@ -76,7 +76,7 @@ def run_command_in_sandbox(

    try:
        if "uv" in command:
-           command = "uv venv;" + command
+           command = "uv venv; source .venv/bin/activate;" + command
        command_result = sandbox.commands.run(
            cmd=command,
            cwd=working_directory,
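This one-line change presumably fixes commands that relied on the venv being active: `uv venv` only creates the environment, so without sourcing `.venv/bin/activate` a following `python ...` in the same shell would still resolve to the system interpreter. A hedged local illustration of the same prefixing logic (helper name is hypothetical):

def with_uv_env(command: str) -> str:
    # Mirror of the sandbox fix: create the venv AND activate it before the real command
    if "uv" in command:
        command = "uv venv; source .venv/bin/activate;" + command
    return command

print(with_uv_env("uv pip install requests && python app.py"))
# -> uv venv; source .venv/bin/activate;uv pip install requests && python app.py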
voting.py
ADDED
@@ -0,0 +1,329 @@
+"""
+Voting module for BigCodeArena
+Handles vote submission, data management, and UI components
+"""
+
+import gradio as gr
+import pandas as pd
+import datetime
+import os
+import threading
+from datasets import Dataset, load_dataset
+
+
+# HuggingFace dataset configuration
+HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+
+def serialize_interactions(interactions):
+    """Convert datetime objects in interactions to ISO format strings"""
+    if not interactions:
+        return interactions
+
+    serialized = []
+    for interaction in interactions:
+        # Handle case where interaction might be a list instead of a dict
+        if isinstance(interaction, list):
+            # If it's a list, recursively serialize each item
+            serialized.append(serialize_interactions(interaction))
+        elif isinstance(interaction, dict):
+            # If it's a dict, serialize it normally
+            serialized_interaction = {}
+            for key, value in interaction.items():
+                if isinstance(value, datetime.datetime):
+                    serialized_interaction[key] = value.isoformat()
+                else:
+                    serialized_interaction[key] = value
+            serialized.append(serialized_interaction)
+        else:
+            # If it's neither list nor dict, just add it as is
+            serialized.append(interaction)
+    return serialized
+
+
+def save_vote_to_hf(
+    model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
+):
+    """Save vote result to HuggingFace dataset with full conversation history"""
+    try:
+        # Use global token if not provided
+        token = hf_token or HF_TOKEN
+        if not token:
+            return False, "HuggingFace token not found in environment (HF_TOKEN)"
+
+        if not HF_DATASET_NAME:
+            return False, "HuggingFace dataset name not found in environment (HF_DATASET_NAME)"
+
+        # Serialize conversations for JSON compatibility
+        serialized_conversation_a = serialize_interactions(conversation_a or [])
+        serialized_conversation_b = serialize_interactions(conversation_b or [])
+
+        # Organize interactions by turns - each turn contains a list of interactions
+        def organize_interactions_by_turns(interactions, conversation):
+            """Organize interactions by conversation turns"""
+            if not interactions:
+                return []
+
+            # For now, put all interactions in a single turn
+            # This can be enhanced later to properly group by conversation turns
+            # when we have more context about how interactions are timestamped
+            return interactions if interactions else []
+
+        # Organize interactions by turns for both models
+        action_a = organize_interactions_by_turns(interactions_a or [], conversation_a or [])
+        action_b = organize_interactions_by_turns(interactions_b or [], conversation_b or [])
+
+        # Serialize actions for JSON compatibility
+        serialized_action_a = serialize_interactions(action_a)
+        serialized_action_b = serialize_interactions(action_b)
+
+        # Create vote data with full conversation history and actions organized by turns
+        # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
+        # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
+        vote_data = {
+            "timestamp": datetime.datetime.now().isoformat(),
+            "model_a": model_a,
+            "model_b": model_b,
+            "initial_prompt": prompt,  # Convert list to single string
+            "action_a": serialized_action_a,  # Actions organized by turns for model A
+            "action_b": serialized_action_b,  # Actions organized by turns for model B
+            "conversation_a": serialized_conversation_a,  # Full conversation history for model A
+            "conversation_b": serialized_conversation_b,  # Full conversation history for model B
+            "vote": vote_result,  # "left", "right", "tie", "both_bad"
+        }
+
+        # Try to load existing dataset or create new one
+        try:
+            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+            # Convert to pandas DataFrame - handle both Dataset and DatasetDict
+            if hasattr(dataset, "to_pandas"):
+                df = dataset.to_pandas()
+            else:
+                df = pd.DataFrame(dataset)
+            # Add new vote
+            new_df = pd.concat([df, pd.DataFrame([vote_data])], ignore_index=True)
+        except Exception as load_error:
+            # Create new dataset if it doesn't exist
+            new_df = pd.DataFrame([vote_data])
+
+        # Convert back to dataset and push
+        new_dataset = Dataset.from_pandas(new_df)
+        try:
+            new_dataset.push_to_hub(HF_DATASET_NAME, token=token)
+            return True, "Vote saved successfully!"
+        except Exception as upload_error:
+            return False, f"Error uploading to HuggingFace: {str(upload_error)}"
+    except Exception as e:
+        return False, f"Error saving vote: {str(e)}"
+
+
+def handle_vote(state0, state1, vote_type):
+    """Handle vote submission"""
+    if (
+        not state0
+        or not state1
+        or not state0.get("has_output")
+        or not state1.get("has_output")
+    ):
+        return (
+            "No output to vote on!",
+            gr.update(),
+            "**Last Updated:** No data available",
+        )
+
+    # Get all user messages and the last responses
+    user_messages = []
+    response_a = ""
+    response_b = ""
+
+    # Collect all user messages from the conversation
+    for msg in state0["messages"]:
+        if msg["role"] == "user":
+            user_messages.append(msg["content"])
+
+    for msg in reversed(state0["messages"]):
+        if msg["role"] == "assistant":
+            response_a = msg["content"]
+            break
+
+    for msg in reversed(state1["messages"]):
+        if msg["role"] == "assistant":
+            response_b = msg["content"]
+            break
+
+    # Get interactions and full conversation history for remote dataset saving
+    interactions_a = state0.get("interactions", [])
+    interactions_b = state1.get("interactions", [])
+
+    # Get full conversation history for both models
+    conversation_a = state0.get("messages", [])
+    conversation_b = state1.get("messages", [])
+
+    # Save vote with full conversation history to remote dataset in background (async)
+    def save_vote_background():
+        try:
+            success, message = save_vote_to_hf(
+                state0["model_name"],
+                state1["model_name"],
+                user_messages[0],
+                response_a,
+                response_b,
+                vote_type,
+                interactions_a,
+                interactions_b,
+                conversation_a,
+                conversation_b,
+            )
+
+        except Exception as e:
+            print(f"Error saving vote: {str(e)}")
+            pass
+
+    print("Saving vote in background...")
+    # Start background upload thread
+    upload_thread = threading.Thread(target=save_vote_background)
+    upload_thread.daemon = True
+    upload_thread.start()
+
+    # Return immediately without waiting for upload
+    success = True  # Assume success for immediate UI response
+    message = "Vote recorded! Uploading data in background..."
+
+    if success:
+        # Return immediately without waiting for ranking refresh
+        return (
+            message + " Clearing conversation...",
+            gr.update(),  # Keep existing ranking table
+            "**Last Updated:** Processing in background...",
+        )
+    else:
+        return message, gr.update(), "**Last Updated:** Error occurred"
+
+
+def create_vote_ui():
+    """Create vote UI components"""
+    # Vote buttons section - only visible after output
+    with gr.Row(visible=False) as vote_section:
+        gr.Markdown("### 🗳️ Which response is better?")
+
+    with gr.Row(visible=False) as vote_buttons_row:
+        vote_left_btn = gr.Button(
+            "👈 A is Better", variant="primary", size="lg"
+        )
+        vote_tie_btn = gr.Button(
+            "🤝 It's a Tie", variant="secondary", size="lg"
+        )
+        vote_both_bad_btn = gr.Button(
+            "👎 Both are Bad", variant="secondary", size="lg"
+        )
+        vote_right_btn = gr.Button(
+            "👉 B is Better", variant="primary", size="lg"
+        )
+
+    # Vote status message
+    vote_status = gr.Markdown("", visible=False)
+
+    return {
+        'vote_section': vote_section,
+        'vote_buttons_row': vote_buttons_row,
+        'vote_left_btn': vote_left_btn,
+        'vote_right_btn': vote_right_btn,
+        'vote_tie_btn': vote_tie_btn,
+        'vote_both_bad_btn': vote_both_bad_btn,
+        'vote_status': vote_status
+    }
+
+
+def should_show_vote_buttons(state0, state1):
+    """Check if vote buttons should be shown"""
+    return (
+        state0
+        and state0.get("has_output", False)
+        and not state0.get("generating", False)
+        and state1
+        and state1.get("has_output", False)
+        and not state1.get("generating", False)
+    )
+
+
+def get_vote_ui_updates(show_buttons=False):
+    """Get UI updates for vote components"""
+    return {
+        'vote_section': gr.update(visible=show_buttons),
+        'vote_buttons_row': gr.update(visible=show_buttons),
+        'vote_status': gr.update(visible=False),
+        'vote_left_btn': gr.update(interactive=show_buttons),
+        'vote_right_btn': gr.update(interactive=show_buttons),
+        'vote_tie_btn': gr.update(interactive=show_buttons),
+        'vote_both_bad_btn': gr.update(interactive=show_buttons),
+    }
+
+
+def setup_vote_handlers(vote_components, state0_var, state1_var, text_input, ranking_table, ranking_last_update):
+    """Setup vote button event handlers"""
+
+    def process_vote(state0, state1, vote_type, current_text):
+        # Save the vote and get updates
+        message, ranking_update, last_update = handle_vote(
+            state0, state1, vote_type
+        )
+
+        # Show thank you message
+        gr.Info(
+            "Thank you for your vote! 🎉 Your feedback has been recorded.",
+            duration=5,
+        )
+
+        # Return only vote status, ranking updates and hide voting interface
+        return (
+            message,  # vote status message
+            gr.update(),  # Keep state0 unchanged
+            gr.update(),  # Keep state1 unchanged
+            gr.update(),  # Keep chatbot_a unchanged
+            gr.update(),  # Keep chatbot_b unchanged
+            gr.update(),  # Keep response_a unchanged
+            gr.update(),  # Keep response_b unchanged
+            gr.update(),  # Keep code_a unchanged
+            gr.update(),  # Keep code_b unchanged
+            gr.update(),  # Keep sandbox_view_a unchanged
+            gr.update(),  # Keep sandbox_view_b unchanged
+            gr.update(),  # Keep sandbox_component_a unchanged
+            gr.update(),  # Keep sandbox_component_b unchanged
+            gr.update(),  # Keep chat_stats_a unchanged
+            gr.update(),  # Keep chat_stats_b unchanged
+            gr.update(),  # Keep model_display_a unchanged
+            gr.update(),  # Keep model_display_b unchanged
+            gr.update(visible=False),  # Hide vote_section
+            gr.update(visible=False),  # Hide vote_buttons_row
+            gr.update(),  # Keep state0_var unchanged
+            gr.update(),  # Keep state1_var unchanged
+            ranking_update,  # Update ranking_table
+            last_update,  # Update ranking_last_update
+            gr.update(),  # Keep vote_left_btn unchanged
+            gr.update(),  # Keep vote_right_btn unchanged
+            gr.update(),  # Keep vote_tie_btn unchanged
+            gr.update(),  # Keep vote_both_bad_btn unchanged
+            gr.update(),  # Keep text_input unchanged
+        )
+
+    # Vote button click handlers
+    for vote_btn, vote_type in [
+        (vote_components['vote_left_btn'], "left"),
+        (vote_components['vote_right_btn'], "right"),
+        (vote_components['vote_tie_btn'], "tie"),
+        (vote_components['vote_both_bad_btn'], "both_bad"),
+    ]:
+        vote_btn.click(
+            fn=process_vote,
+            inputs=[state0_var, state1_var, gr.State(vote_type), text_input],
+            outputs=[
+                vote_components['vote_status'],  # vote status message
+                state0_var,  # state0
+                state1_var,  # state1
+                # Note: The actual outputs list will need to be filled in by the calling code
+                # as it depends on the specific UI components in the main app
+            ],
+        )
+
+    return vote_components
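As a small check of the datetime handling above, serialize_interactions can be exercised standalone; the sample interaction records below are invented, and the snippet assumes the module is importable as voting:

import datetime
from voting import serialize_interactions

events = [
    {"type": "click", "at": datetime.datetime(2025, 1, 1, 12, 0, 0)},
    [{"type": "scroll", "at": datetime.datetime(2025, 1, 1, 12, 0, 5)}],  # nested turn, handled recursively
    "free-form note",  # neither list nor dict, passed through untouched
]

print(serialize_interactions(events))
# datetimes come back as ISO-8601 strings; nesting and non-dict items are preserved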