terryyz committed
Commit ba99c06 · 1 Parent(s): bc55571
Files changed (8)
  1. .gitignore +3 -0
  2. api_config.yaml +49 -1
  3. app.py +254 -472
  4. elo_calculation.py +315 -0
  5. ranking.py +199 -0
  6. requirements.txt +2 -1
  7. sandbox/sandbox_manager.py +1 -1
  8. voting.py +329 -0
.gitignore CHANGED
@@ -33,6 +33,9 @@ logs/
 *.manifest
 *.spec
 
+e2b_sandbox_template/
+build.sh
+
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
api_config.yaml CHANGED
@@ -5,6 +5,8 @@ gpt-4o-mini-2024-07-18:
   parallel: 32
   max_tokens: 8192
   temperature: 0.0
+  organization: OpenAI
+  license: Proprietary
 
 gpt-4.1-mini-2025-04-14:
   model: gpt-4.1-mini-2025-04-14
@@ -13,6 +15,8 @@ gpt-4.1-mini-2025-04-14:
   parallel: 32
   max_tokens: 8192
   temperature: 0.0
+  organization: OpenAI
+  license: Proprietary
 
 # o1-2024-12-17:
 #   model: o1-2024-12-17
@@ -21,6 +25,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: OpenAI
+#   license: Proprietary
 
 # o4-mini-2025-04-16:
 #   model: o4-mini-2025-04-16
@@ -29,6 +35,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 1.0
+#   organization: OpenAI
+#   license: Proprietary
 
 # o3-mini-2025-01-31:
 #   model: o3-mini-2025-01-31
@@ -37,6 +45,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: OpenAI
+#   license: Proprietary
 
 # gemini-2.0-flash-001:
 #   model: google/gemini-2.0-flash-001
@@ -47,6 +57,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 81920
 #   temperature: 0.0
+#   organization: Google
+#   license: Proprietary
 
 # gemini-2.5-pro:
 #   model: google/gemini-2.5-pro
@@ -57,6 +69,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Google
+#   license: Proprietary
 
 # gemini-2.5-flash:
 #   model: google/gemini-2.5-flash
@@ -67,6 +81,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Google
+#   license: Proprietary
 
 # claude35_haiku:
 #   model: bedrock/anthropic.claude-3-5-haiku-20241022-v1:0
@@ -75,6 +91,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Anthropic
+#   license: Proprietary
 
 # claude35_sonnet:
 #   model: bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
@@ -83,6 +101,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Anthropic
+#   license: Proprietary
 
 # claude37_sonnet:
 #   model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
@@ -91,6 +111,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Anthropic
+#   license: Proprietary
 
 # qwen3-coder:
 #   model: qwen/qwen3-coder
@@ -101,6 +123,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Alibaba
+#   license: Apache 2.0
 
 # kimi-k2:
 #   model: moonshotai/kimi-k2
@@ -111,6 +135,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Moonshot
+#   license: Modified MIT
 
 # claude-4-sonnet:
 #   model: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0
@@ -119,6 +145,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 16
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Anthropic
+#   license: Proprietary
 
 # claude-4-opus:
 #   model: bedrock/us.anthropic.claude-opus-4-20250514-v1:0
@@ -127,6 +155,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 16
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Anthropic
+#   license: Proprietary
 
 # gpt-oss-120b:
 #   model: openai/gpt-oss-120b
@@ -137,6 +167,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 1.0
+#   organization: OpenAI
+#   license: MIT
 
 # gpt-oss-20b:
 #   model: openai/gpt-oss-20b
@@ -147,6 +179,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 1.0
+#   organization: OpenAI
+#   license: MIT
 
 # deepseek-chat-v3-0324:
 #   model: deepseek/deepseek-chat-v3-0324
@@ -157,6 +191,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: DeepSeek
+#   license: MIT
 
 # deepseek-chat-v3.1:
 #   model: deepseek-chat
@@ -167,6 +203,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: DeepSeek
+#   license: MIT
 
 # glm-4.5:
 #   model: z-ai/glm-4.5
@@ -177,6 +215,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: Zhipu AI
+#   license: Custom
 
 # gpt-4.1-2025-04-14:
 #   model: gpt-4.1-2025-04-14
@@ -185,6 +225,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 0.0
+#   organization: OpenAI
+#   license: Proprietary
 
 
 # deepseek-r1-0528:
@@ -196,6 +238,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 81920
 #   temperature: 1.0
+#   organization: DeepSeek
+#   license: MIT
 
 # gpt-5-2025-08-07:
 #   model: gpt-5-2025-08-07
@@ -204,6 +248,8 @@ gpt-4.1-mini-2025-04-14:
 #   parallel: 32
 #   max_tokens: 8192
 #   temperature: 1.0
+#   organization: OpenAI
+#   license: Proprietary
 
 # grok-code:
 #   model: x-ai/grok-code-fast-1
@@ -213,4 +259,6 @@ gpt-4.1-mini-2025-04-14:
 #   api_type: openai_thinking
 #   parallel: 32
 #   max_tokens: 8192
-#   temperature: 1.0
+#   temperature: 1.0
+#   organization: xAI
+#   license: Proprietary
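
The two keys this commit adds (organization, license) are plain per-model metadata on each config entry, so any consumer of the parsed YAML can pick them up with ordinary dict access. A minimal sketch, assuming the file parses with PyYAML into a dict of per-model dicts (the "Unknown" fallback is illustrative, not part of the repo):

import yaml

# Load the model registry and list the metadata fields this commit adds.
with open("api_config.yaml") as f:
    api_config = yaml.safe_load(f) or {}

for name, cfg in api_config.items():
    org = cfg.get("organization", "Unknown")  # added in this commit
    lic = cfg.get("license", "Unknown")       # added in this commit
    print(f"{name}: organization={org}, license={lic}")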
app.py CHANGED
@@ -10,8 +10,37 @@ import datetime
 import os
 import asyncio
 import concurrent.futures
+import random
 import time
+import numpy as np
+from collections import defaultdict
 from datasets import Dataset, load_dataset
+# Import Elo calculation utilities
+from elo_calculation import (
+    calculate_elo_with_confidence_intervals,
+    create_ranking_dataframe,
+)
+
+# Import ranking functionality
+from ranking import (
+    load_ranking_data,
+    update_ranking_display,
+    force_update_ranking_display,
+    create_ranking_tab,
+    setup_ranking_handlers,
+)
+
+# Import voting functionality
+from voting import (
+    handle_vote,
+    save_vote_to_hf,
+    serialize_interactions,
+    create_vote_ui,
+    should_show_vote_buttons,
+    get_vote_ui_updates,
+    setup_vote_handlers,
+)
+
 # Import completion utilities
 from completion import make_config, registered_api_completion
 from sandbox.prompts import GENERAL_SANDBOX_INSTRUCTION
@@ -95,18 +124,130 @@ available_models = list(api_config.keys()) if api_config else []
 HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# Global ranking data cache
-ranking_data = None
-ranking_last_updated = None
 
 def get_random_models():
-    """Get two random models from available models"""
+    """Get two random models from available models using weighted sampling"""
     if len(available_models) < 2:
         return available_models[0] if available_models else None, available_models[0] if available_models else None
 
-    import random
-    models = random.sample(available_models, 2)
-    return models[0], models[1]
+    # Use get_battle_pair for weighted sampling
+    return get_battle_pair(available_models, {}, [], {}, [])
+
+# Configuration for battle sampling
+ANON_MODELS = []  # Models that should not battle against each other in anonymous mode
+BATTLE_STRICT_TARGETS = {}  # Strict battle targets for specific models
+
+def get_sample_weight(model, outage_models, sampling_weights, sampling_boost_models=None):
+    """Get the sampling weight for a model"""
+    # Check if model is in outage
+    if model in outage_models:
+        return 0
+
+    # Get base weight from API config
+    model_config = api_config.get(model, {})
+    base_weight = model_config.get('weight', 1.0)  # Default weight is 1.0
+
+    # Apply custom sampling weights if provided
+    if model in sampling_weights:
+        base_weight *= sampling_weights[model]
+
+    # Apply boost if model is in boost list
+    if sampling_boost_models and model in sampling_boost_models:
+        base_weight *= 2.0  # Example boost factor
+
+    return base_weight
+
+def is_model_match_pattern(model, pattern):
+    """Check if model matches a pattern (for battle strict targets)"""
+    # Simple pattern matching - can be extended for more complex patterns
+    if isinstance(pattern, str):
+        return pattern in model
+    elif isinstance(pattern, list):
+        return any(p in model for p in pattern)
+    return False
+
+def get_battle_pair(
+    models, battle_targets, outage_models, sampling_weights, sampling_boost_models
+):
+    """
+    Sample a pair of models for battle using weighted sampling.
+
+    Args:
+        models: List of available model names
+        battle_targets: Dict mapping models to their preferred battle targets
+        outage_models: List of models currently in outage
+        sampling_weights: Dict of custom sampling weights per model
+        sampling_boost_models: List of models to boost in sampling
+
+    Returns:
+        Tuple of (model_a, model_b) for battle
+    """
+    if len(models) == 1:
+        return models[0], models[0]
+
+    # Calculate weights for all models
+    model_weights = []
+    for model in models:
+        weight = get_sample_weight(
+            model, outage_models, sampling_weights, sampling_boost_models
+        )
+        model_weights.append(weight)
+    total_weight = np.sum(model_weights)
+
+    if total_weight == 0:
+        # Fallback to uniform sampling if all weights are 0
+        return random.sample(models, 2)
+
+    model_weights = np.array(model_weights) / total_weight
+
+    # Sample first model
+    chosen_idx = np.random.choice(len(models), p=model_weights)
+    chosen_model = models[chosen_idx]
+
+    # Find eligible rival models
+    rival_models = []
+    rival_weights = []
+    for model in models:
+        if model == chosen_model:
+            continue
+        if model in ANON_MODELS and chosen_model in ANON_MODELS:
+            continue
+        if chosen_model in BATTLE_STRICT_TARGETS:
+            if not is_model_match_pattern(model, BATTLE_STRICT_TARGETS[chosen_model]):
+                continue
+        if model in BATTLE_STRICT_TARGETS:
+            if not is_model_match_pattern(chosen_model, BATTLE_STRICT_TARGETS[model]):
+                continue
+
+        weight = get_sample_weight(model, outage_models, sampling_weights)
+        if (
+            weight != 0
+            and chosen_model in battle_targets
+            and model in battle_targets[chosen_model]
+        ):
+            # boost to higher chance for targeted battles
+            weight = 0.5 * total_weight / len(battle_targets[chosen_model])
+        rival_models.append(model)
+        rival_weights.append(weight)
+
+    if not rival_models:
+        # Fallback: if no eligible rivals, pick any other model
+        rival_models = [m for m in models if m != chosen_model]
+        if rival_models:
+            rival_model = random.choice(rival_models)
+        else:
+            rival_model = chosen_model
+    else:
+        rival_weights = np.array(rival_weights) / np.sum(rival_weights)
+        rival_idx = np.random.choice(len(rival_models), p=rival_weights)
+        rival_model = rival_models[rival_idx]
+
+    # Randomly swap order
+    swap = np.random.randint(2)
+    if swap == 0:
+        return chosen_model, rival_model
+    else:
+        return rival_model, chosen_model
 
 def create_chat_state(model_name: str) -> dict:
     """Create a new chat state for a model"""
@@ -488,7 +629,7 @@ def clear_chat(state0, state1):
 
     # Get current model names for display
     model_a, model_b = get_random_models()
-
+    print(f"Model A: {model_a}, Model B: {model_b}")
     return (
         None, # state0
         None, # state1
@@ -524,52 +665,49 @@ def retry_last_message(state0, state1, model_a, model_b):
     """Retry the last user message"""
     if not state0 or not state1:
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Get the last user message
     last_user_message = ""
     for msg in reversed(state0["messages"]):
         if msg["role"] == "user":
             last_user_message = msg["content"]
             break
-
+
     if not last_user_message:
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Remove the last user message and assistant responses from both states
     if state0["messages"] and state0["messages"][-1]["role"] == "assistant":
         state0["messages"].pop() # Remove last assistant response
    if state0["messages"] and state0["messages"][-1]["role"] == "user":
        state0["messages"].pop() # Remove last user message
-
+
    if state1["messages"] and state1["messages"][-1]["role"] == "assistant":
        state1["messages"].pop() # Remove last assistant response
    if state1["messages"] and state1["messages"][-1]["role"] == "user":
        state1["messages"].pop() # Remove last user message
-
+
    # Generate new responses with the same message
    result = add_text_and_generate(state0, state1, last_user_message, 0.4, 8192, model_a, model_b)
-
+
    # Extract the state from the result
    new_state0, new_state1 = result[0], result[1]
-
+
    # Check if both models have output and are not generating to show vote buttons
-    show_vote_buttons = (
-        new_state0
-        and new_state0.get("has_output", False)
-        and not new_state0.get("generating", False)
-        and new_state1
-        and new_state1.get("has_output", False)
-        and not new_state1.get("generating", False)
-    )
-
+    show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)
+
    # Return all the original outputs plus the updated state for run buttons
    return (
        new_state0, # state0
        new_state1, # state1
        result[2], # chatbot_a (chat0)
        result[3], # chatbot_b (chat1)
-        result[4]["content"] if isinstance(result[4], dict) else result[4], # response_a (response0)
-        result[5]["content"] if isinstance(result[5], dict) else result[5], # response_b (response1)
+        (
+            result[4]["content"] if isinstance(result[4], dict) else result[4]
+        ), # response_a (response0)
+        (
+            result[5]["content"] if isinstance(result[5], dict) else result[5]
+        ), # response_b (response1)
        result[6], # code_a (code0)
        result[7], # code_b (code1)
        result[10] if len(result) > 10 else "", # sandbox_state0
@@ -608,37 +746,37 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     """Send message to left model (Model A) only"""
     if not text.strip():
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Initialize states if needed
     if state0 is None:
         state0 = create_chat_state(model_a)
     if state1 is None:
         state1 = create_chat_state(model_b)
-
+
     # Add user message to left state only
     state0["messages"].append({"role": "user", "content": text})
     state0["generating"] = True
-
+
     # Generate response for left model only
     state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
     state0["messages"].append({"role": "assistant", "content": response0["content"]})
     state0["has_output"] = True
     state0["generating"] = False
-
+
     # Format chat history for display
     chat0 = format_chat_history(state0["messages"])
     chat1 = format_chat_history(state1["messages"]) if state1 else []
-
+
     # Extract code from response for sandbox
     sandbox_state0 = state0.get("sandbox_state", create_sandbox_state())
     sandbox_state0, code0, env0 = extract_and_execute_code(response0["content"], sandbox_state0)
     state0["sandbox_state"] = sandbox_state0
-
+
     # Clear previous sandbox outputs
     sandbox_output0 = ""
     sandbox_component_update0 = gr.update(value=("", False, []), visible=False)
     sandbox_view_a = ""
-
+
     # Run sandbox execution if there's code
     if code0.strip():
         install_command0 = sandbox_state0.get('install_command', "")
@@ -653,28 +791,33 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
         sandbox_view_a += f"# Output\n{sandbox_output0}"
         if sandbox_error0:
             sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a
-
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
     turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]]) if state1 else 0
-
+
     chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages'])}"
     chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages']) if state1 else 0}"
-
+
     # Don't show vote buttons since only one model responded
     show_vote_buttons = False
-
+
     return (
         state0, # state0
         state1, # state1
         chat0, # chatbot_a
         chat1, # chatbot_b
-        response0["content"] if isinstance(response0, dict) else response0, # response_a
+        (
+            response0["content"] if isinstance(response0, dict) else response0
+        ), # response_a
         "", # response_b (empty)
         code0, # code_a
         "", # code_b (empty)
         sandbox_state0, # sandbox_state0
-        state1.get("sandbox_state", create_sandbox_state()) if state1 else create_sandbox_state(), # sandbox_state1
+        (
+            state1.get("sandbox_state", create_sandbox_state())
+            if state1
+            else create_sandbox_state()
+        ), # sandbox_state1
         sandbox_output0, # sandbox_output0
         "", # sandbox_output1 (empty)
         sandbox_component_update0, # sandbox_component_update0
@@ -701,37 +844,37 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     """Send message to right model (Model B) only"""
     if not text.strip():
         return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
-
+
     # Initialize states if needed
     if state0 is None:
         state0 = create_chat_state(model_a)
     if state1 is None:
         state1 = create_chat_state(model_b)
-
+
     # Add user message to right state only
     state1["messages"].append({"role": "user", "content": text})
     state1["generating"] = True
-
+
     # Generate response for right model only
     state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)
     state1["messages"].append({"role": "assistant", "content": response1["content"]})
     state1["has_output"] = True
     state1["generating"] = False
-
+
     # Format chat history for display
     chat0 = format_chat_history(state0["messages"]) if state0 else []
     chat1 = format_chat_history(state1["messages"])
-
+
     # Extract code from response for sandbox
     sandbox_state1 = state1.get("sandbox_state", create_sandbox_state())
     sandbox_state1, code1, env1 = extract_and_execute_code(response1["content"], sandbox_state1)
     state1["sandbox_state"] = sandbox_state1
-
+
     # Clear previous sandbox outputs
     sandbox_output1 = ""
     sandbox_component_update1 = gr.update(value=("", False, []), visible=False)
     sandbox_view_b = ""
-
+
     # Run sandbox execution if there's code
     if code1.strip():
         install_command1 = sandbox_state1.get('install_command', "")
@@ -746,27 +889,32 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
         sandbox_view_b += f"# Output\n{sandbox_output1}"
         if sandbox_error1:
             sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
-
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
     turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]])
-
+
     chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages']) if state0 else 0}"
     chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages'])}"
-
+
     # Don't show vote buttons since only one model responded
     show_vote_buttons = False
-
+
     return (
         state0, # state0
         state1, # state1
         chat0, # chatbot_a
         chat1, # chatbot_b
         "", # response_a (empty)
-        response1["content"] if isinstance(response1, dict) else response1, # response_b
+        (
+            response1["content"] if isinstance(response1, dict) else response1
+        ), # response_b
         "", # code_a (empty)
         code1, # code_b
-        state0.get("sandbox_state", create_sandbox_state()) if state0 else create_sandbox_state(), # sandbox_state0
+        (
+            state0.get("sandbox_state", create_sandbox_state())
+            if state0
+            else create_sandbox_state()
+        ), # sandbox_state0
         sandbox_state1, # sandbox_state1
         "", # sandbox_output0 (empty)
         sandbox_output1, # sandbox_output1
@@ -791,90 +939,6 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
     )
 
 
-def handle_vote(state0, state1, vote_type):
-    """Handle vote submission"""
-    if (
-        not state0
-        or not state1
-        or not state0.get("has_output")
-        or not state1.get("has_output")
-    ):
-        return (
-            "No output to vote on!",
-            gr.update(),
-            "**Last Updated:** No data available",
-        )
-
-    # Get all user messages and the last responses
-    user_messages = []
-    response_a = ""
-    response_b = ""
-
-    # Collect all user messages from the conversation
-    for msg in state0["messages"]:
-        if msg["role"] == "user":
-            user_messages.append(msg["content"])
-
-    for msg in reversed(state0["messages"]):
-        if msg["role"] == "assistant":
-            response_a = msg["content"]
-            break
-
-    for msg in reversed(state1["messages"]):
-        if msg["role"] == "assistant":
-            response_b = msg["content"]
-            break
-
-    # Get interactions and full conversation history for remote dataset saving
-    interactions_a = state0.get("interactions", [])
-    interactions_b = state1.get("interactions", [])
-
-    # Get full conversation history for both models
-    conversation_a = state0.get("messages", [])
-    conversation_b = state1.get("messages", [])
-
-    # Save vote with full conversation history to remote dataset in background (async)
-    import threading
-    def save_vote_background():
-        try:
-            success, message = save_vote_to_hf(
-                state0["model_name"],
-                state1["model_name"],
-                user_messages[0],
-                response_a,
-                response_b,
-                vote_type,
-                interactions_a,
-                interactions_b,
-                conversation_a,
-                conversation_b,
-            )
-
-        except Exception as e:
-            print(f"Error saving vote: {str(e)}")
-            pass
-
-    print("Saving vote in background...")
-    # Start background upload thread
-    upload_thread = threading.Thread(target=save_vote_background)
-    upload_thread.daemon = True
-    upload_thread.start()
-
-    # Return immediately without waiting for upload
-    success = True # Assume success for immediate UI response
-    message = "Vote recorded! Uploading data in background..."
-
-    if success:
-        # Return immediately without waiting for ranking refresh
-        return (
-            message + " Clearing conversation...",
-            gr.update(), # Keep existing ranking table
-            "**Last Updated:** Processing in background...",
-        )
-    else:
-        return message, gr.update(), "**Last Updated:** Error occurred"
-
-
 def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
     """Run code in the appropriate sandbox environment"""
     if not code.strip():
@@ -886,7 +950,6 @@ def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
 
     # Determine environment
     env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
-    print(f"DEBUG: env: {env}")
     try:
         if env == SandboxEnvironment.HTML:
             sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
@@ -979,227 +1042,28 @@ async def run_sandbox_code_async(sandbox_state: dict, code: str, install_command: str):
 async def run_sandboxes_parallel(sandbox_state0, code0, install_command0, sandbox_state1, code1, install_command1):
     """Run both sandbox executions in parallel with error handling"""
     loop = asyncio.get_event_loop()
-
+
     # Create tasks for both sandbox executions
     task0 = loop.run_in_executor(None, run_sandbox_code, sandbox_state0, code0, install_command0)
     task1 = loop.run_in_executor(None, run_sandbox_code, sandbox_state1, code1, install_command1)
-
+
     # Wait for both to complete with error handling
     try:
         result0, result1 = await asyncio.gather(task0, task1, return_exceptions=True)
-
+
         # Handle exceptions
         if isinstance(result0, Exception):
             result0 = ("", "", f"Sandbox execution error: {str(result0)}")
-
+
         if isinstance(result1, Exception):
             result1 = ("", "", f"Sandbox execution error: {str(result1)}")
-
+
     except Exception as e:
         # Fallback to sequential processing
         result0 = run_sandbox_code(sandbox_state0, code0, install_command0)
        result1 = run_sandbox_code(sandbox_state1, code1, install_command1)
-
-    return result0, result1
-
-
-def serialize_interactions(interactions):
-    """Convert datetime objects in interactions to ISO format strings"""
-    if not interactions:
-        return interactions
-
-    serialized = []
-    for interaction in interactions:
-        # Handle case where interaction might be a list instead of a dict
-        if isinstance(interaction, list):
-            # If it's a list, recursively serialize each item
-            serialized.append(serialize_interactions(interaction))
-        elif isinstance(interaction, dict):
-            # If it's a dict, serialize it normally
-            serialized_interaction = {}
-            for key, value in interaction.items():
-                if isinstance(value, datetime.datetime):
-                    serialized_interaction[key] = value.isoformat()
-                else:
-                    serialized_interaction[key] = value
-            serialized.append(serialized_interaction)
-        else:
-            # If it's neither list nor dict, just add it as is
-            serialized.append(interaction)
-    return serialized
-
-
-def save_vote_to_hf(
-    model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
-):
-    """Save vote result to HuggingFace dataset with full conversation history"""
-    try:
-        # Use global token if not provided
-        token = hf_token or HF_TOKEN
-        if not token:
-            return False, "HuggingFace token not found in environment (HF_TOKEN)"
-
-        if not HF_DATASET_NAME:
-            return False, "HuggingFace dataset name not found in environment (HF_DATASET_NAME)"
-
-        # Serialize conversations for JSON compatibility
-        serialized_conversation_a = serialize_interactions(conversation_a or [])
-        serialized_conversation_b = serialize_interactions(conversation_b or [])
-
-        # Organize interactions by turns - each turn contains a list of interactions
-        def organize_interactions_by_turns(interactions, conversation):
-            """Organize interactions by conversation turns"""
-            if not interactions:
-                return []
-
-            # For now, put all interactions in a single turn
-            # This can be enhanced later to properly group by conversation turns
-            # when we have more context about how interactions are timestamped
-            return interactions if interactions else []
-
-        # Organize interactions by turns for both models
-        action_a = organize_interactions_by_turns(interactions_a or [], conversation_a or [])
-        action_b = organize_interactions_by_turns(interactions_b or [], conversation_b or [])
-
-        # Serialize actions for JSON compatibility
-        serialized_action_a = serialize_interactions(action_a)
-        serialized_action_b = serialize_interactions(action_b)
-
-        # Create vote data with full conversation history and actions organized by turns
-        # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
-        # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
-        vote_data = {
-            "timestamp": datetime.datetime.now().isoformat(),
-            "model_a": model_a,
-            "model_b": model_b,
-            "initial_prompt": prompt, # Convert list to single string
-            "action_a": serialized_action_a, # Actions organized by turns for model A
-            "action_b": serialized_action_b, # Actions organized by turns for model B
-            "conversation_a": serialized_conversation_a, # Full conversation history for model A
-            "conversation_b": serialized_conversation_b, # Full conversation history for model B
-            "vote": vote_result, # "left", "right", "tie", "both_bad"
-        }
-
-        # Try to load existing dataset or create new one
-        try:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
-            # Convert to pandas DataFrame - handle both Dataset and DatasetDict
-            if hasattr(dataset, "to_pandas"):
-                df = dataset.to_pandas()
-            else:
-                df = pd.DataFrame(dataset)
-            # Add new vote
-            new_df = pd.concat([df, pd.DataFrame([vote_data])], ignore_index=True)
-        except Exception as load_error:
-            # Create new dataset if it doesn't exist
-            new_df = pd.DataFrame([vote_data])
-
-        # Convert back to dataset and push
-        new_dataset = Dataset.from_pandas(new_df)
-        try:
-            new_dataset.push_to_hub(HF_DATASET_NAME, token=token)
-            return True, "Vote saved successfully!"
-        except Exception as upload_error:
-            return False, f"Error uploading to HuggingFace: {str(upload_error)}"
-    except Exception as e:
-        return False, f"Error saving vote: {str(e)}"
-
-
-def load_ranking_data(hf_token=None, force_reload=False):
-    """Load and calculate ranking data from HuggingFace dataset"""
-    global ranking_data, ranking_last_updated
-
-    try:
-        # Use global token if not provided
-        token = hf_token or HF_TOKEN
-        if not token:
-            return pd.DataFrame()
-
-        # Load dataset - force download if requested
-        if force_reload:
-            # Force download from remote, ignore cache
-            dataset = load_dataset(
-                HF_DATASET_NAME,
-                split="train",
-                token=token,
-                download_mode="force_redownload",
-            )
-        else:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
-        # Convert to pandas DataFrame - handle both Dataset and DatasetDict
-        if hasattr(dataset, "to_pandas"):
-            df = dataset.to_pandas()
-        else:
-            df = pd.DataFrame(dataset)
-
-        if df.empty:
-            return pd.DataFrame()
-
-        # Calculate rankings
-        model_stats = {}
-
-        for _, row in df.iterrows():
-            model_a = row["model_a"]
-            model_b = row["model_b"]
-            vote = row["vote"]
-
-            # Initialize models if not exists
-            if model_a not in model_stats:
-                model_stats[model_a] = {"wins": 0, "losses": 0, "ties": 0, "total": 0}
-            if model_b not in model_stats:
-                model_stats[model_b] = {"wins": 0, "losses": 0, "ties": 0, "total": 0}
-
-            # Update stats based on vote
-            if vote == "left": # Model A wins
-                model_stats[model_a]["wins"] += 1
-                model_stats[model_b]["losses"] += 1
-            elif vote == "right": # Model B wins
-                model_stats[model_b]["wins"] += 1
-                model_stats[model_a]["losses"] += 1
-            elif vote == "tie":
-                model_stats[model_a]["ties"] += 1
-                model_stats[model_b]["ties"] += 1
-            # both_bad doesn't count as win/loss for either
-
-            model_stats[model_a]["total"] += 1
-            model_stats[model_b]["total"] += 1
-
-        # Convert to DataFrame and calculate win rate
-        ranking_list = []
-        for model, stats in model_stats.items():
-            win_rate = (
-                (stats["wins"] + stats["ties"]) / max(stats["total"], 1) * 100
-            )
-            ranking_list.append(
-                {
-                    "Model": model,
-                    "Win Rate (%)": round(win_rate, 1),
-                    "Wins": stats["wins"],
-                    "Losses": stats["losses"],
-                    "Ties": stats["ties"],
-                    "Total Battles": stats["total"],
-                }
-            )
-
-        # Sort by win rate
-        ranking_df = pd.DataFrame(ranking_list).sort_values(
-            "Win Rate (%)", ascending=False
-        )
-        ranking_df["Rank"] = range(1, len(ranking_df) + 1)
-
-        # Reorder columns
-        ranking_df = ranking_df[
-            ["Rank", "Model", "Win Rate (%)", "Wins", "Losses", "Ties", "Total Battles"]
-        ]
-
-        ranking_data = ranking_df
-        ranking_last_updated = datetime.datetime.now()
-
-        return ranking_df
-    except Exception as e:
-        return pd.DataFrame()
+
+    return result0, result1
 
 
 def instantiate_send_button():
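
This hunk also retires the simple leaderboard math: the deleted load_ranking_data ranked models by win rate, (wins + ties) / total * 100, while the new elo_calculation.py imported at the top of the file supplies calculate_elo_with_confidence_intervals instead. That module's internals are not part of this diff; purely as a hypothetical illustration of the standard sequential Elo update such a module could build on:

# Hypothetical sketch only - NOT the code in elo_calculation.py.
def expected_score(r_a: float, r_b: float) -> float:
    # Probability that A beats B under the Elo model.
    return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))

def update_elo(ratings: dict, model_a: str, model_b: str, vote: str, k: float = 32.0) -> None:
    r_a = ratings.setdefault(model_a, 1000.0)
    r_b = ratings.setdefault(model_b, 1000.0)
    # Map the arena's vote labels to a score for model A.
    score_a = {"left": 1.0, "right": 0.0, "tie": 0.5}.get(vote)
    if score_a is None:  # "both_bad" votes leave ratings untouched
        return
    e_a = expected_score(r_a, r_b)
    ratings[model_a] = r_a + k * (score_a - e_a)
    ratings[model_b] = r_b + k * ((1.0 - score_a) - (1.0 - e_a))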
@@ -1262,7 +1126,7 @@ def build_ui():
 
     # Get random models for this session
     model_a, model_b = get_random_models()
-
+    print(f"Model A: {model_a}, Model B: {model_b}")
     with gr.Blocks(title="BigCodeArena", theme=gr.themes.Soft()) as demo:
         # Add custom CSS for centering and button styling
         demo.css = """
@@ -1296,7 +1160,7 @@ def build_ui():
             min-width: 60px;
         }
         """
-
+
         gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
 
         # Main tabs
@@ -1361,25 +1225,15 @@ def build_ui():
                 interactive=False,
             )
 
-            # Vote buttons section - only visible after output
-            with gr.Row(visible=False) as vote_section:
-                gr.Markdown("### 🗳️ Which response is better?")
-            with gr.Row(visible=False) as vote_buttons_row:
-                vote_left_btn = gr.Button(
-                    "👍 A is Better", variant="primary", size="lg"
-                )
-                vote_tie_btn = gr.Button(
-                    "🤝 It's a Tie", variant="secondary", size="lg"
-                )
-                vote_both_bad_btn = gr.Button(
-                    "👎 Both are Bad", variant="secondary", size="lg"
-                )
-                vote_right_btn = gr.Button(
-                    "👍 B is Better", variant="primary", size="lg"
-                )
-
-            # Vote status message
-            vote_status = gr.Markdown("", visible=False)
+            # Vote UI components
+            vote_components = create_vote_ui()
+            vote_section = vote_components["vote_section"]
+            vote_buttons_row = vote_components["vote_buttons_row"]
+            vote_left_btn = vote_components["vote_left_btn"]
+            vote_right_btn = vote_components["vote_right_btn"]
+            vote_tie_btn = vote_components["vote_tie_btn"]
+            vote_both_bad_btn = vote_components["vote_both_bad_btn"]
+            vote_status = vote_components["vote_status"]
 
             # Main chat interface - Collapsible and hidden by default
             with gr.Accordion("💬 Chat Interface", open=False):
@@ -1419,7 +1273,7 @@ def build_ui():
                 with gr.Row():
                     send_left_btn = instantiate_send_left_button()
                     send_right_btn = instantiate_send_right_button()
-
+
                 # Additional control buttons
                 with gr.Row():
                     clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
@@ -1568,38 +1422,7 @@ def build_ui():
                 inputs=[text_input],
             )
             # Ranking Tab
-            with gr.Tab("📊 Ranking", id="ranking"):
-                gr.Markdown("## 🏆 Model Leaderboard")
-                gr.Markdown("*Rankings auto-refresh every 10 minutes*")
-
-                ranking_table = gr.Dataframe(
-                    headers=[
-                        "Rank",
-                        "Model",
-                        "Win Rate (%)",
-                        "Wins",
-                        "Losses",
-                        "Ties",
-                        "Total Battles",
-                    ],
-                    datatype=[
-                        "number",
-                        "str",
-                        "number",
-                        "number",
-                        "number",
-                        "number",
-                        "number",
-                    ],
-                    label="Model Rankings",
-                    interactive=False,
-                    wrap=True,
-                )
-
-                ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
-
-                # Timer for auto-refresh every 10 minutes
-                ranking_timer = gr.Timer(value=600.0, active=True)
+            ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
 
             # Event handlers
             # Create state variables for the run buttons
@@ -1620,7 +1443,7 @@ def build_ui():
                         state0["interactions"].extend(interactions)
                     return log_sandbox_telemetry_gradio_fn(state0["sandbox_state"], sandbox_ui)
                 return None
-
+
             def log_telemetry_b(state1, sandbox_ui):
                 if state1 and "sandbox_state" in state1:
                     # Print user interactions for debugging
@@ -1633,7 +1456,7 @@ def build_ui():
                         state1["interactions"].extend(interactions)
                     return log_sandbox_telemetry_gradio_fn(state1["sandbox_state"], sandbox_ui)
                 return None
-
+
             sandbox_component_a.change(
                 fn=log_telemetry_a,
                 inputs=[state0_var, sandbox_component_a],
@@ -1649,24 +1472,17 @@ def build_ui():
 
             # Create a wrapper function that handles both the main execution and state update
             def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
-
+
                 # Hide vote buttons immediately when generation starts
                 initial_vote_visibility = False
-
+
                 # Call the main function
                 result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
                 # Extract the state from the result
                 new_state0, new_state1 = result[0], result[1]
 
                 # Check if both models have output and are not generating to show vote buttons
-                show_vote_buttons = (
-                    new_state0
-                    and new_state0.get("has_output", False)
-                    and not new_state0.get("generating", False)
-                    and new_state1
-                    and new_state1.get("has_output", False)
-                    and not new_state1.get("generating", False)
-                )
+                show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)
 
                 # Return all the original outputs plus the updated state for run buttons
                 # Make sure all outputs are properly formatted for their expected types
@@ -1675,8 +1491,12 @@ def build_ui():
                     new_state1, # state1
                     result[2], # chatbot_a (chat0)
                     result[3], # chatbot_b (chat1)
-                    result[4]["content"] if isinstance(result[4], dict) else result[4], # response_a (response0)
-                    result[5]["content"] if isinstance(result[5], dict) else result[5], # response_b (response1)
+                    (
+                        result[4]["content"] if isinstance(result[4], dict) else result[4]
+                    ), # response_a (response0)
+                    (
+                        result[5]["content"] if isinstance(result[5], dict) else result[5]
+                    ), # response_b (response1)
                     result[6], # code_a (code0)
                     result[7], # code_b (code1)
                     result[10] if len(result) > 10 else "", # sandbox_state0
@@ -2035,13 +1855,12 @@ def build_ui():
                 ],
             )
 
-            # Vote button handlers
-            def vote_and_clear(state0, state1, vote_type):
-                # First save the vote (now runs in background)
                 message, ranking_update, last_update = handle_vote(
                     state0, state1, vote_type
                 )
-
                 # Get the model names from the current session
                 model_a = state0["model_name"] if state0 else "Unknown"
                 model_b = state1["model_name"] if state1 else "Unknown"
@@ -2057,23 +1876,23 @@ def build_ui():
 
                 # Clear everything and start fresh immediately, but preserve examples
                 return (
-                    "Thank you for your vote! 🎉", # vote status with thank you message
-                    None, # Clear state0
-                    None, # Clear state1
-                    "", # Clear chatbot_a
-                    "", # Clear chatbot_b
-                    "", # Clear response_a
-                    "", # Clear response_b
-                    "", # Clear code_a
-                    "", # Clear code_b
-                    "", # Clear sandbox_view_a
-                    "", # Clear sandbox_view_b
-                    gr.update(visible=False), # Hide sandbox_component_a
-                    gr.update(visible=False), # Hide sandbox_component_b
-                    "**Conversation:** 0 turns | **Total Messages:** 0", # Reset chat_stats_a
-                    "**Conversation:** 0 turns | **Total Messages:** 0", # Reset chat_stats_b
-                    f"**Model A:** {model_a}", # Update model_display_a
-                    f"**Model B:** {model_b}", # Update model_display_b
                     gr.update(visible=False), # Hide vote_section
                     gr.update(visible=False), # Hide vote_buttons_row
                     None, # Reset state0_var
@@ -2095,8 +1914,8 @@ def build_ui():
                 (vote_both_bad_btn, "both_bad"),
             ]:
                 vote_btn.click(
-                    fn=vote_and_clear,
-                    inputs=[state0_var, state1_var, gr.State(vote_type)],
                    outputs=[
                        vote_status, # vote status message
                        state0_var, # state0
@@ -2129,45 +1948,8 @@ def build_ui():
                 ],
             )
 
-            # Ranking handlers
-            def update_ranking_display():
-                df = load_ranking_data()
-                if df.empty:
-                    return gr.update(value=df), "**Last Updated:** No data available"
-
-                last_update = (
-                    ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
-                    if ranking_last_updated
-                    else "Unknown"
-                )
-                return gr.update(value=df), f"**Last Updated:** {last_update}"
-
-            def force_update_ranking_display():
-                """Force update ranking data from HuggingFace (for timer)"""
-                df = load_ranking_data(force_reload=True)
-                if df.empty:
-                    return gr.update(value=df), "**Last Updated:** No data available"
-
-                last_update = (
-                    ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
-                    if ranking_last_updated
-                    else "Unknown"
-                )
-                return gr.update(value=df), f"**Last Updated:** {last_update}"
-
-            # Timer tick handler for auto-refresh with force reload
-            ranking_timer.tick(
-                fn=force_update_ranking_display,
-                inputs=[],
-                outputs=[ranking_table, ranking_last_update],
-            )
-
-            # Auto-load ranking on startup
-            demo.load(
-                fn=update_ranking_display,
-                inputs=[],
-                outputs=[ranking_table, ranking_last_update],
-            )
 
     return demo
10
  import os
11
  import asyncio
12
  import concurrent.futures
13
+ import random
14
  import time
15
+ import numpy as np
16
+ from collections import defaultdict
17
  from datasets import Dataset, load_dataset
18
+ # Import Elo calculation utilities
19
+ from elo_calculation import (
20
+ calculate_elo_with_confidence_intervals,
21
+ create_ranking_dataframe,
22
+ )
23
+
24
+ # Import ranking functionality
25
+ from ranking import (
26
+ load_ranking_data,
27
+ update_ranking_display,
28
+ force_update_ranking_display,
29
+ create_ranking_tab,
30
+ setup_ranking_handlers,
31
+ )
32
+
33
+ # Import voting functionality
34
+ from voting import (
35
+ handle_vote,
36
+ save_vote_to_hf,
37
+ serialize_interactions,
38
+ create_vote_ui,
39
+ should_show_vote_buttons,
40
+ get_vote_ui_updates,
41
+ setup_vote_handlers,
42
+ )
43
+
44
  # Import completion utilities
45
  from completion import make_config, registered_api_completion
46
  from sandbox.prompts import GENERAL_SANDBOX_INSTRUCTION
 
124
  HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
125
  HF_TOKEN = os.getenv("HF_TOKEN")
126
 
 
 
 
127
 
128
  def get_random_models():
129
+ """Get two random models from available models using weighted sampling"""
130
  if len(available_models) < 2:
131
  return available_models[0] if available_models else None, available_models[0] if available_models else None
132
 
133
+ # Use get_battle_pair for weighted sampling
134
+ return get_battle_pair(available_models, {}, [], {}, [])
135
+
136
+ # Configuration for battle sampling
137
+ ANON_MODELS = [] # Models that should not battle against each other in anonymous mode
138
+ BATTLE_STRICT_TARGETS = {} # Strict battle targets for specific models
139
+
140
+ def get_sample_weight(model, outage_models, sampling_weights, sampling_boost_models=None):
141
+ """Get the sampling weight for a model"""
142
+ # Check if model is in outage
143
+ if model in outage_models:
144
+ return 0
145
+
146
+ # Get base weight from API config
147
+ model_config = api_config.get(model, {})
148
+ base_weight = model_config.get('weight', 1.0) # Default weight is 1.0
149
+
150
+ # Apply custom sampling weights if provided
151
+ if model in sampling_weights:
152
+ base_weight *= sampling_weights[model]
153
+
154
+ # Apply boost if model is in boost list
155
+ if sampling_boost_models and model in sampling_boost_models:
156
+ base_weight *= 2.0 # Example boost factor
157
+
158
+ return base_weight
159
+
160
+ def is_model_match_pattern(model, pattern):
161
+ """Check if model matches a pattern (for battle strict targets)"""
162
+ # Simple pattern matching - can be extended for more complex patterns
163
+ if isinstance(pattern, str):
164
+ return pattern in model
165
+ elif isinstance(pattern, list):
166
+ return any(p in model for p in pattern)
167
+ return False
168
+
169
+ def get_battle_pair(
170
+ models, battle_targets, outage_models, sampling_weights, sampling_boost_models
171
+ ):
172
+ """
173
+ Sample a pair of models for battle using weighted sampling.
174
+
175
+ Args:
176
+ models: List of available model names
177
+ battle_targets: Dict mapping models to their preferred battle targets
178
+ outage_models: List of models currently in outage
179
+ sampling_weights: Dict of custom sampling weights per model
180
+ sampling_boost_models: List of models to boost in sampling
181
+
182
+ Returns:
183
+ Tuple of (model_a, model_b) for battle
184
+ """
185
+ if len(models) == 1:
186
+ return models[0], models[0]
187
+
188
+ # Calculate weights for all models
189
+ model_weights = []
190
+ for model in models:
191
+ weight = get_sample_weight(
192
+ model, outage_models, sampling_weights, sampling_boost_models
193
+ )
194
+ model_weights.append(weight)
195
+ total_weight = np.sum(model_weights)
196
+
197
+ if total_weight == 0:
198
+ # Fallback to uniform sampling if all weights are 0
199
+ return random.sample(models, 2)
200
+
201
+ model_weights = np.array(model_weights) / total_weight
202
+
203
+ # Sample first model
204
+ chosen_idx = np.random.choice(len(models), p=model_weights)
205
+ chosen_model = models[chosen_idx]
206
+
207
+ # Find eligible rival models
208
+ rival_models = []
209
+ rival_weights = []
210
+ for model in models:
211
+ if model == chosen_model:
212
+ continue
213
+ if model in ANON_MODELS and chosen_model in ANON_MODELS:
214
+ continue
215
+ if chosen_model in BATTLE_STRICT_TARGETS:
216
+ if not is_model_match_pattern(model, BATTLE_STRICT_TARGETS[chosen_model]):
217
+ continue
218
+ if model in BATTLE_STRICT_TARGETS:
219
+ if not is_model_match_pattern(chosen_model, BATTLE_STRICT_TARGETS[model]):
220
+ continue
221
+
222
+ weight = get_sample_weight(model, outage_models, sampling_weights)
223
+ if (
224
+ weight != 0
225
+ and chosen_model in battle_targets
226
+ and model in battle_targets[chosen_model]
227
+ ):
228
+ # boost to higher chance for targeted battles
229
+ weight = 0.5 * total_weight / len(battle_targets[chosen_model])
230
+ rival_models.append(model)
231
+ rival_weights.append(weight)
232
+
233
+ if not rival_models:
234
+ # Fallback: if no eligible rivals, pick any other model
235
+ rival_models = [m for m in models if m != chosen_model]
236
+ if rival_models:
237
+ rival_model = random.choice(rival_models)
238
+ else:
239
+ rival_model = chosen_model
240
+ else:
241
+ rival_weights = np.array(rival_weights) / np.sum(rival_weights)
242
+ rival_idx = np.random.choice(len(rival_models), p=rival_weights)
243
+ rival_model = rival_models[rival_idx]
244
+
245
+ # Randomly swap order
246
+ swap = np.random.randint(2)
247
+ if swap == 0:
248
+ return chosen_model, rival_model
249
+ else:
250
+ return rival_model, chosen_model
251
 
252
  def create_chat_state(model_name: str) -> dict:
253
  """Create a new chat state for a model"""
 
629
 
630
  # Get current model names for display
631
  model_a, model_b = get_random_models()
632
+ print(f"Model A: {model_a}, Model B: {model_b}")
633
  return (
634
  None, # state0
635
  None, # state1
 
665
  """Retry the last user message"""
666
  if not state0 or not state1:
667
  return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
668
+
669
  # Get the last user message
670
  last_user_message = ""
671
  for msg in reversed(state0["messages"]):
672
  if msg["role"] == "user":
673
  last_user_message = msg["content"]
674
  break
675
+
676
  if not last_user_message:
677
  return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
678
+
679
  # Remove the last user message and assistant responses from both states
680
  if state0["messages"] and state0["messages"][-1]["role"] == "assistant":
681
  state0["messages"].pop() # Remove last assistant response
682
  if state0["messages"] and state0["messages"][-1]["role"] == "user":
683
  state0["messages"].pop() # Remove last user message
684
+
685
  if state1["messages"] and state1["messages"][-1]["role"] == "assistant":
686
  state1["messages"].pop() # Remove last assistant response
687
  if state1["messages"] and state1["messages"][-1]["role"] == "user":
688
  state1["messages"].pop() # Remove last user message
689
+
690
  # Generate new responses with the same message
691
  result = add_text_and_generate(state0, state1, last_user_message, 0.4, 8192, model_a, model_b)
692
+
693
  # Extract the state from the result
694
  new_state0, new_state1 = result[0], result[1]
695
+
696
  # Check if both models have output and are not generating to show vote buttons
697
+ show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)
698
+
 
 
 
 
 
 
 
699
  # Return all the original outputs plus the updated state for run buttons
700
  return (
701
  new_state0, # state0
702
  new_state1, # state1
703
  result[2], # chatbot_a (chat0)
704
  result[3], # chatbot_b (chat1)
705
+ (
706
+ result[4]["content"] if isinstance(result[4], dict) else result[4]
707
+ ), # response_a (response0)
708
+ (
709
+ result[5]["content"] if isinstance(result[5], dict) else result[5]
710
+ ), # response_b (response1)
711
  result[6], # code_a (code0)
712
  result[7], # code_b (code1)
713
  result[10] if len(result) > 10 else "", # sandbox_state0
 
746
  """Send message to left model (Model A) only"""
747
  if not text.strip():
748
  return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
749
+
750
  # Initialize states if needed
751
  if state0 is None:
752
  state0 = create_chat_state(model_a)
753
  if state1 is None:
754
  state1 = create_chat_state(model_b)
755
+
756
  # Add user message to left state only
757
  state0["messages"].append({"role": "user", "content": text})
758
  state0["generating"] = True
759
+
760
  # Generate response for left model only
761
  state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
762
  state0["messages"].append({"role": "assistant", "content": response0["content"]})
763
  state0["has_output"] = True
764
  state0["generating"] = False
765
+
766
  # Format chat history for display
767
  chat0 = format_chat_history(state0["messages"])
768
  chat1 = format_chat_history(state1["messages"]) if state1 else []
769
+
770
  # Extract code from response for sandbox
771
  sandbox_state0 = state0.get("sandbox_state", create_sandbox_state())
772
  sandbox_state0, code0, env0 = extract_and_execute_code(response0["content"], sandbox_state0)
773
  state0["sandbox_state"] = sandbox_state0
774
+
775
  # Clear previous sandbox outputs
776
  sandbox_output0 = ""
777
  sandbox_component_update0 = gr.update(value=("", False, []), visible=False)
778
  sandbox_view_a = ""
779
+
780
  # Run sandbox execution if there's code
781
  if code0.strip():
782
  install_command0 = sandbox_state0.get('install_command', "")
 
791
  sandbox_view_a += f"# Output\n{sandbox_output0}"
792
  if sandbox_error0:
793
  sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a
 
794
  # Calculate conversation statistics
795
  turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
796
  turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]]) if state1 else 0
797
+
798
  chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages'])}"
799
  chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages']) if state1 else 0}"
800
+
801
  # Don't show vote buttons since only one model responded
802
  show_vote_buttons = False
803
+
804
  return (
805
  state0, # state0
806
  state1, # state1
807
  chat0, # chatbot_a
808
  chat1, # chatbot_b
809
+ (
810
+ response0["content"] if isinstance(response0, dict) else response0
811
+ ), # response_a
812
  "", # response_b (empty)
813
  code0, # code_a
814
  "", # code_b (empty)
815
  sandbox_state0, # sandbox_state0
816
+ (
817
+ state1.get("sandbox_state", create_sandbox_state())
818
+ if state1
819
+ else create_sandbox_state()
820
+ ), # sandbox_state1
821
  sandbox_output0, # sandbox_output0
822
  "", # sandbox_output1 (empty)
823
  sandbox_component_update0, # sandbox_component_update0
 
844
  """Send message to right model (Model B) only"""
845
  if not text.strip():
846
  return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
847
+
848
  # Initialize states if needed
849
  if state0 is None:
850
  state0 = create_chat_state(model_a)
851
  if state1 is None:
852
  state1 = create_chat_state(model_b)
853
+
854
  # Add user message to right state only
855
  state1["messages"].append({"role": "user", "content": text})
856
  state1["generating"] = True
857
+
858
  # Generate response for right model only
859
  state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)
860
  state1["messages"].append({"role": "assistant", "content": response1["content"]})
861
  state1["has_output"] = True
862
  state1["generating"] = False
863
+
864
  # Format chat history for display
865
  chat0 = format_chat_history(state0["messages"]) if state0 else []
866
  chat1 = format_chat_history(state1["messages"])
867
+
868
  # Extract code from response for sandbox
869
  sandbox_state1 = state1.get("sandbox_state", create_sandbox_state())
870
  sandbox_state1, code1, env1 = extract_and_execute_code(response1["content"], sandbox_state1)
871
  state1["sandbox_state"] = sandbox_state1
872
+
873
  # Clear previous sandbox outputs
874
  sandbox_output1 = ""
875
  sandbox_component_update1 = gr.update(value=("", False, []), visible=False)
876
  sandbox_view_b = ""
877
+
878
  # Run sandbox execution if there's code
879
  if code1.strip():
880
  install_command1 = sandbox_state1.get('install_command', "")
 
889
  sandbox_view_b += f"# Output\n{sandbox_output1}"
890
  if sandbox_error1:
891
  sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
 
892
  # Calculate conversation statistics
893
  turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
894
  turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]])
895
+
896
  chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages']) if state0 else 0}"
897
  chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages'])}"
898
+
899
  # Don't show vote buttons since only one model responded
900
  show_vote_buttons = False
901
+
902
  return (
903
  state0, # state0
904
  state1, # state1
905
  chat0, # chatbot_a
906
  chat1, # chatbot_b
907
  "", # response_a (empty)
908
+ (
909
+ response1["content"] if isinstance(response1, dict) else response1
910
+ ), # response_b
911
  "", # code_a (empty)
912
  code1, # code_b
913
+ (
914
+ state0.get("sandbox_state", create_sandbox_state())
915
+ if state0
916
+ else create_sandbox_state()
917
+ ), # sandbox_state0
918
  sandbox_state1, # sandbox_state1
919
  "", # sandbox_output0 (empty)
920
  sandbox_output1, # sandbox_output1
 
939
  )
940
 
941
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

  def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
      """Run code in the appropriate sandbox environment"""
      if not code.strip():

      # Determine environment
      env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
      try:
          if env == SandboxEnvironment.HTML:
              sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))

  async def run_sandboxes_parallel(sandbox_state0, code0, install_command0, sandbox_state1, code1, install_command1):
      """Run both sandbox executions in parallel with error handling"""
      loop = asyncio.get_event_loop()
+
      # Create tasks for both sandbox executions
      task0 = loop.run_in_executor(None, run_sandbox_code, sandbox_state0, code0, install_command0)
      task1 = loop.run_in_executor(None, run_sandbox_code, sandbox_state1, code1, install_command1)
+
      # Wait for both to complete with error handling
      try:
          result0, result1 = await asyncio.gather(task0, task1, return_exceptions=True)
+
          # Handle exceptions
          if isinstance(result0, Exception):
              result0 = ("", "", f"Sandbox execution error: {str(result0)}")
+
          if isinstance(result1, Exception):
              result1 = ("", "", f"Sandbox execution error: {str(result1)}")
+
      except Exception as e:
          # Fallback to sequential processing
          result0 = run_sandbox_code(sandbox_state0, code0, install_command0)
          result1 = run_sandbox_code(sandbox_state1, code1, install_command1)

+     return result0, result1
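
For orientation, `run_sandboxes_parallel` pushes the two blocking `run_sandbox_code` calls onto executor threads so the event loop can await both at once, falling back to sequential execution if gathering fails. A minimal sketch of how a caller could drive it; the empty-dict states and toy code strings are hypothetical stand-ins for the payloads `create_sandbox_state()` actually builds:

```python
import asyncio

async def demo_parallel_run():
    # Hypothetical inputs; the real app passes sandbox states built by create_sandbox_state()
    state_a, state_b = {}, {}
    result_a, result_b = await run_sandboxes_parallel(
        state_a, "print('A')", "", state_b, "print('B')", ""
    )
    # Each result is the three-string tuple returned by run_sandbox_code
    print(result_a, result_b)

asyncio.run(demo_parallel_run())
```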

  def instantiate_send_button():

  # Get random models for this session
  model_a, model_b = get_random_models()
+ print(f"Model A: {model_a}, Model B: {model_b}")
  with gr.Blocks(title="BigCodeArena", theme=gr.themes.Soft()) as demo:
      # Add custom CSS for centering and button styling
      demo.css = """

          min-width: 60px;
      }
      """
+
      gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")

      # Main tabs

              interactive=False,
          )

+         # Vote UI components
+         vote_components = create_vote_ui()
+         vote_section = vote_components["vote_section"]
+         vote_buttons_row = vote_components["vote_buttons_row"]
+         vote_left_btn = vote_components["vote_left_btn"]
+         vote_right_btn = vote_components["vote_right_btn"]
+         vote_tie_btn = vote_components["vote_tie_btn"]
+         vote_both_bad_btn = vote_components["vote_both_bad_btn"]
+         vote_status = vote_components["vote_status"]

          # Main chat interface - Collapsible and hidden by default
          with gr.Accordion("πŸ’¬ Chat Interface", open=False):

              with gr.Row():
                  send_left_btn = instantiate_send_left_button()
                  send_right_btn = instantiate_send_right_button()
+
              # Additional control buttons
              with gr.Row():
                  clear_btn = gr.Button("πŸ—‘οΈ Clear Chat", variant="secondary")

              inputs=[text_input],
          )
          # Ranking Tab
+         ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()

      # Event handlers
      # Create state variables for the run buttons

              state0["interactions"].extend(interactions)
              return log_sandbox_telemetry_gradio_fn(state0["sandbox_state"], sandbox_ui)
          return None
+
      def log_telemetry_b(state1, sandbox_ui):
          if state1 and "sandbox_state" in state1:
              # Print user interactions for debugging

              state1["interactions"].extend(interactions)
              return log_sandbox_telemetry_gradio_fn(state1["sandbox_state"], sandbox_ui)
          return None
+
      sandbox_component_a.change(
          fn=log_telemetry_a,
          inputs=[state0_var, sandbox_component_a],

      # Create a wrapper function that handles both the main execution and state update
      def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
+
          # Hide vote buttons immediately when generation starts
          initial_vote_visibility = False
+
          # Call the main function
          result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
          # Extract the state from the result
          new_state0, new_state1 = result[0], result[1]

          # Check if both models have output and are not generating to show vote buttons
+         show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)

          # Return all the original outputs plus the updated state for run buttons
          # Make sure all outputs are properly formatted for their expected types

              new_state1,  # state1
              result[2],  # chatbot_a (chat0)
              result[3],  # chatbot_b (chat1)
+             (
+                 result[4]["content"] if isinstance(result[4], dict) else result[4]
+             ),  # response_a (response0)
+             (
+                 result[5]["content"] if isinstance(result[5], dict) else result[5]
+             ),  # response_b (response1)
              result[6],  # code_a (code0)
              result[7],  # code_b (code1)
              result[10] if len(result) > 10 else "",  # sandbox_state0

          ],
      )

+     # Setup vote handlers
+     def process_vote(state0, state1, vote_type, current_text):
+         # Save the vote and get updates
          message, ranking_update, last_update = handle_vote(
              state0, state1, vote_type
          )
          # Get the model names from the current session
          model_a = state0["model_name"] if state0 else "Unknown"
          model_b = state1["model_name"] if state1 else "Unknown"

          # Clear everything and start fresh immediately, but preserve examples
          return (
+             message,  # vote status message
+             gr.update(),  # Keep state0 unchanged
+             gr.update(),  # Keep state1 unchanged
+             gr.update(),  # Keep chatbot_a unchanged
+             gr.update(),  # Keep chatbot_b unchanged
+             gr.update(),  # Keep response_a unchanged
+             gr.update(),  # Keep response_b unchanged
+             gr.update(),  # Keep code_a unchanged
+             gr.update(),  # Keep code_b unchanged
+             gr.update(),  # Keep sandbox_view_a unchanged
+             gr.update(),  # Keep sandbox_view_b unchanged
+             gr.update(),  # Keep sandbox_component_a unchanged
+             gr.update(),  # Keep sandbox_component_b unchanged
+             gr.update(),  # Keep chat_stats_a unchanged
+             gr.update(),  # Keep chat_stats_b unchanged
+             gr.update(),  # Keep model_display_a unchanged
+             gr.update(),  # Keep model_display_b unchanged
              gr.update(visible=False),  # Hide vote_section
              gr.update(visible=False),  # Hide vote_buttons_row
              None,  # Reset state0_var

          (vote_both_bad_btn, "both_bad"),
      ]:
          vote_btn.click(
+             fn=process_vote,
+             inputs=[state0_var, state1_var, gr.State(vote_type), text_input],
              outputs=[
                  vote_status,  # vote status message
                  state0_var,  # state0

              ],
          )

+     # Setup ranking handlers
+     setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer)

  return demo
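One detail worth noting in the wiring above: the vote loop binds a different constant to each button via `gr.State(vote_type)`, which avoids the classic late-binding pitfall of closing over a loop variable. A standalone sketch of that pattern, with toy labels rather than the arena's real components:

```python
import gradio as gr

with gr.Blocks() as toy:
    status = gr.Markdown("")
    for label, value in [("πŸ‘ A", "left"), ("πŸ‘ B", "right")]:
        btn = gr.Button(label)
        # gr.State(value) snapshots the constant at wiring time,
        # so each button reports its own vote type
        btn.click(fn=lambda v: f"voted: {v}", inputs=[gr.State(value)], outputs=[status])

toy.launch()
```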
elo_calculation.py ADDED
@@ -0,0 +1,315 @@
+ """
+ Elo Rating Calculation Module for BigCodeArena
+ Contains Bradley-Terry Model with confidence intervals and traditional Elo calculation
+ """
+
+ import math
+ import numpy as np
+ import pandas as pd
+ from collections import defaultdict
+ from tqdm import tqdm
+ from sklearn.linear_model import LogisticRegression
+ import yaml
+ import os
+
+
+ def load_model_metadata():
+     """Load model metadata from api_config.yaml"""
+     try:
+         config_path = os.path.join(os.path.dirname(__file__), "api_config.yaml")
+         with open(config_path, "r", encoding="utf-8") as file:
+             config = yaml.safe_load(file)
+
+         metadata = {}
+         for model_key, model_config in config.items():
+             if isinstance(model_config, dict):
+                 model_name = model_config.get("model", model_key)
+                 metadata[model_name] = {
+                     "organization": model_config.get("organization", "Unknown"),
+                     "license": model_config.get("license", "Unknown"),
+                 }
+                 # Also store with the key name for lookup
+                 metadata[model_key] = {
+                     "organization": model_config.get("organization", "Unknown"),
+                     "license": model_config.get("license", "Unknown"),
+                 }
+
+         return metadata
+     except Exception as e:
+         print(f"Warning: Could not load model metadata: {e}")
+         return {}
+
+
+ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None):
+     """Compute Elo ratings using Bradley-Terry Model with Maximum Likelihood Estimation"""
+
+     # Get all unique models
+     all_models = sorted(list(set(df["model_a"].tolist() + df["model_b"].tolist())))
+
+     # Create win matrices for each outcome type
+     # Initialize empty matrices with float dtype to avoid warnings
+     ptbl_a_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
+     ptbl_b_win = pd.DataFrame(0.0, index=all_models, columns=all_models)
+     ptbl_tie = pd.DataFrame(0.0, index=all_models, columns=all_models)
+
+     # Count wins for model_a
+     model_a_wins = df[df["winner"] == "model_a"]
+     if not model_a_wins.empty:
+         a_win_counts = model_a_wins.groupby(["model_a", "model_b"]).size()
+         for (model_a, model_b), count in a_win_counts.items():
+             ptbl_a_win.loc[model_a, model_b] = count
+
+     # Count wins for model_b
+     model_b_wins = df[df["winner"] == "model_b"]
+     if not model_b_wins.empty:
+         b_win_counts = model_b_wins.groupby(["model_a", "model_b"]).size()
+         for (model_a, model_b), count in b_win_counts.items():
+             ptbl_b_win.loc[model_a, model_b] = count
+
+     # Count ties
+     ties = df[df["winner"].isin(["tie", "tie (bothbad)"])]
+     if not ties.empty:
+         tie_counts = ties.groupby(["model_a", "model_b"]).size()
+         for (model_a, model_b), count in tie_counts.items():
+             # For ties, we count 0.5 win for each model
+             ptbl_tie.loc[model_a, model_b] = count * 0.5
+             ptbl_tie.loc[model_b, model_a] = count * 0.5
+
+     models = pd.Series(np.arange(len(all_models)), index=all_models)
+     p = len(models)
+
+     # Create training data for logistic regression
+     X = []
+     Y = []
+     sample_weights = []
+
+     for model_a in all_models:
+         for model_b in all_models:
+             if model_a == model_b:
+                 continue
+
+             # Count total games between these models
+             a_wins = ptbl_a_win.loc[model_a, model_b]
+             b_wins = ptbl_b_win.loc[model_a, model_b]
+             ties = ptbl_tie.loc[model_a, model_b]
+
+             total_games = a_wins + b_wins + ties
+             if total_games == 0:
+                 continue
+
+             # Create feature vector: difference in model strengths
+             x = np.zeros(p)
+             x[models[model_a]] = 1.0
+             x[models[model_b]] = -1.0
+
+             # Add data points for model_a wins
+             if a_wins > 0:
+                 X.append(x)
+                 Y.append(1)  # model_a wins
+                 sample_weights.append(a_wins)
+
+             # Add data points for model_b wins (model_a loses)
+             if b_wins > 0:
+                 X.append(x)  # same feature vector
+                 Y.append(0)  # model_a loses
+                 sample_weights.append(b_wins)
+
+             # Add data points for ties - treat as half wins for model_a
+             if ties > 0:
+                 # Add ties as both wins and losses with half weight each
+                 X.append(x)
+                 Y.append(1)  # model_a wins (tie counted as win)
+                 sample_weights.append(ties / 2)
+
+                 X.append(x)
+                 Y.append(0)  # model_a loses (tie counted as loss)
+                 sample_weights.append(ties / 2)
+
+     if len(X) == 0 or len(set(Y)) < 2:
+         # Not enough data or no variation in outcomes
+         return pd.Series({model: INIT_RATING for model in all_models}).sort_values(ascending=False)
+
+     X = np.array(X)
+     Y = np.array(Y)
+     sample_weights = np.array(sample_weights)
+
+     # Fit logistic regression
+     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6, max_iter=1000)
+     lr.fit(X, Y, sample_weight=sample_weights)
+
+     # Convert coefficients to Elo ratings
+     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+ def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
+     """Get bootstrap results for confidence interval calculation"""
+
+     rows = []
+     for i in tqdm(range(num_round), desc="bootstrap"):
+         # Bootstrap sample with replacement
+         bootstrap_sample = battles.sample(frac=1.0, replace=True)
+         try:
+             elo_result = func_compute_elo(bootstrap_sample)
+             rows.append(elo_result)
+         except Exception as e:
+             # Skip failed bootstrap samples
+             continue
+
+     if not rows:
+         return pd.DataFrame()
+
+     df = pd.DataFrame(rows)
+     # Sort columns by median Elo score (descending)
+     return df[df.median().sort_values(ascending=False).index]
+
+
+ def compute_online_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
+     """Compute Elo ratings for models based on battle results (legacy function for compatibility)"""
+     rating = defaultdict(lambda: INIT_RATING)
+
+     for rd, model_a, model_b, winner in battles[
+         ["model_a", "model_b", "winner"]
+     ].itertuples():
+         ra = rating[model_a]
+         rb = rating[model_b]
+         ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
+         eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
+         if winner == "model_a":
+             sa = 1
+         elif winner == "model_b":
+             sa = 0
+         elif winner == "tie" or winner == "tie (bothbad)":
+             sa = 0.5
+         else:
+             raise Exception(f"unexpected vote {winner}")
+         rating[model_a] += K * (sa - ea)
+         rating[model_b] += K * (1 - sa - eb)
+
+     # calibrate llama-13b to 800 if it exists
+     if "llama-13b" in rating:
+         delta = 800 - rating["llama-13b"]
+         for model in battles["model_a"].unique():
+             rating[model] += delta
+
+     return rating
+
+
+ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
+     """
+     Main function to calculate Elo ratings with confidence intervals
+
+     Args:
+         battles_df (pd.DataFrame): DataFrame with columns ['model_a', 'model_b', 'winner']
+         vote_counts (dict): Dictionary with vote counts for each model
+
+     Returns:
+         tuple: (elo_ratings, confidence_intervals)
+     """
+     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
+
+     # Check if we have sufficient data for Bradley-Terry model
+     if len(battles_df) < 2:
+         # Not enough battles, use default ratings
+         all_models = set(
+             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
+         )
+         elo_ratings = pd.Series({model: 1000 for model in all_models})
+         confidence_intervals = {model: 0 for model in all_models}
+     else:
+         try:
+             # Use the new Bradley-Terry Model
+             elo_ratings = compute_mle_elo(battles_df)
+
+             # Calculate confidence intervals using bootstrap
+             if len(battles_df) >= 10:  # Only calculate CI if we have enough data
+                 try:
+                     bootstrap_df = get_bootstrap_result(
+                         battles_df, compute_mle_elo, num_round=100
+                     )
+
+                     # Calculate 95% confidence intervals
+                     if not bootstrap_df.empty:
+                         for model in bootstrap_df.columns:
+                             scores = bootstrap_df[model].dropna()
+                             if len(scores) > 0:
+                                 lower = scores.quantile(0.025)
+                                 upper = scores.quantile(0.975)
+                                 median_score = scores.median()
+                                 ci_margin = (upper - lower) / 2
+                                 confidence_intervals[model] = ci_margin
+                             else:
+                                 confidence_intervals[model] = 0
+                     else:
+                         # Fallback: no confidence intervals
+                         for model in elo_ratings.index:
+                             confidence_intervals[model] = 0
+                 except Exception as bootstrap_error:
+                     print(
+                         f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
+                     )
+                     for model in elo_ratings.index:
+                         confidence_intervals[model] = 0
+             else:
+                 # Not enough data for bootstrap, set CI to 0
+                 for model in elo_ratings.index:
+                     confidence_intervals[model] = 0
+         except Exception as e:
+             # Fallback to old method if Bradley-Terry fails
+             print(
+                 f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
+             )
+             old_elo_ratings = compute_online_elo(battles_df)
+             elo_ratings = pd.Series(old_elo_ratings)
+             confidence_intervals = {model: 0 for model in elo_ratings.index}
+     return elo_ratings, confidence_intervals
+
+
+ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
+     """
+     Create ranking DataFrame with all necessary columns
+
+     Args:
+         elo_ratings (pd.Series): Elo ratings for each model
+         confidence_intervals (dict): Confidence interval margins for each model
+         vote_counts (dict): Vote counts for each model
+
+     Returns:
+         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (Β±), Votes, Organization, License]
+     """
+     # Load model metadata
+     metadata = load_model_metadata()
+
+     # Create ranking list with Elo ratings and confidence intervals
+     ranking_list = []
+     for model in elo_ratings.index:
+         ci_margin = confidence_intervals.get(model, 0)
+
+         # Get metadata for this model
+         model_metadata = metadata.get(model, {})
+         organization = model_metadata.get("organization", "Unknown")
+         license_type = model_metadata.get("license", "Unknown")
+
+         ranking_list.append(
+             {
+                 "Model": model,
+                 "Score": round(elo_ratings[model], 1),
+                 "95% CI (Β±)": round(ci_margin, 1) if ci_margin > 0 else "-",
+                 "Votes": vote_counts[model],
+                 "Organization": organization,
+                 "License": license_type,
+             }
+         )
+
+     # Sort by Elo rating (highest first)
+     ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
+     ranking_df["Rank"] = range(1, len(ranking_df) + 1)
+
+     # Reorder columns
+     ranking_df = ranking_df[
+         ["Rank", "Model", "Score", "95% CI (Β±)", "Votes", "Organization", "License"]
+     ]
+
+     return ranking_df
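
To make the pipeline above concrete, here is a toy invocation with synthetic battles (not real arena data); `compute_mle_elo` returns a `pd.Series` of ratings sorted highest-first, and the bootstrap quantiles give per-model 95% CI bounds:

```python
import pandas as pd
from elo_calculation import compute_mle_elo, get_bootstrap_result

battles = pd.DataFrame([
    {"model_a": "m1", "model_b": "m2", "winner": "model_a"},
    {"model_a": "m1", "model_b": "m2", "winner": "model_a"},
    {"model_a": "m2", "model_b": "m1", "winner": "model_a"},
    {"model_a": "m1", "model_b": "m2", "winner": "tie"},
])

ratings = compute_mle_elo(battles)
print(ratings)  # m1 should rank above m2 here (two wins, one loss, one tie)

boots = get_bootstrap_result(battles, compute_mle_elo, num_round=20)
print(boots.quantile([0.025, 0.975]))  # per-model 95% CI bounds
```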
ranking.py ADDED
@@ -0,0 +1,199 @@
+ """
+ Ranking module for BigCodeArena
+ Handles model leaderboard functionality and data management
+ """
+
+ import gradio as gr
+ import pandas as pd
+ import datetime
+ import os
+ from collections import defaultdict
+ from datasets import Dataset, load_dataset
+
+ # Import Elo calculation utilities
+ from elo_calculation import (
+     calculate_elo_with_confidence_intervals,
+     create_ranking_dataframe,
+ )
+
+ # HuggingFace dataset configuration
+ HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ REFRESH_TIME = int(os.getenv("REFRESH_TIME") or 60 * 60 * 12)  # 12 hours by default; cast since env vars are strings
+
+ # Global ranking data cache
+ ranking_data = None
+ ranking_last_updated = None
+
+
+ def load_ranking_data(hf_token=None, force_reload=False):
+     """Load and calculate ranking data from HuggingFace dataset"""
+     global ranking_data, ranking_last_updated
+
+     try:
+         # Use global token if not provided
+         token = hf_token or HF_TOKEN
+
+         if not token:
+             return pd.DataFrame()
+
+         if not HF_DATASET_NAME:
+             return pd.DataFrame()
+
+         # Load dataset - force download if requested
+         if force_reload:
+             # Force download from remote, ignore cache
+             dataset = load_dataset(
+                 HF_DATASET_NAME,
+                 split="train",
+                 token=token,
+                 download_mode="force_redownload",
+             )
+         else:
+             dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+         # Convert to pandas DataFrame - handle both Dataset and DatasetDict
+         if hasattr(dataset, "to_pandas"):
+             df = dataset.to_pandas()
+         else:
+             df = pd.DataFrame(dataset)
+
+         if df.empty:
+             return pd.DataFrame()
+
+         # Convert vote format for Elo calculation and count votes
+         battle_data = []
+         vote_counts = defaultdict(int)
+
+         for _, row in df.iterrows():
+             model_a = row["model_a"]
+             model_b = row["model_b"]
+             vote = row["vote"]
+
+             # Convert vote to winner format for Elo
+             if vote == "left":  # Model A wins
+                 winner = "model_a"
+             elif vote == "right":  # Model B wins
+                 winner = "model_b"
+             elif vote == "tie":
+                 winner = "tie"
+             elif vote == "both_bad":
+                 winner = "tie (bothbad)"
+             else:
+                 continue  # Skip invalid votes
+
+             battle_data.append(
+                 {"model_a": model_a, "model_b": model_b, "winner": winner}
+             )
+
+             # Count votes for each model
+             vote_counts[model_a] += 1
+             vote_counts[model_b] += 1
+
+         # Create DataFrame for Elo calculation
+         battles_df = pd.DataFrame(battle_data)
+
+         if battles_df.empty:
+             return pd.DataFrame()
+
+         # Calculate Elo ratings using Bradley-Terry Model with confidence intervals
+         elo_ratings, confidence_intervals = calculate_elo_with_confidence_intervals(
+             battles_df, vote_counts
+         )
+
+         # Create ranking DataFrame
+         ranking_df = create_ranking_dataframe(
+             elo_ratings, confidence_intervals, vote_counts
+         )
+
+         ranking_data = ranking_df
+         ranking_last_updated = datetime.datetime.now()
+
+         return ranking_df
+     except Exception as e:
+         return pd.DataFrame()
+
+
+ def update_ranking_display():
+     """Update ranking display with current data"""
+     df = load_ranking_data()
+     if df.empty:
+         return gr.update(value=df), "**Last Updated:** No data available"
+
+     last_update = (
+         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
+         if ranking_last_updated
+         else "Unknown"
+     )
+     return gr.update(value=df), f"**Last Updated:** {last_update}"
+
+
+ def force_update_ranking_display():
+     """Force update ranking data from HuggingFace (for timer)"""
+     df = load_ranking_data(force_reload=True)
+     if df.empty:
+         return gr.update(value=df), "**Last Updated:** No data available"
+
+     last_update = (
+         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
+         if ranking_last_updated
+         else "Unknown"
+     )
+     return gr.update(value=df), f"**Last Updated:** {last_update}"
+
+
+ def create_ranking_tab():
+     """Create the ranking tab UI component"""
+     with gr.Tab("πŸ“Š Ranking", id="ranking"):
+         gr.Markdown("## πŸ† Model Leaderboard")
+
+         ranking_table = gr.Dataframe(
+             headers=[
+                 "Rank",
+                 "Model",
+                 "Score",
+                 "95% CI (Β±)",
+                 "Votes",
+                 "Organization",
+                 "License",
+             ],
+             datatype=[
+                 "number",
+                 "str",
+                 "number",
+                 "str",
+                 "number",
+                 "str",
+                 "str",
+             ],
+             label="Model Rankings",
+             interactive=False,
+             wrap=True,
+         )
+
+         ranking_last_update = gr.Markdown("**Last Updated:** Not loaded yet")
+
+         # Timer for auto-refresh every REFRESH_TIME seconds
+         ranking_timer = gr.Timer(value=REFRESH_TIME, active=True)
+
+     return ranking_table, ranking_last_update, ranking_timer
+
+
+ def setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer):
+     """Setup event handlers for ranking functionality"""
+
+     # Timer tick handler for auto-refresh with force reload
+     ranking_timer.tick(
+         fn=force_update_ranking_display,
+         inputs=[],
+         outputs=[ranking_table, ranking_last_update],
+     )
+
+     # Auto-load ranking on startup
+     demo.load(
+         fn=update_ranking_display,
+         inputs=[],
+         outputs=[ranking_table, ranking_last_update],
+     )
+
+     return ranking_table, ranking_last_update
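
The three return values of `create_ranking_tab` are meant to be wired together by `setup_ranking_handlers` inside the same Blocks context. A minimal sketch, assuming `HF_DATASET_NAME` and `HF_TOKEN` are exported in the environment (otherwise the table simply stays empty):

```python
import gradio as gr
from ranking import create_ranking_tab, setup_ranking_handlers

with gr.Blocks() as demo:
    # Tab UI plus the auto-refresh timer
    ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
    # demo.load populates on startup; ranking_timer.tick force-reloads periodically
    setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer)

demo.launch()
```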
requirements.txt CHANGED
@@ -23,4 +23,5 @@ tree-sitter-c
  e2b-code-interpreter==1.5.2
  azure-storage-blob
  huggingface_hub
- datasets
+ datasets
+ scikit-learn
sandbox/sandbox_manager.py CHANGED
@@ -76,7 +76,7 @@ def run_command_in_sandbox(

      try:
          if "uv" in command:
-             command = "uv venv;" + command
+             command = "uv venv; source .venv/bin/activate;" + command
          command_result = sandbox.commands.run(
              cmd=command,
              cwd=working_directory,
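
The one-line change above extends the prefix so the freshly created virtual environment is actually activated before the command runs; `uv venv` alone only creates `.venv`. A quick sketch of what the prefixing does to a hypothetical sandbox command:

```python
command = "uv pip install requests && python main.py"  # hypothetical command
if "uv" in command:
    command = "uv venv; source .venv/bin/activate;" + command
print(command)
# -> uv venv; source .venv/bin/activate;uv pip install requests && python main.py
```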
voting.py ADDED
@@ -0,0 +1,329 @@
+ """
+ Voting module for BigCodeArena
+ Handles vote submission, data management, and UI components
+ """
+
+ import gradio as gr
+ import pandas as pd
+ import datetime
+ import os
+ import threading
+ from datasets import Dataset, load_dataset
+
+
+ # HuggingFace dataset configuration
+ HF_DATASET_NAME = os.getenv("HF_DATASET_NAME")
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+
+ def serialize_interactions(interactions):
+     """Convert datetime objects in interactions to ISO format strings"""
+     if not interactions:
+         return interactions
+
+     serialized = []
+     for interaction in interactions:
+         # Handle case where interaction might be a list instead of a dict
+         if isinstance(interaction, list):
+             # If it's a list, recursively serialize each item
+             serialized.append(serialize_interactions(interaction))
+         elif isinstance(interaction, dict):
+             # If it's a dict, serialize it normally
+             serialized_interaction = {}
+             for key, value in interaction.items():
+                 if isinstance(value, datetime.datetime):
+                     serialized_interaction[key] = value.isoformat()
+                 else:
+                     serialized_interaction[key] = value
+             serialized.append(serialized_interaction)
+         else:
+             # If it's neither list nor dict, just add it as is
+             serialized.append(interaction)
+     return serialized
+
+
+ def save_vote_to_hf(
+     model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
+ ):
+     """Save vote result to HuggingFace dataset with full conversation history"""
+     try:
+         # Use global token if not provided
+         token = hf_token or HF_TOKEN
+         if not token:
+             return False, "HuggingFace token not found in environment (HF_TOKEN)"
+
+         if not HF_DATASET_NAME:
+             return False, "HuggingFace dataset name not found in environment (HF_DATASET_NAME)"
+
+         # Serialize conversations for JSON compatibility
+         serialized_conversation_a = serialize_interactions(conversation_a or [])
+         serialized_conversation_b = serialize_interactions(conversation_b or [])
+
+         # Organize interactions by turns - each turn contains a list of interactions
+         def organize_interactions_by_turns(interactions, conversation):
+             """Organize interactions by conversation turns"""
+             if not interactions:
+                 return []
+
+             # For now, put all interactions in a single turn
+             # This can be enhanced later to properly group by conversation turns
+             # when we have more context about how interactions are timestamped
+             return interactions if interactions else []
+
+         # Organize interactions by turns for both models
+         action_a = organize_interactions_by_turns(interactions_a or [], conversation_a or [])
+         action_b = organize_interactions_by_turns(interactions_b or [], conversation_b or [])
+
+         # Serialize actions for JSON compatibility
+         serialized_action_a = serialize_interactions(action_a)
+         serialized_action_b = serialize_interactions(action_b)
+
+         # Create vote data with full conversation history and actions organized by turns
+         # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
+         # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
+         vote_data = {
+             "timestamp": datetime.datetime.now().isoformat(),
+             "model_a": model_a,
+             "model_b": model_b,
+             "initial_prompt": prompt,  # First user message of the conversation
+             "action_a": serialized_action_a,  # Actions organized by turns for model A
+             "action_b": serialized_action_b,  # Actions organized by turns for model B
+             "conversation_a": serialized_conversation_a,  # Full conversation history for model A
+             "conversation_b": serialized_conversation_b,  # Full conversation history for model B
+             "vote": vote_result,  # "left", "right", "tie", "both_bad"
+         }
+
+         # Try to load existing dataset or create new one
+         try:
+             dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+             # Convert to pandas DataFrame - handle both Dataset and DatasetDict
+             if hasattr(dataset, "to_pandas"):
+                 df = dataset.to_pandas()
+             else:
+                 df = pd.DataFrame(dataset)
+             # Add new vote
+             new_df = pd.concat([df, pd.DataFrame([vote_data])], ignore_index=True)
+         except Exception as load_error:
+             # Create new dataset if it doesn't exist
+             new_df = pd.DataFrame([vote_data])
+
+         # Convert back to dataset and push
+         new_dataset = Dataset.from_pandas(new_df)
+         try:
+             new_dataset.push_to_hub(HF_DATASET_NAME, token=token)
+             return True, "Vote saved successfully!"
+         except Exception as upload_error:
+             return False, f"Error uploading to HuggingFace: {str(upload_error)}"
+     except Exception as e:
+         return False, f"Error saving vote: {str(e)}"
+
+
+ def handle_vote(state0, state1, vote_type):
+     """Handle vote submission"""
+     if (
+         not state0
+         or not state1
+         or not state0.get("has_output")
+         or not state1.get("has_output")
+     ):
+         return (
+             "No output to vote on!",
+             gr.update(),
+             "**Last Updated:** No data available",
+         )
+
+     # Get all user messages and the last responses
+     user_messages = []
+     response_a = ""
+     response_b = ""
+
+     # Collect all user messages from the conversation
+     for msg in state0["messages"]:
+         if msg["role"] == "user":
+             user_messages.append(msg["content"])
+
+     for msg in reversed(state0["messages"]):
+         if msg["role"] == "assistant":
+             response_a = msg["content"]
+             break
+
+     for msg in reversed(state1["messages"]):
+         if msg["role"] == "assistant":
+             response_b = msg["content"]
+             break
+
+     # Get interactions and full conversation history for remote dataset saving
+     interactions_a = state0.get("interactions", [])
+     interactions_b = state1.get("interactions", [])
+
+     # Get full conversation history for both models
+     conversation_a = state0.get("messages", [])
+     conversation_b = state1.get("messages", [])
+
+     # Save vote with full conversation history to remote dataset in background (async)
+     def save_vote_background():
+         try:
+             success, message = save_vote_to_hf(
+                 state0["model_name"],
+                 state1["model_name"],
+                 user_messages[0],
+                 response_a,
+                 response_b,
+                 vote_type,
+                 interactions_a,
+                 interactions_b,
+                 conversation_a,
+                 conversation_b,
+             )
+         except Exception as e:
+             print(f"Error saving vote: {str(e)}")
+
+     print("Saving vote in background...")
+     # Start background upload thread
+     upload_thread = threading.Thread(target=save_vote_background)
+     upload_thread.daemon = True
+     upload_thread.start()
+
+     # Return immediately without waiting for upload
+     success = True  # Assume success for immediate UI response
+     message = "Vote recorded! Uploading data in background..."
+
+     if success:
+         # Return immediately without waiting for ranking refresh
+         return (
+             message + " Clearing conversation...",
+             gr.update(),  # Keep existing ranking table
+             "**Last Updated:** Processing in background...",
+         )
+     else:
+         return message, gr.update(), "**Last Updated:** Error occurred"
+
+
+ def create_vote_ui():
+     """Create vote UI components"""
+     # Vote buttons section - only visible after output
+     with gr.Row(visible=False) as vote_section:
+         gr.Markdown("### πŸ—³οΈ Which response is better?")
+
+     with gr.Row(visible=False) as vote_buttons_row:
+         vote_left_btn = gr.Button(
+             "πŸ‘ A is Better", variant="primary", size="lg"
+         )
+         vote_tie_btn = gr.Button(
+             "🀝 It's a Tie", variant="secondary", size="lg"
+         )
+         vote_both_bad_btn = gr.Button(
+             "πŸ‘Ž Both are Bad", variant="secondary", size="lg"
+         )
+         vote_right_btn = gr.Button(
+             "πŸ‘ B is Better", variant="primary", size="lg"
+         )
+
+     # Vote status message
+     vote_status = gr.Markdown("", visible=False)
+
+     return {
+         'vote_section': vote_section,
+         'vote_buttons_row': vote_buttons_row,
+         'vote_left_btn': vote_left_btn,
+         'vote_right_btn': vote_right_btn,
+         'vote_tie_btn': vote_tie_btn,
+         'vote_both_bad_btn': vote_both_bad_btn,
+         'vote_status': vote_status
+     }
+
+
+ def should_show_vote_buttons(state0, state1):
+     """Check if vote buttons should be shown"""
+     return (
+         state0
+         and state0.get("has_output", False)
+         and not state0.get("generating", False)
+         and state1
+         and state1.get("has_output", False)
+         and not state1.get("generating", False)
+     )
+
+
+ def get_vote_ui_updates(show_buttons=False):
+     """Get UI updates for vote components"""
+     return {
+         'vote_section': gr.update(visible=show_buttons),
+         'vote_buttons_row': gr.update(visible=show_buttons),
+         'vote_status': gr.update(visible=False),
+         'vote_left_btn': gr.update(interactive=show_buttons),
+         'vote_right_btn': gr.update(interactive=show_buttons),
+         'vote_tie_btn': gr.update(interactive=show_buttons),
+         'vote_both_bad_btn': gr.update(interactive=show_buttons),
+     }
+
+
+ def setup_vote_handlers(vote_components, state0_var, state1_var, text_input, ranking_table, ranking_last_update):
+     """Setup vote button event handlers"""
+
+     def process_vote(state0, state1, vote_type, current_text):
+         # Save the vote and get updates
+         message, ranking_update, last_update = handle_vote(
+             state0, state1, vote_type
+         )
+
+         # Show thank you message
+         gr.Info(
+             "Thank you for your vote! πŸŽ‰ Your feedback has been recorded.",
+             duration=5,
+         )
+
+         # Return only vote status, ranking updates and hide voting interface
+         return (
+             message,  # vote status message
+             gr.update(),  # Keep state0 unchanged
+             gr.update(),  # Keep state1 unchanged
+             gr.update(),  # Keep chatbot_a unchanged
+             gr.update(),  # Keep chatbot_b unchanged
+             gr.update(),  # Keep response_a unchanged
+             gr.update(),  # Keep response_b unchanged
+             gr.update(),  # Keep code_a unchanged
+             gr.update(),  # Keep code_b unchanged
+             gr.update(),  # Keep sandbox_view_a unchanged
+             gr.update(),  # Keep sandbox_view_b unchanged
+             gr.update(),  # Keep sandbox_component_a unchanged
+             gr.update(),  # Keep sandbox_component_b unchanged
+             gr.update(),  # Keep chat_stats_a unchanged
+             gr.update(),  # Keep chat_stats_b unchanged
+             gr.update(),  # Keep model_display_a unchanged
+             gr.update(),  # Keep model_display_b unchanged
+             gr.update(visible=False),  # Hide vote_section
+             gr.update(visible=False),  # Hide vote_buttons_row
+             gr.update(),  # Keep state0_var unchanged
+             gr.update(),  # Keep state1_var unchanged
+             ranking_update,  # Update ranking_table
+             last_update,  # Update ranking_last_update
+             gr.update(),  # Keep vote_left_btn unchanged
+             gr.update(),  # Keep vote_right_btn unchanged
+             gr.update(),  # Keep vote_tie_btn unchanged
+             gr.update(),  # Keep vote_both_bad_btn unchanged
+             gr.update(),  # Keep text_input unchanged
+         )
+
+     # Vote button click handlers
+     for vote_btn, vote_type in [
+         (vote_components['vote_left_btn'], "left"),
+         (vote_components['vote_right_btn'], "right"),
+         (vote_components['vote_tie_btn'], "tie"),
+         (vote_components['vote_both_bad_btn'], "both_bad"),
+     ]:
+         vote_btn.click(
+             fn=process_vote,
+             inputs=[state0_var, state1_var, gr.State(vote_type), text_input],
+             outputs=[
+                 vote_components['vote_status'],  # vote status message
+                 state0_var,  # state0
+                 state1_var,  # state1
+                 # Note: The actual outputs list will need to be filled in by the calling code
+                 # as it depends on the specific UI components in the main app
+             ],
+         )
+
+     return vote_components
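
As a small sanity check of `serialize_interactions`, which recurses into nested lists, converts `datetime` values inside dicts to ISO strings, and passes everything else through unchanged (toy data):

```python
import datetime
from voting import serialize_interactions

interactions = [
    {"type": "click", "time": datetime.datetime(2025, 1, 1, 12, 0)},
    [{"type": "scroll", "time": datetime.datetime(2025, 1, 1, 12, 1)}],  # nested list
    "plain entries pass through",
]
print(serialize_interactions(interactions))
# [{'type': 'click', 'time': '2025-01-01T12:00:00'},
#  [{'type': 'scroll', 'time': '2025-01-01T12:01:00'}],
#  'plain entries pass through']
```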