terryyz committed on
Commit
fe3f5b0
·
1 Parent(s): f1aac33
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+ run.sh
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py.cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+ #poetry.toml
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
115
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
116
+ #pdm.lock
117
+ #pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # pixi
122
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
123
+ #pixi.lock
124
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
125
+ # in the .venv directory. It is recommended not to include this directory in version control.
126
+ .pixi
127
+
128
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
129
+ __pypackages__/
130
+
131
+ # Celery stuff
132
+ celerybeat-schedule
133
+ celerybeat.pid
134
+
135
+ # SageMath parsed files
136
+ *.sage.py
137
+
138
+ # Environments
139
+ .env
140
+ .envrc
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
201
+ # refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Arena
3
  emoji: 🚀
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
1
  ---
2
+ title: BigCodeArena
3
  emoji: 🚀
4
  colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.44.1
8
  app_file: app.py
9
+ pinned: true
10
  license: apache-2.0
11
  ---
12
 
api_config.yaml ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gpt-4o-mini-2024-07-18:
2
+ model: gpt-4o-mini-2024-07-18
3
+ endpoints: null
4
+ api_type: openai
5
+ parallel: 32
6
+ max_tokens: 8192
7
+ temperature: 0.0
8
+
9
+ gpt-4o-2024-11-20:
10
+ model: gpt-4o-2024-11-20
11
+ endpoints: null
12
+ api_type: openai
13
+ parallel: 32
14
+ max_tokens: 8192
15
+ temperature: 0.0
16
+
17
+ # o1-2024-12-17:
18
+ # model: o1-2024-12-17
19
+ # endpoints: null
20
+ # api_type: openai
21
+ # parallel: 32
22
+ # max_tokens: 8192
23
+ # temperature: 0.0
24
+
25
+ # o4-mini-2025-04-16:
26
+ # model: o4-mini-2025-04-16
27
+ # endpoints: null
28
+ # api_type: openai_thinking
29
+ # parallel: 32
30
+ # max_tokens: 8192
31
+ # temperature: 1.0
32
+
33
+ # o3-mini-2025-01-31:
34
+ # model: o3-mini-2025-01-31
35
+ # endpoints: null
36
+ # api_type: openai_thinking
37
+ # parallel: 32
38
+ # max_tokens: 8192
39
+ # temperature: 0.0
40
+
41
+ # gemini-2.0-flash-001:
42
+ # model: google/gemini-2.0-flash-001
43
+ # endpoints:
44
+ # - api_base: https://openrouter.ai/api/v1
45
+ # api_key: ${OPENROUTER_API_KEY}
46
+ # api_type: openai
47
+ # parallel: 32
48
+ # max_tokens: 81920
49
+ # temperature: 0.0
50
+
51
+ # gemini-2.5-pro:
52
+ # model: google/gemini-2.5-pro
53
+ # endpoints:
54
+ # - api_base: https://openrouter.ai/api/v1
55
+ # api_key: ${OPENROUTER_API_KEY}
56
+ # api_type: openai
57
+ # parallel: 32
58
+ # max_tokens: 8192
59
+ # temperature: 0.0
60
+
61
+ # gemini-2.5-flash:
62
+ # model: google/gemini-2.5-flash
63
+ # endpoints:
64
+ # - api_base: https://openrouter.ai/api/v1
65
+ # api_key: ${OPENROUTER_API_KEY}
66
+ # api_type: openai
67
+ # parallel: 32
68
+ # max_tokens: 8192
69
+ # temperature: 0.0
70
+
71
+ # claude35_haiku:
72
+ # model: bedrock/anthropic.claude-3-5-haiku-20241022-v1:0
73
+ # endpoints: null
74
+ # api_type: litellm
75
+ # parallel: 32
76
+ # max_tokens: 8192
77
+ # temperature: 0.0
78
+
79
+ # claude35_sonnet:
80
+ # model: bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
81
+ # endpoints: null
82
+ # api_type: litellm
83
+ # parallel: 32
84
+ # max_tokens: 8192
85
+ # temperature: 0.0
86
+
87
+ # claude37_sonnet:
88
+ # model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0
89
+ # endpoints: null
90
+ # api_type: litellm
91
+ # parallel: 32
92
+ # max_tokens: 8192
93
+ # temperature: 0.0
94
+
95
+ # qwen3-coder:
96
+ # model: qwen/qwen3-coder
97
+ # endpoints:
98
+ # - api_base: https://openrouter.ai/api/v1
99
+ # api_key: ${OPENROUTER_API_KEY}
100
+ # api_type: openai
101
+ # parallel: 32
102
+ # max_tokens: 8192
103
+ # temperature: 0.0
104
+
105
+ # kimi-k2:
106
+ # model: moonshotai/kimi-k2
107
+ # endpoints:
108
+ # - api_base: https://openrouter.ai/api/v1
109
+ # api_key: ${OPENROUTER_API_KEY}
110
+ # api_type: openai
111
+ # parallel: 32
112
+ # max_tokens: 8192
113
+ # temperature: 0.0
114
+
115
+ # claude-4-sonnet:
116
+ # model: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0
117
+ # endpoints: null
118
+ # api_type: litellm
119
+ # parallel: 16
120
+ # max_tokens: 8192
121
+ # temperature: 0.0
122
+
123
+ # claude-4-opus:
124
+ # model: bedrock/us.anthropic.claude-opus-4-20250514-v1:0
125
+ # endpoints: null
126
+ # api_type: litellm
127
+ # parallel: 16
128
+ # max_tokens: 8192
129
+ # temperature: 0.0
130
+
131
+ # gpt-oss-120b:
132
+ # model: openai/gpt-oss-120b
133
+ # endpoints:
134
+ # - api_base: https://openrouter.ai/api/v1
135
+ # api_key: ${OPENROUTER_API_KEY}
136
+ # api_type: openai_thinking
137
+ # parallel: 32
138
+ # max_tokens: 8192
139
+ # temperature: 1.0
140
+
141
+ # gpt-oss-20b:
142
+ # model: openai/gpt-oss-20b
143
+ # endpoints:
144
+ # - api_base: https://openrouter.ai/api/v1
145
+ # api_key: ${OPENROUTER_API_KEY}
146
+ # api_type: openai_thinking
147
+ # parallel: 32
148
+ # max_tokens: 8192
149
+ # temperature: 1.0
150
+
151
+ # deepseek-chat-v3-0324:
152
+ # model: deepseek/deepseek-chat-v3-0324
153
+ # endpoints:
154
+ # - api_base: https://openrouter.ai/api/v1
155
+ # api_key: ${OPENROUTER_API_KEY}
156
+ # api_type: openai
157
+ # parallel: 32
158
+ # max_tokens: 8192
159
+ # temperature: 0.0
160
+
161
+ # deepseek-chat-v3.1:
162
+ # model: deepseek-chat
163
+ # endpoints:
164
+ # - api_base: https://api.deepseek.com
165
+ # api_key: ${DEEPSEEK_API_KEY}
166
+ # api_type: openai
167
+ # parallel: 32
168
+ # max_tokens: 8192
169
+ # temperature: 0.0
170
+
171
+ # glm-4.5:
172
+ # model: z-ai/glm-4.5
173
+ # endpoints:
174
+ # - api_base: https://openrouter.ai/api/v1
175
+ # api_key: ${OPENROUTER_API_KEY}
176
+ # api_type: openai
177
+ # parallel: 32
178
+ # max_tokens: 8192
179
+ # temperature: 0.0
180
+
181
+ # gpt-4.1-2025-04-14:
182
+ # model: gpt-4.1-2025-04-14
183
+ # endpoints: null
184
+ # api_type: openai
185
+ # parallel: 32
186
+ # max_tokens: 8192
187
+ # temperature: 0.0
188
+
189
+
190
+ # deepseek-r1-0528:
191
+ # model: deepseek/deepseek-r1-0528
192
+ # endpoints:
193
+ # - api_base: https://openrouter.ai/api/v1
194
+ # api_key: ${OPENROUTER_API_KEY}
195
+ # api_type: openai_thinking
196
+ # parallel: 32
197
+ # max_tokens: 81920
198
+ # temperature: 1.0
199
+
200
+ # gpt-5-2025-08-07:
201
+ # model: gpt-5-2025-08-07
202
+ # endpoints: null
203
+ # api_type: openai_thinking
204
+ # parallel: 32
205
+ # max_tokens: 8192
206
+ # temperature: 1.0
207
+
208
+ # grok-code:
209
+ # model: x-ai/grok-code-fast-1
210
+ # endpoints:
211
+ # - api_base: https://openrouter.ai/api/v1
212
+ # api_key: ${OPENROUTER_API_KEY}
213
+ # api_type: openai_thinking
214
+ # parallel: 32
215
+ # max_tokens: 8192
216
+ # temperature: 1.0
app.py ADDED
@@ -0,0 +1,777 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple BigCodeArena - A simplified AI coding battle arena
3
+ Focuses on core functionality: two models, automatic code extraction, and execution
4
+ """
5
+
6
+ import gradio as gr
7
+ from gradio_sandboxcomponent import SandboxComponent
8
+
9
+ # Import completion utilities
10
+ from completion import make_config, registered_api_completion
11
+
12
+ # Import code extraction utilities
13
+ from sandbox.code_analyzer import (
14
+ SandboxEnvironment,
15
+ extract_code_from_markdown,
16
+ )
17
+
18
+ # Import sandbox execution functions
19
+ from sandbox.code_runner import (
20
+ run_html_sandbox,
21
+ run_react_sandbox,
22
+ run_vue_sandbox,
23
+ run_pygame_sandbox,
24
+ run_gradio_sandbox,
25
+ run_streamlit_sandbox,
26
+ run_code_interpreter,
27
+ run_c_code,
28
+ run_cpp_code,
29
+ run_java_code,
30
+ run_golang_code,
31
+ run_rust_code,
32
+ mermaid_to_html
33
+ )
34
+
35
+ # Create a proper sandbox state structure
36
def create_sandbox_state() -> dict:
    """Build a fresh sandbox state dictionary, populated with defaults, for one model."""
    fresh_state = dict(
        enable_sandbox=True,
        enabled_round=0,
        sandbox_run_round=0,
        edit_round=0,
        sandbox_environment=SandboxEnvironment.AUTO,
        auto_selected_sandbox_environment=None,
        sandbox_instruction="Run the extracted code in the appropriate sandbox environment",
        code_to_execute="",
        # New list objects on every call so states never share dependency lists.
        code_dependencies=([], []),
        btn_list_length=5,
        sandbox_id=None,
        chat_session_id=None,
        conv_id=None,
        sandbox_output=None,
        sandbox_error=None,
    )
    return fresh_state
55
+
56
def reset_sandbox_state(state: dict) -> dict:
    """Reset the per-conversation fields of a sandbox state in place.

    Keys that are not listed here (for example ``enable_sandbox`` or
    ``sandbox_environment``) are left as they are. The same dict object is
    returned for convenience.
    """
    state.update({
        'enabled_round': 0,
        'sandbox_run_round': 0,
        'edit_round': 0,
        'auto_selected_sandbox_environment': None,
        'code_to_execute': "",
        'code_dependencies': ([], []),
        'sandbox_error': None,
        'sandbox_output': None,
        'sandbox_id': None,
        'conv_id': None,
        'chat_session_id': None,
    })
    return state
70
+
71
+ # Load API configuration
72
def load_api_config():
    """Load the model/API configuration from ``api_config.yaml``.

    Returns whatever ``make_config`` produces. If loading raises, the error is
    printed and an empty dict is returned so the app can still start.
    """
    try:
        return make_config("api_config.yaml")
    except Exception as load_error:
        print(f"Error loading API config: {load_error}")
    return {}
80
+
81
+ # Global variables
82
# Loaded once at import time; the selectable model names are the config's keys.
api_config = load_api_config()
available_models = [] if not api_config else list(api_config.keys())
84
+
85
def get_random_models():
    """Pick the two models for a battle from ``available_models``.

    With two or more models, two distinct names are sampled at random. With
    fewer, the single available model (or ``None`` when there is none) is
    returned for both slots.
    """
    if len(available_models) >= 2:
        import random
        first, second = random.sample(available_models, 2)
        return first, second

    only_model = available_models[0] if available_models else None
    return only_model, only_model
93
+
94
def create_chat_state(model_name: str) -> dict:
    """Build a new chat state for ``model_name``: empty history plus a fresh sandbox state."""
    chat_state = {"model_name": model_name, "messages": []}
    chat_state["sandbox_state"] = create_sandbox_state()
    return chat_state
101
+
102
def generate_response_with_completion(state, temperature, max_tokens):
    """Generate the next assistant reply for ``state`` via the registered completion API.

    The whole user/assistant history stored in ``state["messages"]`` is sent
    to the model.

    Args:
        state: Chat state dict with ``model_name`` and ``messages`` keys, or None.
        temperature: Sampling temperature forwarded to the completion function.
        max_tokens: Token limit forwarded to the completion function.

    Returns:
        ``(state, text)``. ``text`` is the model's answer, an ``"Error: ..."``
        message when the model or API type is not configured or the response
        is malformed, or ``""`` when there is nothing to answer.
    """
    if state is None:
        return state, ""

    # Only the most recent user message decides whether there is anything to answer.
    last_user_message = next(
        (msg["content"] for msg in reversed(state["messages"]) if msg["role"] == "user"),
        None,
    )
    if not last_user_message:
        return state, ""

    # Prepare messages for API call - include full conversation history
    messages = [
        {"role": msg["role"], "content": msg["content"]}
        for msg in state["messages"]
        if msg["role"] in ["user", "assistant"] and msg["content"] is not None
    ]

    # Get model config
    model_name = state["model_name"]
    if model_name not in api_config:
        print(f"Model {model_name} not found in config")
        return state, f"Error: Model {model_name} not configured"

    model_config = api_config[model_name]
    api_type = model_config.get("api_type", "openai")

    # Report an unknown api_type the same way as the other configuration
    # problems instead of letting a bare KeyError escape to the UI handler.
    if api_type not in registered_api_completion:
        error_msg = f"Error: API type {api_type} not registered"
        print(error_msg)
        return state, error_msg
    api_completion_func = registered_api_completion[api_type]

    # Use the actual model identifier from config, not the display name.
    # Only the first configured endpoint is used; None when none is configured.
    endpoints = model_config.get("endpoints")
    output = api_completion_func(
        model=model_config.get("model", model_name),
        temperature=temperature,
        max_tokens=max_tokens,
        api_dict=endpoints[0] if endpoints else None,
        messages=messages,
    )

    # Extract the answer from the response
    if isinstance(output, dict) and "answer" in output:
        return state, output["answer"]

    error_msg = f"Error: Invalid response format from {api_type}"
    print(error_msg)
    return state, error_msg
155
+
156
def extract_and_execute_code(message, sandbox_state):
    """Extract code from a model reply and record it in the sandbox state.

    Despite the name, nothing is executed here; the extracted code, its
    dependencies and the selected environment are only stored in
    ``sandbox_state``.

    Returns:
        ``(sandbox_state, code, environment_as_string)``. ``code`` and the
        environment are ``""`` when ``message`` is empty or nothing could be
        extracted.
    """
    if not message:
        return sandbox_state, "", ""

    # Extract code using the same logic as code_runner.py
    extracted = extract_code_from_markdown(message=message, enable_auto_env=True)
    if extracted is None:
        return sandbox_state, "", ""

    code, _code_language, code_dependencies, env_selection = extracted

    # Remember what was extracted so a later run can pick it up.
    sandbox_state['code_to_execute'] = code
    sandbox_state['code_dependencies'] = code_dependencies
    sandbox_state['auto_selected_sandbox_environment'] = env_selection

    return sandbox_state, code, str(env_selection)
178
+
179
def _format_chat_stats(state):
    """Return the conversation-statistics line for one chat state (zeros when it is None)."""
    messages = state["messages"] if state else []
    turn_count = sum(1 for msg in messages if msg["role"] == "assistant" and msg["content"])
    return f"**Conversation:** {turn_count} turns | **Total Messages:** {len(messages)}"


def _run_extracted_code(index, label, code, sandbox_state):
    """Run one model's extracted code and build the values shown in its sandbox panel.

    Returns ``(component_update, sandbox_output, sandbox_view)``. When there is
    no code, the component stays hidden and both strings are empty.
    """
    component_update = gr.update(visible=False)
    if not code.strip():
        return component_update, "", ""

    dependencies = sandbox_state.get('code_dependencies', ([], []))
    print(f"DEBUG: Running code{index} with dependencies: {dependencies}")
    sandbox_url, sandbox_output, sandbox_error = run_sandbox_code(sandbox_state, code, dependencies)
    output_preview = sandbox_output[:100] if sandbox_output else 'None'
    error_preview = sandbox_error[:100] if sandbox_error else 'None'
    print(f"DEBUG: Code{index} result - URL: {sandbox_url}, Output: {output_preview}, Error: {error_preview}")

    env_type = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
    print(f"DEBUG: Model {label} environment type: {env_type}")

    # A non-empty URL means the run produced something to embed in the sandbox component.
    if sandbox_url:
        component_update = gr.update(value=(sandbox_url, True, []), visible=True)

    # Same heading level for both models; the blank line keeps the second
    # heading from being glued to the end of the output text.
    sections = []
    if sandbox_output:
        sections.append(f"## Output\n{sandbox_output}")
    if sandbox_error:
        sections.append(f"## Errors\n{sandbox_error}")
    return component_update, sandbox_output, "\n\n".join(sections)


def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a, model_b):
    """Send the prompt to both models, then extract and run the code each one returns.

    Args:
        state0, state1: Chat states for model A and model B; created from
            ``model_a`` / ``model_b`` when None.
        text: The user's prompt.
        temperature, max_tokens: Generation settings forwarded to the models.
        model_a, model_b: Model names used only when a state must be created.

    Returns:
        A 20-tuple, in this order: state0, state1, chat0, chat1, response0,
        response1, code0, code1, env0, env1, sandbox_state0, sandbox_state1,
        sandbox_output0, sandbox_output1, sandbox_component_update0,
        sandbox_component_update1, chat_stats_a, chat_stats_b,
        sandbox_view_a, sandbox_view_b. The empty-prompt path returns the same
        number of values, with placeholders for everything but the states and
        statistics.
    """
    if not text or not text.strip():
        # Nothing to send. The tuple must have the same arity as the normal
        # return below, otherwise the caller receives a mismatched output list.
        return (
            state0, state1,
            "", "", "",
            "", "", "",
            "", "", "",
            "", "", "",
            gr.update(visible=False), gr.update(visible=False),
            _format_chat_stats(state0), _format_chat_stats(state1),
            "", "",
        )

    # Initialize states if needed
    if state0 is None or state1 is None:
        if state0 is None:
            state0 = create_chat_state(model_a)
        if state1 is None:
            state1 = create_chat_state(model_b)
        print(f"Models: {state0['model_name']} vs {state1['model_name']}")

    # Add user message to both states
    state0["messages"].append({"role": "user", "content": text})
    state1["messages"].append({"role": "user", "content": text})

    # Generate responses
    state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
    state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)

    # Add the assistant responses to the message history
    state0["messages"].append({"role": "assistant", "content": response0})
    state1["messages"].append({"role": "assistant", "content": response1})

    # Format chat history for display
    chat0 = format_chat_history(state0["messages"])
    chat1 = format_chat_history(state1["messages"])

    # Make sure each chat state carries a sandbox state, then extract the code into it.
    if "sandbox_state" not in state0:
        state0["sandbox_state"] = create_sandbox_state()
    if "sandbox_state" not in state1:
        state1["sandbox_state"] = create_sandbox_state()
    sandbox_state0 = state0["sandbox_state"]
    sandbox_state1 = state1["sandbox_state"]

    _, code0, env0 = extract_and_execute_code(response0, sandbox_state0)
    _, code1, env1 = extract_and_execute_code(response1, sandbox_state1)

    # Run whatever was extracted; previous sandbox output is replaced on every new message.
    sandbox_component_update0, sandbox_output0, sandbox_view_a = _run_extracted_code(0, "A", code0, sandbox_state0)
    sandbox_component_update1, sandbox_output1, sandbox_view_b = _run_extracted_code(1, "B", code1, sandbox_state1)

    # Conversation statistics
    chat_stats_a = _format_chat_stats(state0)
    chat_stats_b = _format_chat_stats(state1)

    return (
        state0, state1,
        chat0, chat1,
        response0, response1,
        code0, code1,
        env0, env1,
        sandbox_state0, sandbox_state1,
        sandbox_output0, sandbox_output1,
        sandbox_component_update0, sandbox_component_update1,
        chat_stats_a, chat_stats_b,
        sandbox_view_a, sandbox_view_b,
    )
277
+
278
def format_chat_history(messages):
    """Convert stored messages into the role/content dicts used by the chat display.

    Only user and assistant messages with non-empty content are kept, and only
    their ``role`` and ``content`` keys are copied.
    """
    displayable_roles = ("user", "assistant")
    return [
        {"role": entry["role"], "content": entry["content"]}
        for entry in messages
        if entry["role"] in displayable_roles and entry["content"]
    ]
297
+
298
def clear_chat(state0, state1):
    """Reset both sandboxes and return blank values for every UI output.

    The sandbox state inside each chat state (when present) is reset in place,
    a new pair of models is drawn with ``get_random_models``, and a 22-tuple
    of cleared values is returned: two ``None`` chat states, twelve empty
    strings, two hidden sandbox-component updates, two zeroed statistics
    lines, two empty sandbox views and the two model labels.
    """
    for chat_state in (state0, state1):
        if chat_state and "sandbox_state" in chat_state:
            reset_sandbox_state(chat_state["sandbox_state"])

    # Draw the model pair whose names go into the label outputs.
    model_a, model_b = get_random_models()

    zero_stats = "**Conversation:** 0 turns | **Total Messages:** 0"
    return (
        None, None,
        "", "", "",
        "", "", "",
        "", "", "",
        "", "", "",
        gr.update(visible=False), gr.update(visible=False),
        zero_stats, zero_stats,
        "", "",
        f"**Model A:** {model_a}", f"**Model B:** {model_b}",
    )
309
+
310
def run_sandbox_code(sandbox_state: dict, code: str, dependencies: tuple) -> tuple[str, str, str]:
    """Run code in the sandbox environment selected for this model.

    The environment is ``auto_selected_sandbox_environment`` when set,
    otherwise ``sandbox_environment``. Anything not handled explicitly falls
    back to the Python interpreter.

    Args:
        sandbox_state: Sandbox state dict; ``code_to_execute`` and
            ``code_dependencies`` are updated, and ``sandbox_id`` is updated
            for the web-based sandboxes.
        code: Source code to run.
        dependencies: Dependency information passed through to the runners.

    Returns:
        ``(sandbox_url, output, error)``. Web-based sandboxes fill
        ``sandbox_url`` and leave ``output`` empty; the code runners do the
        opposite. Any exception is reported as its message in ``error``.
    """
    if not code.strip():
        return "", "", "No code to run"

    # Update sandbox state
    sandbox_state['code_to_execute'] = code
    sandbox_state['code_dependencies'] = dependencies

    # Determine environment
    env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
    sandbox_id = sandbox_state.get('sandbox_id')

    try:
        # Mermaid and plain JavaScript are first wrapped in an HTML page and
        # then served by the HTML sandbox like any other HTML snippet.
        if env == SandboxEnvironment.MERMAID:
            code, env = mermaid_to_html(code, theme='light'), SandboxEnvironment.HTML
        elif env == SandboxEnvironment.JAVASCRIPT_RUNNER:
            # NOTE(review): javascript_to_html is not in this file's visible
            # import list; presumably it lives next to mermaid_to_html in
            # sandbox.code_runner - confirm. A failed import is reported
            # through the except clause below.
            from sandbox.code_runner import javascript_to_html
            code, env = javascript_to_html(code), SandboxEnvironment.HTML

        # Sandboxes returning a (url, sandbox_id, stderr) tuple.
        for member, runner in (
            (SandboxEnvironment.HTML, run_html_sandbox),
            (SandboxEnvironment.GRADIO, run_gradio_sandbox),
            (SandboxEnvironment.STREAMLIT, run_streamlit_sandbox),
        ):
            if env == member:
                sandbox_url, new_sandbox_id, stderr = runner(code, dependencies, sandbox_id)
                sandbox_state['sandbox_id'] = new_sandbox_id
                return sandbox_url, "", stderr

        # Sandboxes returning a dict with sandbox_url / sandbox_id / stderr keys.
        for member, runner in (
            (SandboxEnvironment.REACT, run_react_sandbox),
            (SandboxEnvironment.VUE, run_vue_sandbox),
            (SandboxEnvironment.PYGAME, run_pygame_sandbox),
        ):
            if env == member:
                result = runner(code, dependencies, sandbox_id)
                sandbox_state['sandbox_id'] = result['sandbox_id']
                return result['sandbox_url'], "", result['stderr']

        # Code runners returning (output, stderr); they take no dependencies.
        for member, runner in (
            (SandboxEnvironment.C_RUNNER, run_c_code),
            (SandboxEnvironment.CPP_RUNNER, run_cpp_code),
            (SandboxEnvironment.JAVA_RUNNER, run_java_code),
            (SandboxEnvironment.GOLANG_RUNNER, run_golang_code),
            (SandboxEnvironment.RUST_RUNNER, run_rust_code),
        ):
            if env == member:
                output, stderr = runner(code, sandbox_id)
                return "", output, stderr

        # PYTHON_RUNNER, and the fallback for every other environment.
        output, stderr = run_code_interpreter(code, 'python', dependencies)
        return "", output, stderr

    except Exception as e:
        # Deliberate catch-all: a sandbox failure is shown to the user, not raised.
        return "", "", str(e)
396
+
397
+
398
+
399
def build_ui():
    """Build the Gradio UI for the coding arena with integrated sandboxes.

    Layout, top to bottom: hidden model-name labels, a sandbox accordion with
    a View/Code tab pair per model, a collapsed chat accordion, the prompt
    box, control buttons, advanced sampling settings and example prompts.

    The active model pair is stored in ``gr.State`` components so that the
    "New Random Models" button changes the models that later prompts are sent
    to, not just the labels.

    Returns:
        The assembled ``gr.Blocks`` app (not launched).
    """
    # Initial random pair; seeds the model state components created below.
    model_a, model_b = get_random_models()

    with gr.Blocks(title="BigCodeArena") as demo:
        gr.Markdown("# BigCodeArena - Start Your Vibe Coding!")

        # Model display (non-interactive)
        with gr.Row():
            with gr.Column():
                model_display_a = gr.Markdown(f"**Model A:** {model_a}", visible=False)
            with gr.Column():
                model_display_b = gr.Markdown(f"**Model B:** {model_b}", visible=False)

        # Sandbox section with tabs for each model - Collapsible and open by default
        with gr.Accordion("🏗️ Code Execution & Sandbox", open=True):

            with gr.Row():
                # Model A Sandbox
                with gr.Column():
                    gr.Markdown("### Model A Sandbox")
                    with gr.Tabs():
                        with gr.Tab("View"):
                            sandbox_view_a = gr.Markdown("**Sandbox output will appear here automatically**")
                            sandbox_component_a = SandboxComponent(
                                value=("", False, []),
                                label="Model A Sandbox",
                                visible=False
                            )
                        with gr.Tab("Code"):
                            code_a = gr.Code(
                                label="Extracted Code",
                                language="python",
                                lines=8,
                                interactive=False
                            )

                # Model B Sandbox
                with gr.Column():
                    gr.Markdown("### Model B Sandbox")
                    with gr.Tabs():
                        with gr.Tab("View"):
                            sandbox_view_b = gr.Markdown("**Sandbox output will appear here automatically**")
                            sandbox_component_b = SandboxComponent(
                                value=("", False, []),
                                label="Model B Sandbox",
                                visible=False
                            )
                        with gr.Tab("Code"):
                            code_b = gr.Code(
                                label="Extracted Code",
                                language="python",
                                lines=8,
                                interactive=False
                            )

        # Main chat interface - Collapsible and hidden by default
        with gr.Accordion("💬 Chat Interface", open=False):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Model A")
                    chatbot_a = gr.Chatbot(
                        label="Model A",
                        height=300,
                        show_copy_button=True,
                        type="messages"
                    )
                    chat_stats_a = gr.Markdown("**Conversation:** 0 turns")

                with gr.Column():
                    gr.Markdown("## Model B")
                    chatbot_b = gr.Chatbot(
                        label="Model B",
                        height=300,
                        show_copy_button=True,
                        type="messages"
                    )
                    chat_stats_b = gr.Markdown("**Conversation:** 0 turns")

        # Input section
        with gr.Row():
            text_input = gr.Textbox(
                label="Enter your coding prompt",
                placeholder="e.g., 'Write a Python function to calculate fibonacci numbers'",
                lines=1
            )

        # Control buttons
        with gr.Row():
            send_btn = gr.Button("🚀 Send to Both Models", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
            refresh_models_btn = gr.Button("🔄 New Random Models", variant="secondary")

        # Advanced Settings (Collapsible)
        with gr.Accordion("⚙️ Advanced Settings", open=False):
            with gr.Row():
                with gr.Column(scale=1):
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )
                with gr.Column(scale=1):
                    max_tokens = gr.Slider(
                        minimum=100,
                        maximum=4000,
                        value=1000,
                        step=100,
                        label="Max Tokens"
                    )

        # Event handlers
        # Create state variables for the run buttons
        state0_var = gr.State()
        state1_var = gr.State()

        # Active model pair. These are the components the send handler reads
        # and the refresh handler writes, so a refresh really swaps models.
        model_a_state = gr.State(model_a)
        model_b_state = gr.State(model_b)

        # Create response components (hidden but needed for outputs)
        response_a = gr.Markdown("", visible=False)
        response_b = gr.Markdown("", visible=False)

        # Create a wrapper function that handles both the main execution and state update
        def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
            """Run generation for both models and fan the result out to the UI.

            ``model_a``/``model_b`` are the values of the model state
            components at click time (they shadow the outer initial pair).
            The returned tuple is positional and must stay aligned with the
            ``outputs`` list of ``send_btn.click`` below.
            """
            print(f"DEBUG: send_and_update_state called with text: {text[:50] if text else 'None'}")
            # Call the main function
            result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
            # Extract the state from the result
            new_state0, new_state1 = result[0], result[1]
            print(f"DEBUG: send_and_update_state returning new_state0: {type(new_state0)}, new_state1: {type(new_state1)}")
            # Return all the original outputs plus the updated state for run buttons
            # Make sure all outputs are properly formatted for their expected types
            # (result[8] and result[9] are intentionally not forwarded.)
            return (
                new_state0,  # state0
                new_state1,  # state1
                result[2],  # chatbot_a (chat0)
                result[3],  # chatbot_b (chat1)
                result[4],  # response_a (response0)
                result[5],  # response_b (response1)
                result[6],  # code_a (code0)
                result[7],  # code_b (code1)
                result[10],  # sandbox_state0
                result[11],  # sandbox_state1
                result[12],  # sandbox_output0
                result[13],  # sandbox_output1
                result[14],  # sandbox_component_update0
                result[15],  # sandbox_component_update1
                result[16],  # chat_stats_a
                result[17],  # chat_stats_b
                result[18],  # sandbox_view_a
                result[19],  # sandbox_view_b
                new_state0,  # state0_var
                new_state1,  # state1_var
                "",  # Clear text input
                f"**Model A:** {model_a}",  # Update model display A
                f"**Model B:** {model_b}",  # Update model display B
            )

        send_btn.click(
            fn=send_and_update_state,
            inputs=[
                state0_var,  # state0
                state1_var,  # state1
                text_input,
                temperature,
                max_tokens,
                model_a_state,  # Currently selected model A
                model_b_state  # Currently selected model B
            ],
            outputs=[
                state0_var,  # state0
                state1_var,  # state1
                chatbot_a,
                chatbot_b,
                response_a,
                response_b,
                code_a,
                code_b,
                gr.State(),  # sandbox_state0
                gr.State(),  # sandbox_state1
                sandbox_view_a,  # sandbox output for model A
                sandbox_view_b,  # sandbox output for model B
                sandbox_component_a,  # sandbox component for model A
                sandbox_component_b,  # sandbox component for model B
                chat_stats_a,  # Conversation statistics for model A
                chat_stats_b,  # Conversation statistics for model B
                sandbox_view_a,  # Sandbox view for model A
                sandbox_view_b,  # Sandbox view for model B
                state0_var,  # Updated state for run button A
                state1_var,  # Updated state for run button B
                text_input,  # Clear the text input after sending
                model_display_a,  # Update model display A
                model_display_b,  # Update model display B
            ]
        )

        # NOTE(review): the inputs here are fresh, anonymous State components,
        # so clear_chat presumably always receives None for both arguments.
        # Left unchanged because clear_chat's contract is not visible here.
        clear_btn.click(
            fn=clear_chat,
            inputs=[gr.State(), gr.State()],
            outputs=[
                gr.State(None),
                gr.State(None),
                chatbot_a,
                chatbot_b,
                response_a,
                response_b,
                code_a,
                code_b,
                gr.State(None),
                gr.State(None),
                sandbox_view_a,
                sandbox_view_b,
                sandbox_component_a,
                sandbox_component_b,
                state0_var,  # Reset state for run button A
                state1_var,  # Reset state for run button B
                chat_stats_a,  # Reset conversation statistics for model A
                chat_stats_b,  # Reset conversation statistics for model B
                sandbox_view_a,  # Reset sandbox view for model A
                sandbox_view_b,  # Reset sandbox view for model B
                model_display_a,  # Reset model display A
                model_display_b,  # Reset model display B
            ]
        )

        # Refresh models button handler
        def refresh_models():
            """Pick a new random pair and reset all per-conversation UI.

            The returned tuple is positional and must stay aligned with the
            ``outputs`` list of ``refresh_models_btn.click`` below. The last
            two values update the model state components read on send.
            """
            new_model_a, new_model_b = get_random_models()
            return (
                None,  # Reset state0
                None,  # Reset state1
                [],  # Clear chat A (messages-format chatbot expects a list)
                [],  # Clear chat B
                "",  # Clear response A
                "",  # Clear response B
                "",  # Clear code A
                "",  # Clear code B
                None,  # Reset sandbox state A (a value, not a component)
                None,  # Reset sandbox state B
                "",  # Clear sandbox view A
                "",  # Clear sandbox view B
                gr.update(visible=False),  # Hide sandbox component A
                gr.update(visible=False),  # Hide sandbox component B
                "**Conversation:** 0 turns | **Total Messages:** 0",  # Reset stats A
                "**Conversation:** 0 turns | **Total Messages:** 0",  # Reset stats B
                "",  # Clear sandbox view A
                "",  # Clear sandbox view B
                None,  # Reset state0_var
                None,  # Reset state1_var
                f"**Model A:** {new_model_a}",  # Update model display A
                f"**Model B:** {new_model_b}",  # Update model display B
                new_model_a,  # Model A used by subsequent sends
                new_model_b,  # Model B used by subsequent sends
            )

        refresh_models_btn.click(
            fn=refresh_models,
            inputs=[],
            outputs=[
                state0_var,
                state1_var,
                chatbot_a,
                chatbot_b,
                response_a,
                response_b,
                code_a,
                code_b,
                gr.State(None),
                gr.State(None),
                sandbox_view_a,
                sandbox_view_b,
                sandbox_component_a,
                sandbox_component_b,
                chat_stats_a,
                chat_stats_b,
                sandbox_view_a,
                sandbox_view_b,
                state0_var,
                state1_var,
                model_display_a,  # Update model display A
                model_display_b,  # Update model display B
                model_a_state,  # Store the new model A
                model_b_state,  # Store the new model B
            ]
        )

        # Examples
        gr.Examples(
            examples=[
                ["使用SVG绘制春节主题的动态图案,包括:1)一个红色的灯笼,带有金色的流苏 2)一个金色的福字,使用书法字体 3)背景添加一些烟花效果 4)在灯笼和福字周围添加一些祥云图案。确保图案布局美观,颜色搭配符合春节传统风格。"],
                ["SVGを使用して日本の伝統的な和柄パターンを描画してください。1)波紋(さざなみ)模様 2)市松模様 3)麻の葉模様 4)雷文(らいもん)模様を含めてください。色は伝統的な日本の色(藍色、朱色、金色など)を使用し、レイアウトはバランスよく配置してください。"],
                ["Write HTML with P5.js that simulates 25 particles in a vacuum space of a cylindrical container, bouncing within its boundaries. Use different colors for each ball and ensure they leave a trail showing their movement. Add a slow rotation of the container to give better view of what's going on in the scene. Make sure to create proper collision detection and physic rules to ensure particles remain in the container. Add an external spherical container. Add a slow zoom in and zoom out effect to the whole scene."],
                ["Write a Python script to scrape NVIDIA's stock price for the past month using the yfinance library. Clean the data and create an interactive visualization using Matplotlib. Include: 1) A candlestick chart showing daily price movements 2) A line chart with 7-day and 30-day moving averages. Add hover tooltips showing exact values and date. Make the layout professional with proper titles and axis labels."],
                ["Write a Python script that uses the Gradio library to create a functional calculator. The calculator should support basic arithmetic operations: addition, subtraction, multiplication, and division. It should have two input fields for numbers and a dropdown menu to select the operation."],
                ["Write a Todo list app using React.js. The app should allow users to add, delete, and mark tasks as completed. Include features like filtering tasks by status (completed, active), sorting tasks by priority, and displaying the total number of tasks."],
                ["Write a Python script using the Streamlit library to create a web application for uploading and displaying files. The app should allow users to upload files of type .csv or .txt. If a .csv file is uploaded, display its contents as a table using Streamlit's st.dataframe() method. If a .txt file is uploaded, display its content as plain text."],
                ["Write a Python function to solve the Trapping Rain Water problem. The function should take a list of non-negative integers representing the height of bars in a histogram and return the total amount of water trapped between the bars after raining. Use an efficient algorithm with a time complexity of O(n)."],
                ["Create a simple Pygame script for a game where the player controls a bouncing ball that changes direction when it collides with the edges of the window. Add functionality for the player to control a paddle using arrow keys, aiming to keep the ball from touching the bottom of the screen. Include basic collision detection and a scoring system that increases as the ball bounces off the paddle. You need to add clickable buttons to start the game, and reset the game."],
                ["Create a financial management Dashboard using Vue.js, focusing on local data handling without APIs. Include features like a clean dashboard for tracking income and expenses, dynamic charts for visualizing finances, and a budget planner. Implement functionalities for adding, editing, and deleting transactions, as well as filtering by date or category. Ensure responsive design and smooth user interaction for an intuitive experience."],
                ["Create a Mermaid diagram to visualize a flowchart of a user login process. Include the following steps: User enters login credentials; Credentials are validated; If valid, the user is directed to the dashboard; If invalid, an error message is shown, and the user can retry or reset the password."],
                ["Write a Python function to calculate the Fibonacci sequence up to n numbers. Then write test cases to verify the function works correctly for edge cases like negative numbers, zero, and large inputs."],
                ["Build an HTML page for a Kanban board with three columns with Vue.js: To Do, In Progress, and Done. Each column should allow adding, moving, and deleting tasks. Implement drag-and-drop functionality using Vue Draggable and persist the state using Vuex."],
                ["Develop a Streamlit app that takes a CSV file as input and provides: 1) Basic statistics about the data 2) Interactive visualizations using Plotly 3) A data cleaning interface with options to handle missing values 4) An option to download the cleaned data."],
                ["Write an HTML page with embedded JavaScript that creates an interactive periodic table. Each element should display its properties on hover and allow filtering by category (metals, non-metals, etc.). Include a search bar to find elements by name or symbol."],
                ["Here's a Python function that sorts a list of dictionaries by a specified key:\n\n```python\ndef sort_dicts(data, key):\n return sorted(data, key=lambda x: x[key])\n```\n\nWrite test cases to verify the function works correctly for edge cases like empty lists, missing keys, and different data types. If you use unittest, please use `unittest.main(argv=['first-arg-is-ignored'], exit=False)` to run the tests."],
                ["Create a React component for a fitness tracker that shows: 1) Daily step count 2) Calories burned 3) Distance walked 4) A progress bar for daily goals."],
                ["Build a Vue.js dashboard for monitoring server health. Include: 1) Real-time CPU and memory usage graphs 2) Disk space visualization 3) Network activity monitor 4) Alerts for critical thresholds."],
                ["Write a C program that calculates and prints the first 100 prime numbers in a formatted table with 10 numbers per row. Include a function to check if a number is prime and use it in your solution."],
                ["Write a C++ program that implements a simple calculator using object-oriented programming. Create a Calculator class with methods for addition, subtraction, multiplication, and division. Include error handling for division by zero."],
                ["Write a Rust program that generates and prints a Pascal's Triangle with 10 rows. Format the output to center-align the numbers in each row."],
                ["Write a Java program that simulates a simple bank account system. Create a BankAccount class with methods for deposit, withdrawal, and balance inquiry. Include error handling for insufficient funds and demonstrate its usage with a few transactions."],
                ["Write a Go program that calculates and prints the Fibonacci sequence up to the 50th number. Format the output in a table with 5 numbers per row and include the index of each Fibonacci number."],
                ["Write a C program that calculates and prints a histogram of letter frequencies from a predefined string. Use ASCII art to display the histogram vertically."],
                ["Write a C++ program that implements a simple stack data structure with push, pop, and peek operations. Demonstrate its usage by reversing a predefined string using the stack."],
                ["Write a Rust program that calculates and prints the first 20 happy numbers. Include a function to check if a number is happy and use it in your solution."],
                ["Write a Java program that implements a simple binary search algorithm. Create a sorted array of integers and demonstrate searching for different values, including cases where the value is found and not found."],
                ["Write a Go program that generates and prints a multiplication table from 1 to 12. Format the output in a neat grid with proper alignment."],
            ],
            example_labels=[
                "🏮 春节主题图案",
                "🎎 日本の伝統的な和柄パターン",
                "🌐 Particles in a Spherical Container",
                "💹 NVIDIA Stock Analysis with Matplotlib",
                "🧮 Calculator with Gradio",
                "📝 Todo List App with React.js",
                "📂 File Upload Web App with Streamlit",
                "💦 Solve Trapping Rain Water Problem",
                "🎮 Pygame Bouncing Ball Game",
                "💳 Financial Dashboard with Vue.js",
                "🔑 User Login Process Flowchart",
                "🔢 Fibonacci Sequence with Tests",
                "📌 Vue Kanban Board",
                "🧹 Streamlit Data Cleaning App",
                "⚗️ Interactive Periodic Table with React",
                "📚 Dictionary Sorting Tests in Python",
                "🏋️‍♂️ Fitness Tracker with React",
                "🖥️ Vue Server Monitoring",
                "🔢 Prime Numbers in C",
                "🧮 OOP Calculator in C++",
                "🔷 Pascal's Triangle in Rust",
                "🏛️ Bank Account Simulation in Java",
                "🐰 Fibonacci Sequence in Go",
                "📊 Letter Frequency Histogram in C",
                "📦 Stack Implementation in C++",
                "😄 Happy Numbers in Rust",
                "🔎 Binary Search in Java",
                "✖️ Multiplication Table in Go",
            ],
            examples_per_page=100,
            label="Example Prompts",
            inputs=[text_input],
        )

    return demo
751
+
752
def main():
    """Log the configured models, build the UI and start the Gradio server.

    The server listens on all interfaces (0.0.0.0) on port 7860 with debug
    mode enabled and public sharing disabled.
    """
    print("🚀 Starting Simple BigCodeArena...")

    if not available_models:
        print("⚠️ No models found in config!")
    else:
        print(f"🔍 Available models: {', '.join(available_models)}")
        # NOTE: build_ui() calls get_random_models() again on its own, so the
        # pair printed here is not necessarily the pair the UI starts with.
        model_a, model_b = get_random_models()
        print(f"🎲 Randomly selected models for this session:")
        print(f" Model A: {model_a}")
        print(f" Model B: {model_b}")

    # Assemble the Blocks app, then serve it.
    app = build_ui()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)
775
+
776
# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
chat_state.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Chat State and Logging
3
+ '''
4
+
5
+ import json
6
+ import os
7
+ from typing import Any, Literal, Optional
8
+ from conversation import Conversation
9
+
10
+
11
+ import datetime
12
+ import uuid
13
+
14
+
15
# Resolved once at import time from the LOGDIR environment variable;
# falls back to "./logs" when the variable is unset.
LOG_DIR = os.getenv("LOGDIR", "./logs")
'''
The default output dir of log files
'''
19
+
20
+
21
+ class ModelChatState:
22
+ '''
23
+ The state of a chat with a model.
24
+ '''
25
+
26
+ is_vision: bool
27
+ '''
28
+ Whether the model is vision based.
29
+ '''
30
+
31
+ conv: Conversation
32
+ '''
33
+ The conversation
34
+ '''
35
+
36
+ conv_id: str
37
+ '''
38
+ Unique identifier for the model conversation.
39
+ Unique per chat per model.
40
+ '''
41
+
42
+ chat_session_id: str
43
+ '''
44
+ Unique identifier for the chat session.
45
+ Unique per chat. The two battle models share the same chat session id.
46
+ '''
47
+
48
+ skip_next: bool
49
+ '''
50
+ Flag to indicate skipping the next operation.
51
+ '''
52
+
53
+ model_name: str
54
+ '''
55
+ Name of the model being used.
56
+ '''
57
+
58
+ oai_thread_id: Optional[str]
59
+ '''
60
+ Identifier for the OpenAI thread.
61
+ '''
62
+
63
+ has_csam_image: bool
64
+ '''
65
+ Indicates if a CSAM image has been uploaded.
66
+ '''
67
+
68
+ regen_support: bool
69
+ '''
70
+ Indicates if regeneration is supported for the model.
71
+ '''
72
+
73
+ chat_start_time: datetime.datetime
74
+ '''
75
+ Chat start time.
76
+ '''
77
+
78
+ chat_mode: Literal['battle_anony', 'battle_named', 'direct']
79
+ '''
80
+ Chat mode.
81
+ '''
82
+
83
+ curr_response_type: Literal['chat_multi', 'chat_single', 'regenerate_multi', 'regenerate_single'] | None
84
+ '''
85
+ Current response type. Used for logging.
86
+ '''
87
+
88
+ @staticmethod
89
+ def create_chat_session_id() -> str:
90
+ '''
91
+ Create a new chat session id.
92
+ '''
93
+ return uuid.uuid4().hex
94
+
95
+ @staticmethod
96
+ def create_battle_chat_states(
97
+ model_name_1: str, model_name_2: str,
98
+ chat_mode: Literal['battle_anony', 'battle_named'],
99
+ is_vision: bool,
100
+ ) -> tuple['ModelChatState', 'ModelChatState']:
101
+ '''
102
+ Create two chat states for a battle.
103
+ '''
104
+ chat_session_id = ModelChatState.create_chat_session_id()
105
+ return (
106
+ ModelChatState(model_name_1, chat_mode,
107
+ is_vision=is_vision,
108
+ chat_session_id=chat_session_id),
109
+ ModelChatState(model_name_2, chat_mode,
110
+ is_vision=is_vision,
111
+ chat_session_id=chat_session_id),
112
+ )
113
+
114
+
115
+ def __init__(self,
116
+ model_name: str,
117
+ chat_mode: Literal['battle_anony', 'battle_named', 'direct'],
118
+ is_vision: bool,
119
+ chat_session_id: str | None = None,
120
+ ):
121
+ from fastchat.model.model_adapter import get_conversation_template
122
+
123
+ self.conv = get_conversation_template(model_name)
124
+ self.conv_id = uuid.uuid4().hex
125
+ # if no chat session id is provided, use the conversation id
126
+ self.chat_session_id = chat_session_id if chat_session_id else self.conv_id
127
+ self.chat_start_time = datetime.datetime.now()
128
+ self.chat_mode = chat_mode
129
+
130
+ self.skip_next = False
131
+ self.model_name = model_name
132
+ self.oai_thread_id = None
133
+ self.is_vision = is_vision
134
+
135
+ # NOTE(chris): This could be sort of a hack since it assumes the user only uploads one image. If they can upload multiple, we should store a list of image hashes.
136
+ self.has_csam_image = False
137
+
138
+ self.regen_support = True
139
+ if "browsing" in model_name:
140
+ self.regen_support = False
141
+ self.init_system_prompt(self.conv, is_vision)
142
+
143
+ def init_system_prompt(self, conv, is_vision):
144
+ system_prompt = conv.get_system_message(is_vision)
145
+ if len(system_prompt) == 0:
146
+ return
147
+ current_date = datetime.datetime.now().strftime("%Y-%m-%d")
148
+ system_prompt = system_prompt.replace("{{currentDateTime}}", current_date)
149
+
150
+ current_date_v2 = datetime.datetime.now().strftime("%d %b %Y")
151
+ system_prompt = system_prompt.replace("{{currentDateTimev2}}", current_date_v2)
152
+
153
+ current_date_v3 = datetime.datetime.now().strftime("%B %Y")
154
+ system_prompt = system_prompt.replace("{{currentDateTimev3}}", current_date_v3)
155
+ conv.set_system_message(system_prompt)
156
+
157
+ def set_response_type(
158
+ self,
159
+ response_type: Literal['chat_multi', 'chat_single', 'regenerate_multi', 'regenerate_single']
160
+ ):
161
+ '''
162
+ Set the response type for the chat state.
163
+ '''
164
+ self.curr_response_type = response_type
165
+
166
+ def to_gradio_chatbot(self):
167
+ '''
168
+ Convert to a Gradio chatbot.
169
+ '''
170
+ return self.conv.to_gradio_chatbot()
171
+
172
+ def get_conv_log_filepath(self, path_prefix: str):
173
+ '''
174
+ Get the filepath for the conversation log.
175
+
176
+ Expected directory structure:
177
+ softwarearenlog/
178
+ └── YEAR_MONTH_DAY/
179
+ ├── conv_logs/
180
+ └── sandbox_logs/
181
+ '''
182
+ date_str = self.chat_start_time.strftime('%Y_%m_%d')
183
+ filepath = os.path.join(
184
+ path_prefix,
185
+ date_str,
186
+ 'conv_logs',
187
+ self.chat_mode,
188
+ f"conv-log-{self.chat_session_id}.json"
189
+ )
190
+ return filepath
191
+
192
+ def to_dict(self):
193
+ base = self.conv.to_dict()
194
+ base.update(
195
+ {
196
+ "chat_session_id": self.chat_session_id,
197
+ "conv_id": self.conv_id,
198
+ "chat_mode": self.chat_mode,
199
+ "chat_start_time": self.chat_start_time,
200
+ "model_name": self.model_name,
201
+ }
202
+ )
203
+
204
+ if self.is_vision:
205
+ base.update({"has_csam_image": self.has_csam_image})
206
+ return base
207
+
208
+ def generate_vote_record(
209
+ self,
210
+ vote_type: str,
211
+ ip: str
212
+ ) -> dict[str, Any]:
213
+ '''
214
+ Generate a vote record for telemertry.
215
+ '''
216
+ data = {
217
+ "tstamp": round(datetime.datetime.now().timestamp(), 4),
218
+ "type": vote_type,
219
+ "model": self.model_name,
220
+ "state": self.to_dict(),
221
+ "ip": ip,
222
+ }
223
+ return data
224
+
225
+ def generate_response_record(
226
+ self,
227
+ gen_params: dict[str, Any],
228
+ start_ts: float,
229
+ end_ts: float,
230
+ ip: str
231
+ ) -> dict[str, Any]:
232
+ '''
233
+ Generate a vote record for telemertry.
234
+ '''
235
+ data = {
236
+ "tstamp": round(datetime.datetime.now().timestamp(), 4),
237
+ "type": self.curr_response_type,
238
+ "model": self.model_name,
239
+ "start_ts": round(start_ts, 4),
240
+ "end_ts": round(end_ts, 4),
241
+ "gen_params": gen_params,
242
+ "state": self.to_dict(),
243
+ "ip": ip,
244
+ }
245
+ return data
246
+
247
+
248
def save_log_to_local(
    log_data: dict[str, Any],
    log_path: str,
    write_mode: Literal['overwrite', 'append'] = 'append'
):
    '''
    Save one log record locally as a single JSON line.

    Args:
        log_data: The record to write. Values that are not JSON serialisable
            are converted with str().
        log_path: Destination file. Missing parent directories are created;
            a bare filename writes into the current working directory.
        write_mode: 'overwrite' truncates the file first; any other value
            appends to it.
    '''
    log_json = json.dumps(log_data, default=str)

    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    file_mode = "w" if write_mode == 'overwrite' else 'a'
    with open(log_path, file_mode, encoding="utf-8") as fout:
        fout.write(log_json + "\n")
completion.py ADDED
@@ -0,0 +1,1304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import yaml
5
+ import random
6
+ import shortuuid
7
+
8
+ import requests
9
+ from typing import Optional
10
+ import boto3
11
+
12
+ from glob import glob
13
+ from tqdm import tqdm
14
+
15
# API setting constants
# Upper bound on attempts per completion call (used as the retry-loop range).
API_MAX_RETRY = 50
# Seconds to sleep after a rate-limit error before the next attempt.
API_RETRY_SLEEP = 10
# Initial output of a completion call; it stays this value if no attempt succeeds.
API_ERROR_OUTPUT = None

# api_type -> completion function, filled by the @register_api decorator.
registered_api_completion = {}
# engine_type -> completion function, filled by the @register_engine decorator.
registered_engine_completion = {}
22
+
23
+
24
def register_api(api_type):
    """Return a decorator that registers a completion function under *api_type*.

    The decorated function is stored in ``registered_api_completion`` and
    returned unchanged, so it stays directly callable.
    """
    def _store(completion_fn):
        registered_api_completion[api_type] = completion_fn
        return completion_fn

    return _store
30
+
31
+
32
def register_engine(engine_type):
    """Return a decorator that registers a completion function under *engine_type*.

    The decorated function is stored in ``registered_engine_completion`` and
    returned unchanged, so it stays directly callable.
    """
    def _store(completion_fn):
        registered_engine_completion[engine_type] = completion_fn
        return completion_fn

    return _store
38
+
39
+
40
def load_questions(question_file: str):
    """Load questions from a JSON Lines file.

    Args:
        question_file: Path to a file with one JSON document per line.

    Returns:
        A list with the parsed object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions = []
    with open(question_file, "r", encoding="utf-8") as ques_file:
        for line in ques_file:
            # Lines read from a file keep their "\n", so a plain truthiness
            # check never skips blank lines; strip before testing.
            if line.strip():
                questions.append(json.loads(line))
    return questions
48
+
49
def load_model_answers(answer_dir: str):
    """Load every model's generations from ``<answer_dir>/<model>/generation.jsonl``.

    Sub-directories without a ``generation.jsonl`` are skipped with a
    warning; a missing ``answer_dir`` yields an empty dict.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[uid: int -> answer: dict]]
    """
    if not os.path.exists(answer_dir):
        return {}

    generation_files = []
    for entry in os.listdir(answer_dir):
        model_dir = os.path.join(answer_dir, entry)
        if not os.path.isdir(model_dir):
            continue
        candidate = os.path.join(model_dir, "generation.jsonl")
        if os.path.exists(candidate):
            generation_files.append(candidate)
        else:
            print(f"WARNING: {entry} does not have generation.jsonl, skip it.")

    answers_by_model = {}
    for path in sorted(generation_files):
        with open(path) as fin:
            records = [json.loads(raw) for raw in fin]
        # The parent directory name is the model name.
        model = os.path.basename(os.path.dirname(path))
        answers_by_model[model] = {record["uid"]: record for record in records}
    return answers_by_model
79
+
80
+
81
def load_model_judgements(answer_dir: str):
    """Load model judgements from every ``*.jsonl`` file directly in *answer_dir*.

    The file name without its ``.jsonl`` suffix is used as the model name.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[uid: int -> answer: dict]]
    """
    judgements_by_model = {}

    for path in sorted(glob(os.path.join(answer_dir, "*.jsonl"))):
        with open(path) as fin:
            records = [json.loads(raw) for raw in fin]
        # Drop the 6-character ".jsonl" suffix to get the model name.
        model = os.path.basename(path)[:-6]
        judgements_by_model[model] = {record["uid"]: record for record in records}

    return judgements_by_model
101
+
102
+
103
def load_model_answers_and_execution_results(data_dir: str):
    """Load model answers and execution results.

    Reads ``<data_dir>/<model>/execution_results.jsonl`` for every model
    sub-directory. Sub-directories without that file are skipped with a
    warning, and a missing ``data_dir`` yields an empty dict (the same
    behavior as ``load_model_answers``).

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[uid: int -> answer: dict]]
    """
    # Guard against a missing directory instead of letting os.listdir raise.
    if not os.path.exists(data_dir):
        return {}

    filenames = []
    for folder in os.listdir(data_dir):
        if not os.path.isdir(os.path.join(data_dir, folder)):
            continue
        if not os.path.exists(os.path.join(data_dir, folder, "execution_results.jsonl")):
            print(f"WARNING: {folder} does not have execution_results.jsonl, skip it.")
            continue
        filenames.append(os.path.join(data_dir, folder, "execution_results.jsonl"))

    filenames.sort()
    model_answers = {}

    for filename in filenames:
        # Use parent directory name as model name
        model_name = os.path.basename(os.path.dirname(filename))
        answer = {}
        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                answer[line["uid"]] = line
        model_answers[model_name] = answer

    return model_answers
132
+
133
+
134
+
135
def load_id_to_model_answers(answer_dir: str):
    """Load model answers from ``*.jsonl`` files, grouped by question uid.

    The file name without its ``.jsonl`` suffix is used as the model name.

    The return value is a python dict of type:
    Dict[uid: int -> Dict[model_name: str -> answer: dict]]
    """
    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
    filenames.sort()
    model_answers = {}

    for filename in filenames:
        # Drop the 6-character ".jsonl" suffix to get the model name.
        model_name = os.path.basename(filename)[:-6]

        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                # Create the per-uid dict on first sight, then add this model.
                model_answers.setdefault(line["uid"], {})[model_name] = line

    return model_answers
158
+
159
+
160
def get_endpoint(endpoint_list):
    """Pick one endpoint config uniformly at random.

    Returns None when *endpoint_list* is None; otherwise returns one element
    of the list.
    """
    if endpoint_list is not None:
        # randomly pick one
        picked = random.choices(endpoint_list)
        return picked[0]
    return None
169
+
170
+
171
# load config args from config yaml files
def make_config(config_file: str) -> dict:
    """Parse the YAML file at *config_file* with the safe loader and return its content."""
    with open(config_file, "r") as stream:
        return yaml.load(stream, Loader=yaml.SafeLoader)
178
+
179
+
180
def _make_openai_client(api_dict=None):
    """Build an ``openai.OpenAI`` client, using ``api_dict`` credentials when given."""
    import openai

    if api_dict:
        return openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    return openai.OpenAI()


def _log_openai_bad_request(error, messages):
    """Print diagnostics for a rejected request, focusing on image payloads."""
    png_prefix = "data:image/png;base64,"

    print("=== DEBUG: OpenAI BadRequestError ===")
    print("Error type:", type(error))
    print("Error message:", str(error))
    print("=== Analyzing messages for image issues ===")
    for i, msg in enumerate(messages):
        print(f"Message {i} role: {msg.get('role', 'unknown')}")
        if "content" not in msg:
            continue
        content = msg["content"]
        if not isinstance(content, list):
            print(f"  Content: {type(content)} - {str(content)[:100]}...")
            continue
        for j, item in enumerate(content):
            item_type = item.get("type") if isinstance(item, dict) else None
            if item_type == "image_url":
                url = item.get("image_url", {}).get("url", "")
                if url.startswith(png_prefix):
                    base64_part = url[len(png_prefix):]
                    print(f"  Image {j}: base64 length = {len(base64_part)}")
                    if len(base64_part) < 50:
                        print(f"    *** ISSUE: Image {j} has very short/empty base64: '{url}'")
                elif url.startswith("data:image/"):
                    print(f"  Image {j}: Non-PNG data URL: {url[:50]}...")
                else:
                    print(f"  Image {j}: Unexpected URL format: {url[:50]}...")
            elif item_type == "text":
                print(f"  Text {j}: {len(item.get('text', ''))} chars")
            else:
                print(f"  Content {j}: {type(item)} - {str(item)[:50]}...")
    print("=== End debug info ===")


def _openai_chat_with_retry(client, messages, **request_kwargs):
    """Run a non-streaming chat completion, retrying only on rate limits.

    Returns ``{"answer": <text>}`` on success and ``API_ERROR_OUTPUT`` when all
    attempts are rate limited or a non-retryable error occurs.
    """
    import openai

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                messages=messages,
                **request_kwargs,
            )
            output = {"answer": completion.choices[0].message.content}
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            # A malformed request will not succeed on retry.
            _log_openai_bad_request(e, messages)
            break
        except KeyError as e:
            # The exception must be bound here: names bound by the other
            # except clauses are deleted when those clauses finish.
            print(type(e), e)
            break
    return output


@register_api("openai")
def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Chat completion through the OpenAI client.

    Args:
        model: Model name; replaced by ``api_dict["model_name"]`` when present.
        messages: Chat messages forwarded to the API.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        api_dict: Optional dict with ``api_base`` and ``api_key``.

    Returns:
        ``{"answer": <text>}`` or ``API_ERROR_OUTPUT`` on failure.
    """
    client = _make_openai_client(api_dict)

    if api_dict and "model_name" in api_dict:
        model = api_dict["model_name"]

    return _openai_chat_with_retry(
        client,
        messages,
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
    )


@register_api("openai_streaming")
def chat_completion_openai_streaming(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Streaming version of OpenAI completion that yields tokens as they arrive.

    Any exception is reported by yielding a single ``"Error: ..."`` string
    instead of being raised.
    """
    client = _make_openai_client(api_dict)

    if api_dict and "model_name" in api_dict:
        model = api_dict["model_name"]

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            # Skip chunks that carry no choices instead of indexing into an
            # empty list.
            if not chunk.choices:
                continue
            token = chunk.choices[0].delta.content
            if token is not None:
                yield token

    except Exception as e:
        print(f"Error in streaming completion: {e}")
        yield f"Error: {str(e)}"


@register_api("openai_thinking")
def chat_completion_openai_thinking(model, messages, api_dict=None, **kwargs):
    """Chat completion for reasoning models through the OpenAI client.

    ``kwargs["reasoning_effort"]`` is forwarded when given and defaults to
    ``"medium"``. Returns ``{"answer": <text>}`` or ``API_ERROR_OUTPUT``.
    """
    client = _make_openai_client(api_dict)

    return _openai_chat_with_retry(
        client,
        messages,
        model=model,
        reasoning_effort=kwargs.get("reasoning_effort", "medium"),
    )
345
+
346
+
347
@register_api("deepseek_reasoner")
def chat_completion_deepseek_reasoner(messages, api_dict, **kwargs):
    """Query the DeepSeek ``deepseek-reasoner`` model over plain HTTP.

    Returns ``{"thought": ..., "answer": ...}`` taken from the first choice, or
    ``API_ERROR_OUTPUT`` when every attempt fails. Any exception triggers a
    sleep and another attempt.
    """
    import urllib.request

    endpoint = "https://api.deepseek.com/chat/completions"
    base_headers = {
        "User-Agent": "curl/8.7.1",
        "Authorization": f"Bearer {api_dict['api_key']}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = json.dumps(
        {
            "messages": messages,
            "model": "deepseek-reasoner",
            "stream": False,
        }
    ).encode("utf-8")

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            request = urllib.request.Request(
                endpoint,
                headers=dict(base_headers),
                data=payload,
            )
            with urllib.request.urlopen(request) as res:
                raw = res.read()
            reply = json.loads(raw.decode("utf-8"))

            output = {
                "thought": reply["choices"][0]["message"]["reasoning_content"],
                "answer": reply["choices"][0]["message"]["content"],
            }
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
389
+
390
+
391
@register_api("deepseek")
def chat_completion_deepseek(messages, max_tokens, api_dict, **kwargs):
    """Query the DeepSeek ``deepseek-chat`` model over plain HTTP.

    Returns ``{"answer": ...}`` taken from the first choice, or
    ``API_ERROR_OUTPUT`` when every attempt fails. Any exception triggers a
    sleep and another attempt.
    """
    import urllib.request

    endpoint = "https://api.deepseek.com/chat/completions"
    base_headers = {
        "User-Agent": "curl/8.7.1",
        "Authorization": f"Bearer {api_dict['api_key']}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = json.dumps(
        {
            "messages": messages,
            "model": "deepseek-chat",
            "stream": False,
            "max_tokens": max_tokens,
        }
    ).encode("utf-8")

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            request = urllib.request.Request(
                endpoint,
                headers=dict(base_headers),
                data=payload,
            )
            with urllib.request.urlopen(request) as res:
                raw = res.read()
            reply = json.loads(raw.decode("utf-8"))

            output = {"answer": reply["choices"][0]["message"]["content"]}
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
433
+
434
+
435
@register_api("anthropic")
def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Chat completion through the Anthropic client.

    The API key comes from ``api_dict["api_key"]`` when ``api_dict`` is given,
    otherwise from the ``ANTHROPIC_API_KEY`` environment variable. A leading
    system message is removed from ``messages`` and passed via ``system``.

    Returns ``{"answer": <text of the first content block>}`` or
    ``API_ERROR_OUTPUT`` when every attempt raises ``anthropic.APIError``.
    """
    import anthropic

    api_key = api_dict["api_key"] if api_dict else os.environ["ANTHROPIC_API_KEY"]

    sys_msg = ""
    head = messages[0]
    if head["role"] == "system":
        sys_msg = head["content"]
        messages = messages[1:]

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            client = anthropic.Anthropic(api_key=api_key)
            response = client.messages.create(
                model=model,
                messages=messages,
                stop_sequences=[anthropic.HUMAN_PROMPT],
                max_tokens=max_tokens,
                temperature=temperature,
                system=sys_msg,
            )
            output = {"answer": response.content[0].text}
            break
        except anthropic.APIError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
469
+
470
+
471
@register_api("anthropic_thinking")
def chat_completion_anthropic_thinking(model, messages, max_tokens, budget_tokens, **kwargs):
    """Chat completion with extended thinking through the Anthropic client.

    Args:
        model: Model name.
        messages: Chat messages forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        budget_tokens: Token budget passed in the ``thinking`` configuration.

    Returns:
        ``{"thought": ..., "answer": ...}`` or ``API_ERROR_OUTPUT`` when every
        attempt raises ``anthropic.APIError``.
    """
    import anthropic

    client = anthropic.Anthropic(
        timeout=1200,
    )

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                thinking={
                    "type": "enabled",
                    "budget_tokens": budget_tokens,
                },
                messages=messages,
            )
            # Select blocks by type rather than by position so that a response
            # without exactly [thinking, text] (for example one containing a
            # redacted thinking block) does not raise an uncaught
            # IndexError/AttributeError.
            thoughts = []
            answers = []
            for block in response.content:
                block_type = getattr(block, "type", None)
                if block_type == "thinking":
                    thoughts.append(block.thinking)
                elif block_type == "text":
                    answers.append(block.text)
            output = {
                "thought": "".join(thoughts),
                "answer": "".join(answers),
            }
            break
        except anthropic.APIError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
501
+
502
+
503
@register_api("mistral")
def chat_completion_mistral(model, messages, temperature, max_tokens, **kwargs):
    """Chat completion through ``MistralClient``.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.
    A ``MistralException`` is printed and ends the loop without another
    attempt.

    Returns ``{"answer": <text>}`` or ``API_ERROR_OUTPUT`` on failure.
    """
    from mistralai.client import MistralClient
    from mistralai.models.chat_completion import ChatMessage
    from mistralai.exceptions import MistralException

    client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])

    prompts = []
    for message in messages:
        prompts.append(ChatMessage(role=message["role"], content=message["content"]))

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            chat_response = client.chat(
                model=model,
                messages=prompts,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = {"answer": chat_response.choices[0].message.content}
        except MistralException as e:
            print(type(e), e)
        # Success and failure both stop after the first attempt.
        break

    return output
532
+
533
+
534
@register_api("xai")
def chat_completion_xai(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Chat completion through the ``xai_sdk`` compat client.

    The response is streamed and the non-empty deltas are concatenated.
    ``api_dict`` is indexed unconditionally for ``api_key`` and ``api_base``.

    Returns ``{"answer": <text>}`` or ``API_ERROR_OUTPUT`` when every attempt
    raises.
    """
    import xai_sdk

    client = xai_sdk.Client(api_key=api_dict['api_key'], api_host=api_dict['api_base']).compat
    output = API_ERROR_OUTPUT

    for _ in range(API_MAX_RETRY):
        try:
            stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.95,
            )
            pieces = []
            for chunk in stream:
                piece = chunk.choices[0].delta.content
                if piece:
                    pieces.append(piece)

            output = {"answer": "".join(pieces)}
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
565
+
566
+
567
@register_api("litellm")
def chat_completion_litellm(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Chat completion through ``litellm.completion``.

    Returns ``{"answer": <text of the first choice>}`` or ``API_ERROR_OUTPUT``
    when every attempt raises. Any exception triggers a sleep and another
    attempt.
    """
    import litellm

    request = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = litellm.completion(**request)
            output = {"answer": response.choices[0].message.content}
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
589
+
590
+
591
@register_api("litellm_streaming")
def chat_completion_litellm_streaming(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Streaming litellm completion; yields each non-``None`` content delta.

    Any exception is printed and reported by yielding one ``"Error: ..."``
    string.
    """
    import litellm

    try:
        stream = litellm.completion(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            if chunk.choices[0].delta.content is None:
                continue
            yield chunk.choices[0].delta.content

    except Exception as e:
        print(f"Error in litellm streaming completion: {e}")
        yield f"Error: {str(e)}"
612
+
613
+
614
@register_api("anthropic_streaming")
def chat_completion_anthropic_streaming(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Streaming version of Anthropic completion.

    System messages are removed from the conversation; the content of the last
    one is sent through the ``system`` argument. Text deltas are yielded as
    they arrive. Any exception is printed and reported by yielding one
    ``"Error: ..."`` string.
    """
    import anthropic

    if api_dict:
        client = anthropic.Anthropic(api_key=api_dict["api_key"])
    else:
        client = anthropic.Anthropic()

    try:
        # Convert messages to Anthropic format
        system_message = ""
        conversation_messages = []
        for msg in messages:
            if msg["role"] == "system":
                system_message = msg["content"]
            else:
                conversation_messages.append(msg)

        request = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": conversation_messages,
            "stream": True,
        }
        # Leave ``system`` out entirely when there is no system prompt rather
        # than sending an explicit ``None``.
        if system_message:
            request["system"] = system_message

        stream = client.messages.create(**request)

        for chunk in stream:
            if chunk.type != "content_block_delta":
                continue
            # Not every delta variant carries ``text``; skip those that don't.
            text = getattr(chunk.delta, "text", None)
            if text:
                yield text

    except Exception as e:
        print(f"Error in Anthropic streaming completion: {e}")
        yield f"Error: {str(e)}"
651
+
652
@register_api("gemini")
def http_completion_gemini(model, messages, **kwargs):
    """Call the Gemini ``generateContent`` REST endpoint.

    The API key is read from the ``GEMINI_API_KEY`` environment variable. A
    leading system message becomes ``systemInstruction``; ``assistant`` turns
    are sent with the ``model`` role. ``temperature`` and ``max_tokens`` in
    ``kwargs`` are forwarded as ``generationConfig`` when present.

    Returns:
        ``{"answer": <text>}`` or ``API_ERROR_OUTPUT`` when no attempt yields a
        usable response.
    """
    import requests

    api_key = os.environ["GEMINI_API_KEY"]

    safety_settings = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ]

    sys_prompt = None
    if messages[0]["role"] == "system":
        sys_prompt = {
            "parts": [
                {"text": messages[0]["content"]}
            ]
        }
        messages = messages[1:]

    role_map = {"user": "user",
                "assistant": "model"}

    conv = [{"parts": [{"text": turn["content"]}], "role": role_map[turn["role"]]} for turn in messages]

    json_request = {
        "contents": conv,
        "safetySettings": safety_settings,
        "systemInstruction": sys_prompt,
    }

    gen_config = {}
    if "temperature" in kwargs:
        gen_config["temperature"] = kwargs["temperature"]
    if "max_tokens" in kwargs:
        gen_config["maxOutputTokens"] = kwargs["max_tokens"]
    if gen_config:
        json_request["generationConfig"] = gen_config

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = requests.post(url, json=json_request)
        except Exception as e:
            print(f"**API REQUEST ERROR** Reason: {e}.")
            time.sleep(API_RETRY_SLEEP)
            # No response object exists for this attempt; go to the next one.
            continue

        if response.status_code != 200:
            print(f"**API REQUEST ERROR** Reason: status code {response.status_code}.")
            time.sleep(API_RETRY_SLEEP)
            continue

        try:
            output = {
                "answer": response.json()["candidates"][0]["content"]["parts"][0]["text"],
            }
            # Stop after the first usable answer instead of re-posting.
            break
        except (KeyError, IndexError) as e:
            print(type(e), e)
            print(response.json())

    return output
735
+
736
+
737
@register_api("vertex")
def vertex_completion_gemini(model, messages, project_id, regions, **kwargs):
    """Call a Gemini model through the Vertex AI ``generateContent`` endpoint.

    The access token is obtained by running
    ``gcloud auth application-default print-access-token``.

    Args:
        model: Publisher model name.
        messages: Chat messages; a leading system message becomes
            ``systemInstruction`` and ``assistant`` turns use the ``model`` role.
        project_id: Google Cloud project id used in the request path.
        regions: Location used in the request path and for the service host.
        **kwargs: ``temperature`` and ``max_tokens`` are forwarded as
            ``generationConfig`` when present.

    Returns:
        ``{"answer": <text>}`` or ``API_ERROR_OUTPUT`` when the response does
        not contain a candidate text. The request is attempted once.
    """
    import requests
    import subprocess

    output = API_ERROR_OUTPUT

    # Obtain the access token using gcloud CLI
    access_token = subprocess.check_output(
        ["gcloud", "auth", "application-default", "print-access-token"],
        text=True
    ).strip()

    if messages[0]["role"] == "system":
        data = {
            "systemInstruction": {
                "role": "system",  # ignored by the Vertex AI API (04/18/2025)
                "parts": [{
                    "text": messages[0]["content"]
                }]
            },
        }
        messages = messages[1:]
    else:
        data = {}

    role_map = {
        "user": "user",
        "assistant": "model"
    }

    messages = [{"parts": [{"text": turn["content"]}], "role": role_map[turn["role"]]} for turn in messages]

    # The service host has to match the location in the path; previously the
    # host was always us-central1 regardless of ``regions``.
    if regions == "global":
        host = "aiplatform.googleapis.com"
    else:
        host = f"{regions}-aiplatform.googleapis.com"

    url = (
        f"https://{host}/v1/projects/"
        f"{project_id}/locations/{regions}/publishers/google/models/"
        f"{model}:generateContent"
    )

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    data["contents"] = messages

    gen_config = {}
    if "temperature" in kwargs:
        gen_config["temperature"] = kwargs["temperature"]
    if "max_tokens" in kwargs:
        gen_config["maxOutputTokens"] = kwargs["max_tokens"]
    if "temperature" in kwargs or "max_tokens" in kwargs:
        data["generationConfig"] = gen_config

    response = requests.post(url, json=data, headers=headers)

    try:
        output = {
            "answer": response.json()["candidates"][0]["content"]["parts"][0]["text"],
        }
    except (KeyError, IndexError) as e:
        print(type(e), e)
        print(response.json())

    return output
804
+
805
+
806
@register_api("cohere")
def chat_completion_cohere(model, messages, temperature, max_tokens, **kwargs):
    """Chat completion through ``cohere.Client.chat``.

    The last message must come from the user and is sent as the prompt; all
    earlier messages are sent as ``chat_history`` with roles mapped to
    ``SYSTEM`` / ``CHATBOT`` / ``USER``. The API key is read from the
    ``COHERE_API_KEY`` environment variable.

    Returns ``{"answer": <text>}`` or ``API_ERROR_OUTPUT``. A Cohere
    ``ApiError`` is printed and re-raised; any other exception is printed and
    ends the loop.
    """
    import cohere

    co = cohere.Client(os.environ["COHERE_API_KEY"])
    assert len(messages) > 0

    template_map = {
        "system": "SYSTEM",
        "assistant": "CHATBOT",
        "user": "USER",
    }

    assert messages[-1]["role"] == "user"
    prompt = messages[-1]["content"]

    history = None
    if len(messages) > 1:
        history = [
            {"role": template_map[message["role"]], "message": message["content"]}
            for message in messages[:-1]
        ]

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = co.chat(
                message=prompt,
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
                chat_history=history,
            )
            output = {"answer": response.text}
            break
        except cohere.core.api_error.ApiError as e:
            print(type(e), e)
            raise
        except Exception as e:
            print(type(e), e)
            break

    return output
849
+
850
+
851
@register_api("meta")
def chat_completion_meta(model, messages, temperature, max_tokens, api_dict, **kwargs):
    """Chat completion against a ``chat_stream_completions`` HTTP endpoint.

    Args:
        model: Model name sent in the request body.
        messages: Chat messages; each is sent as ``{"role", "text"}``.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        api_dict: Required dict providing ``api_base`` and ``api_key``.

    Returns:
        ``{"answer": <text>}``; the text is empty when every attempt fails.
    """
    import requests

    assert api_dict
    texts = [{"role": m["role"],
              "text": m["content"]} for m in messages]

    output = ""
    for _ in range(API_MAX_RETRY):
        try:
            res = requests.post(
                f"{api_dict['api_base']}/chat_stream_completions?access_token={api_dict['api_key']}",
                stream=True,
                headers={"Content-Type": "application/json"},
                json={
                    "model": model,
                    "chunks_delimited": True,
                    "messages": texts,
                    "options": {
                        "max_tokens": max_tokens,
                        "generation_algorithm": "top_p",
                        "top_p": 1,
                        "temperature": temperature,
                    },
                },
                timeout=30,
            )
            if res.status_code == 200:
                # Collect into a per-attempt buffer so a stream that breaks
                # halfway does not leave partial text that the next attempt
                # would append to.
                pieces = []
                for line in res.iter_lines():
                    if line:
                        part = json.loads(line.decode("utf-8"))
                        if "text" in part:
                            pieces.append(part["text"])
                output = "".join(pieces)
                break
            else:
                print(f"**API REQUEST ERROR** Code: {res.status_code}")
                time.sleep(API_RETRY_SLEEP)
        except Exception as e:
            print(f"**API REQUEST ERROR** Reason: {e}.")
            time.sleep(API_RETRY_SLEEP)
            continue

    return {
        "answer": output
    }
895
+
896
+
897
def batch_submit_sglang(
    executor,
    tokenizer,
    temperature,
    max_tokens,
    all_context,
    max_context_length=None,
    end_think_token=None,
):
    """Tokenize every context, submit the batch to ``executor`` and collect replies.

    Contexts whose prompt length plus ``max_tokens`` exceeds
    ``max_context_length`` (when set) are skipped and therefore absent from
    the result.

    Returns a dict mapping uid to ``{"answer": ...}``, or to
    ``{"thought": ..., "answer": ...}`` when ``end_think_token`` is given. In
    that case the decoded text is split at the first ``end_think_token``; if
    nothing follows it, the whole decoded text is used as the answer.
    """
    print(f"DEBUG: sglang_completion_qwq: max_context_length: {max_context_length}")

    sampling_params = {
        "temperature": temperature,
        "skip_special_tokens": False,
        "max_new_tokens": max_tokens - 1,
        "no_stop_trim": True,
    }

    batch_prompt_token_ids = []
    batch_uids = []

    for context in all_context:
        token_ids = tokenizer.apply_chat_template(
            context['turns'],
            add_generation_prompt=True,
            tokenize=True,
        )

        if max_context_length and (len(token_ids) + max_tokens) > max_context_length:
            print(f"DEBUG: sglang_completion_qwq: context length ({len(token_ids) + max_tokens}) > max_context_length ({max_context_length}), skip this context")
            continue

        batch_prompt_token_ids.append(token_ids)
        batch_uids.append(context['uid'])

    err_msg = f"ERROR: len(batch_prompt_token_ids): {len(batch_prompt_token_ids)} != len(batch_uids): {len(batch_uids)}"
    assert len(batch_prompt_token_ids) == len(batch_uids), err_msg

    executor.submit(
        prompt_token_ids=batch_prompt_token_ids,
        sampling_params=[sampling_params] * len(batch_uids),
        keys=batch_uids,
    )

    uid_to_response = {}
    for request in tqdm(executor.as_completed(), total=len(batch_uids)):
        uid = request.key()
        result = request.result()
        raw_response = tokenizer.decode(
            result['output_ids'],
            skip_special_tokens=True,
        )

        if not end_think_token:
            uid_to_response[uid] = {"answer": raw_response}
            continue

        thought, _, ans = raw_response.partition(end_think_token)
        uid_to_response[uid] = {
            "thought": thought,
            "answer": raw_response if ans == "" else ans,
        }

    return uid_to_response
964
+
965
+
966
def _infer_cuda_tp_world_size():
    """Return the number of comma-separated entries in ``CUDA_VISIBLE_DEVICES``.

    Falls back to 8 when the variable is not set.
    """
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is not None:
        return len(visible.split(","))
    return 8
973
+
974
+
975
def download_model(model: str, max_workers: int = 64):
    """Run ``huggingface-cli download`` for ``model``.

    The command runs with ``HF_HUB_ENABLE_HF_TRANSFER`` set to ``"0"``. A
    non-zero exit status is reported with a printed message rather than
    raised.
    """
    import subprocess

    env = dict(os.environ)
    env["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

    cmd = ["huggingface-cli", "download", f"--max-workers={max_workers}", model]

    try:
        subprocess.run(cmd, env=env, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while downloading the model: {e}")
    else:
        print(f"Successfully downloaded model '{model}' with {max_workers} max workers.")
993
+
994
+
995
@register_engine("sglang")
def sglang_completion(
    model,
    batch_context,
    answer_file,
    temperature,
    max_tokens=32768,
    end_think_token=None,
    **kwargs,
):
    """Generate answers for ``batch_context`` with an SGLang server and save them.

    Each context supplies ``uid``, ``instruction`` and ``environment``. The
    instruction is sent as a single user turn. One JSON line per answered uid
    is written to ``answer_file``; uids missing from the responses are
    omitted. ``kwargs["model_display_name"]`` overrides the recorded model
    name when given.
    """
    from transformers import AutoTokenizer
    from utils.sglang_server import SGLangServerExecutor

    tokenizer = AutoTokenizer.from_pretrained(model)

    uids = [context['uid'] for context in batch_context]
    prompts = [context['instruction'] for context in batch_context]
    code_envs = [context['environment'] for context in batch_context]
    processed_context = []
    for i in tqdm(range(len(uids))):
        user_turn = {"content": prompts[i], "role": "user"}
        processed_context.append({"uid": uids[i], "turns": [user_turn]})
    download_model(model=model)

    executor = SGLangServerExecutor(
        model_path=model,
        dtype="auto",
        tp_size=_infer_cuda_tp_world_size(),
        mem_fraction_static=0.7,
        max_prefill_tokens=max_tokens,
        max_workers=256,
        server_port=30000,
    )

    print(f"DEBUG: sglang_completion: model: {model}")

    uid_to_response = batch_submit_sglang(
        executor=executor,
        tokenizer=tokenizer,
        temperature=temperature,
        max_tokens=max_tokens,
        all_context=processed_context,
        end_think_token=end_think_token,
    )

    executor.join()
    print("DEBUG: sglang_completion: done, sleep 10 seconds...")
    time.sleep(10)

    num_null = 0
    for uid in uids:
        if uid in uid_to_response and uid_to_response[uid]['answer'] is None:
            num_null += 1
    print(f"Number of null responses: {num_null}")

    records = []
    for context, code_env in zip(processed_context, code_envs):
        uid = context['uid']
        if uid not in uid_to_response:
            continue

        assistant_turn = {"content": uid_to_response[uid], "role": "assistant"}
        records.append({
            "uid": uid,
            "ans_id": shortuuid.uuid(),
            "model": kwargs.get("model_display_name", model),
            "messages": context['turns'] + [assistant_turn],
            "environment": code_env,
            "tstamp": time.time(),
            "metadata": {},
        })

    with open(answer_file, 'w', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=True) + '\n')
1085
+
1086
+
1087
@register_api("aws_claude")
def chat_completion_aws_bedrock_claude(messages, api_dict=None, aws_region="us-west-2", **kwargs):
    """
    Call AWS Bedrock API for chat completion with an Anthropic model

    Args:
        messages (list): Chat messages; a leading system message is removed
            and sent through the "system" field
        api_dict (dict, optional): May provide "aws_access_key_id" and
            "aws_secret_access_key" for the Bedrock client
        aws_region (str, optional): AWS region, defaults to "us-west-2"
        **kwargs: Must contain "temperature", "max_tokens" and "model_id"

    Returns:
        dict: {"answer": <text of the first content block>} or
        API_ERROR_OUTPUT when every attempt fails

    Raises:
        KeyError: If a required entry is missing from kwargs
    """

    # Configure AWS client if api_dict provided
    if api_dict is not None:
        bedrock_rt_client = boto3.client(
            service_name='bedrock-runtime',
            region_name=aws_region,
            aws_access_key_id=api_dict.get('aws_access_key_id'),
            aws_secret_access_key=api_dict.get('aws_secret_access_key')
        )
    else:
        bedrock_rt_client = boto3.client(
            service_name='bedrock-runtime',
            region_name=aws_region,)

    output = API_ERROR_OUTPUT

    # get kwargs from settings
    temperature = kwargs["temperature"]
    max_tokens = kwargs["max_tokens"]
    model = kwargs["model_id"]

    sys_msg = ""
    if messages[0]["role"] == "system":
        sys_msg = messages[0]["content"]
        messages = messages[1:]

    # The request body does not change between attempts.
    request_body = json.dumps({
        "system": sys_msg,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "anthropic_version": "bedrock-2023-05-31",
        "stop_sequences": ["Human"]
    })

    # Retry logic for API calls
    for _ in range(API_MAX_RETRY):
        try:
            # Call Bedrock API
            response = bedrock_rt_client.invoke_model(
                body=request_body,
                modelId=model,
                accept='application/json',
                contentType='application/json'
            )

            # Parse response
            response_body = json.loads(response.get('body').read())
            output = {"answer": response_body.get("content")[0].get("text")}
            break

        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
1163
+
1164
@register_api("aws_mistral")
def chat_completion_aws_bedrock_mistral(messages, api_dict=None, aws_region="us-west-2", **kwargs):
    """
    Call AWS Bedrock API for chat completion with a Mistral model

    All message contents are joined with newlines into a single
    "<s>[INST] ... [/INST]" prompt.

    Args:
        messages (list): Chat messages; "content" may be a string or a list
            of strings / {"text": ...} parts
        api_dict (dict, optional): May provide "aws_access_key_id" and
            "aws_secret_access_key" for the Bedrock client
        aws_region (str, optional): AWS region, defaults to "us-west-2"
        **kwargs: Must contain "temperature", "max_tokens" and "model_id"

    Returns:
        dict: {"answer": <text>} or API_ERROR_OUTPUT when every attempt fails

    Raises:
        KeyError: If a required entry is missing from kwargs
    """

    # Configure AWS client if api_dict provided
    if api_dict is not None:
        bedrock_rt_client = boto3.client(
            service_name='bedrock-runtime',
            region_name=aws_region,
            aws_access_key_id=api_dict.get('aws_access_key_id'),
            aws_secret_access_key=api_dict.get('aws_secret_access_key')
        )
    else:
        bedrock_rt_client = boto3.client(
            service_name='bedrock-runtime',
            region_name=aws_region,)

    output = API_ERROR_OUTPUT

    # get kwargs from settings
    temperature = kwargs["temperature"]
    max_tokens = kwargs["max_tokens"]
    model = kwargs["model_id"]

    # Retry logic for API calls
    for _ in range(API_MAX_RETRY):
        try:
            ## =============== Format prompt ================
            # A string content is one piece. Iterating over it directly would
            # split it into characters and put a newline between each of them.
            pieces = []
            for message in messages:
                content = message["content"]
                if isinstance(content, str):
                    pieces.append(content)
                    continue
                for item in content:
                    if isinstance(item, dict):
                        text = item.get("text")
                        if text:
                            pieces.append(text)
                    else:
                        pieces.append(item)
            prompt = "\n".join(pieces)
            formatted_prompt = f"<s>[INST] {prompt.strip()} [/INST]"
            body = {
                "prompt": formatted_prompt,
                "max_tokens": max_tokens,
                "stop": ["Human:"],
                "temperature": temperature,
            }

            # Call Bedrock API
            response = bedrock_rt_client.invoke_model(
                body=json.dumps(body),
                modelId=model,
                accept='application/json',
                contentType='application/json'
            )

            # Parse response
            response_body = json.loads(response.get('body').read())

            if "pixtral-large" in model:  # us.mistral.pixtral-large-2502-v1:0
                output = {"answer": response_body.get("choices")[0].get("message").get("content")}
            else:
                output = {"answer": response_body.get("outputs")[0].get("text")}

            break

        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
1237
+
1238
+
1239
@register_api("mistral_streaming")
def chat_completion_mistral_streaming(model, messages, temperature, max_tokens, api_dict=None, **kwargs):
    """Streaming Mistral completion issued through the OpenAI client.

    Yields each non-``None`` content delta. Any exception is printed and
    reported by yielding one ``"Error: ..."`` string.
    """
    import openai

    client_kwargs = {}
    if api_dict:
        client_kwargs["base_url"] = api_dict["api_base"]
        client_kwargs["api_key"] = api_dict["api_key"]
    client = openai.OpenAI(**client_kwargs)

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )

        for chunk in stream:
            if chunk.choices[0].delta.content is None:
                continue
            yield chunk.choices[0].delta.content

    except Exception as e:
        print(f"Error in Mistral streaming completion: {e}")
        yield f"Error: {str(e)}"
1268
+
1269
+
1270
@register_api("gemini_streaming")
def chat_completion_gemini_streaming(model, messages, **kwargs):
    """Streaming Gemini completion through ``google.generativeai``.

    Only the content of the last ``user`` message is sent, in a chat started
    with an empty history. Yields each non-empty chunk text. A missing user
    message or any exception is reported by yielding one ``"Error: ..."``
    string.
    """
    import google.generativeai as genai

    try:
        # Configure the API
        genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

        # Create the model and open a chat with no prior history
        chat = genai.GenerativeModel(model).start_chat(history=[])

        # Keep the content of the final user message
        last_user_message = None
        for msg in messages:
            if msg["role"] != "user":
                continue
            last_user_message = msg["content"]

        if not last_user_message:
            yield "Error: No user message found"
            return

        # Stream the response
        for chunk in chat.send_message(last_user_message, stream=True):
            if chunk.text:
                yield chunk.text

    except Exception as e:
        print(f"Error in Gemini streaming completion: {e}")
        yield f"Error: {str(e)}"
conversation.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies for BigCodeArena
2
+ gradio
3
+ gradio-sandboxcomponent
4
+ # HTTP and API handling
5
+ requests
6
+ # AWS SDK
7
+ boto3
8
+ # YAML configuration
9
+ PyYAML
10
+ # Progress bars and utilities
11
+ tqdm
12
+ # UUID generation
13
+ shortuuid>=1.0.11
14
+ # Image processing (for vision models)
15
+ Pillow>=10.0.0
16
+ # Optional: FastChat (if using conversation templates)
17
+ # fastchat>=0.2.0
18
+ # Development and testing (optional)
19
+ pytest>=7.4.0
20
+ black>=23.0.0
21
+ flake8>=6.0.0
sandbox/__init__.py ADDED
File without changes
sandbox/code_analyzer.py ADDED
@@ -0,0 +1,935 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Module for analyzing code snippets to determine the environments, dependencies, and other information needed to run the code.
3
+ '''
4
+
5
+
6
+ from enum import StrEnum
7
+ from typing import Any, Generator, TypeAlias, TypedDict, Set
8
+
9
+ import base64
10
+
11
+ import ast
12
+ from tree_sitter import Language, Node, Parser
13
+ import tree_sitter_javascript
14
+ import tree_sitter_typescript
15
+ import sys
16
+ import re
17
+
18
+
19
+ class SandboxEnvironment(StrEnum):
20
+ AUTO = 'Auto'
21
+
22
+ # Web UI Frameworks
23
+ HTML = 'HTML'
24
+ REACT = 'React'
25
+ VUE = 'Vue'
26
+ GRADIO = 'Gradio'
27
+ STREAMLIT = 'Streamlit'
28
+ PYGAME = 'PyGame'
29
+ MERMAID = 'Mermaid'
30
+
31
+ # Runner
32
+ PYTHON_RUNNER = 'Python Runner'
33
+ JAVASCRIPT_RUNNER = 'Javascript Runner'
34
+
35
+ # Compiler
36
+ C_RUNNER = 'C Runner'
37
+ CPP_RUNNER = 'C++ Runner'
38
+ # CSHARP_RUNNER = 'C# Runner'
39
+ JAVA_RUNNER = 'Java Runner'
40
+ RUST_RUNNER = 'Rust Runner'
41
+ GOLANG_RUNNER = 'Golang Runner'
42
+
43
+
44
+ def extract_python_imports(code: str) -> list[str]:
45
+ '''
46
+ Extract Python package imports using AST parsing.
47
+ Returns a list of top-level package names.
48
+ '''
49
+ try:
50
+ tree = ast.parse(code)
51
+ except SyntaxError:
52
+ return []
53
+
54
+ packages: Set[str] = set()
55
+
56
+ for node in ast.walk(tree):
57
+ try:
58
+ if isinstance(node, ast.Import):
59
+ for name in node.names:
60
+ # Get the top-level package name from any dotted path
61
+ # e.g., 'foo.bar.baz' -> 'foo'
62
+ if name.name: # Ensure there's a name
63
+ packages.add(name.name.split('.')[0])
64
+
65
+ elif isinstance(node, ast.ImportFrom):
66
+ # Skip relative imports (those starting with dots)
67
+ if node.level == 0 and node.module:
68
+ # Get the top-level package name
69
+ # e.g., from foo.bar import baz -> 'foo'
70
+ packages.add(node.module.split('.')[0])
71
+
72
+ # Also check for common dynamic import patterns
73
+ elif isinstance(node, ast.Call):
74
+ if isinstance(node.func, ast.Name) and node.func.id == 'importlib':
75
+ # Handle importlib.import_module('package')
76
+ if len(node.args) > 0 and isinstance(node.args[0], ast.Str):
77
+ packages.add(node.args[0].s.split('.')[0])
78
+ elif isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name):
79
+ # Handle __import__('package') and importlib.import_module('package')
80
+ if node.func.value.id == 'importlib' and node.func.attr == 'import_module':
81
+ if len(node.args) > 0 and isinstance(node.args[0], ast.Str):
82
+ packages.add(node.args[0].s.split('.')[0])
83
+ elif node.func.attr == '__import__':
84
+ if len(node.args) > 0 and isinstance(node.args[0], ast.Str):
85
+ packages.add(node.args[0].s.split('.')[0])
86
+ except Exception as e:
87
+ print(f"Error processing node {type(node)}: {e}")
88
+ continue
89
+
90
+ # Filter out standard library modules using sys.stdlib_module_names
91
+ std_libs = set(sys.stdlib_module_names)
92
+
93
+ return list(packages - std_libs)
94
+
95
+
96
+ def extract_js_imports(code: str) -> list[str]:
97
+ '''
98
+ Extract npm package imports using Tree-sitter for robust parsing.
99
+ Handles both JavaScript and TypeScript code, including Vue SFC.
100
+ Returns a list of package names.
101
+ '''
102
+ try:
103
+ # For Vue SFC, extract the script section first
104
+ script_match = re.search(r'<script.*?>(.*?)</script>', code, re.DOTALL)
105
+ if script_match:
106
+ code = script_match.group(1).strip()
107
+
108
+ # Initialize parsers with language modules
109
+ ts_parser = Parser(Language(tree_sitter_typescript.language_tsx()))
110
+ js_parser = Parser(Language(tree_sitter_javascript.language()))
111
+
112
+ # Try parsing as TypeScript first, then JavaScript
113
+ code_bytes = bytes(code, "utf8")
114
+ try:
115
+ tree = ts_parser.parse(code_bytes)
116
+ except Exception as e:
117
+ print(f"TypeScript parsing failed: {e}")
118
+ try:
119
+ tree = js_parser.parse(code_bytes)
120
+ except Exception as e:
121
+ print(f"JavaScript parsing failed: {e}")
122
+ tree = None
123
+
124
+ if tree is None:
125
+ raise Exception("Both TypeScript and JavaScript parsing failed")
126
+
127
+ packages: Set[str] = set()
128
+
129
+ def extract_package_name(node: Node) -> str | None:
130
+ """Extract npm package name from string or template string.
131
+ Returns None for local aliases like @/ or relative paths."""
132
+ if node.type in ['string', 'string_fragment']:
133
+ pkg_path = code[node.start_byte:node.end_byte].strip('"\'')
134
+ if pkg_path.startswith('.') or pkg_path.startswith('/') or pkg_path.startswith('@/'):
135
+ return None # relative, absolute, or alias path
136
+
137
+ # Scoped npm package: @scope/package/...
138
+ if pkg_path.startswith('@'):
139
+ parts = pkg_path.split('/')
140
+ if len(parts) >= 2:
141
+ return '/'.join(parts[:2])
142
+
143
+ # Regular npm package: "lodash/cloneDeep" -> "lodash"
144
+ return pkg_path.split('/')[0]
145
+
146
+ elif node.type == 'template_string':
147
+ content = ''
148
+ has_template_var = False
149
+ for child in node.children:
150
+ if child.type == 'string_fragment':
151
+ content += code[child.start_byte:child.end_byte]
152
+ elif child.type == 'template_substitution':
153
+ has_template_var = True
154
+
155
+ if not content or content.startswith('.') or content.startswith('/') or content.startswith('@/'):
156
+ return None
157
+
158
+ if has_template_var:
159
+ if content.endswith('-literal'):
160
+ return 'package-template-literal'
161
+ return None
162
+
163
+ if content.startswith('@'):
164
+ parts = content.split('/')
165
+ if len(parts) >= 2:
166
+ return '/'.join(parts[:2])
167
+ return content.split('/')[0]
168
+
169
+ return None
170
+
171
+ def visit_node(node: Node) -> None:
172
+ if node.type == 'import_statement':
173
+ # Handle ES6 imports
174
+ string_node = node.child_by_field_name('source')
175
+ if string_node:
176
+ pkg_name = extract_package_name(string_node)
177
+ if pkg_name:
178
+ packages.add(pkg_name)
179
+
180
+ elif node.type == 'export_statement':
181
+ # Handle re-exports
182
+ source = node.child_by_field_name('source')
183
+ if source:
184
+ pkg_name = extract_package_name(source)
185
+ if pkg_name:
186
+ packages.add(pkg_name)
187
+
188
+ elif node.type == 'call_expression':
189
+ # Handle require calls and dynamic imports
190
+ func_node = node.child_by_field_name('function')
191
+ if func_node and func_node.text:
192
+ func_name = func_node.text.decode('utf8')
193
+ if func_name in ['require', 'import']:
194
+ args = node.child_by_field_name('arguments')
195
+ if args and args.named_children:
196
+ arg = args.named_children[0]
197
+ pkg_name = extract_package_name(arg)
198
+ if pkg_name:
199
+ packages.add(pkg_name)
200
+
201
+ # Recursively visit children
202
+ for child in node.children:
203
+ visit_node(child)
204
+
205
+ visit_node(tree.root_node)
206
+ return list(packages)
207
+
208
+ except Exception as e:
209
+ print(f"Tree-sitter parsing failed: {e}")
210
+ # Fallback to basic regex parsing if tree-sitter fails
211
+ packages: Set[str] = set()
212
+
213
+ # First try to extract script section for Vue SFC
214
+ script_match = re.search(r'<script.*?>(.*?)</script>', code, re.DOTALL)
215
+ if script_match:
216
+ code = script_match.group(1).strip()
217
+
218
+ # Look for imports
219
+ import_patterns = [
220
+ # dynamic imports
221
+ r'(?:import|require)\s*\(\s*[\'"](@?[\w-]+(?:/[\w-]+)*)[\'"]',
222
+ # static imports
223
+ r'(?:import|from)\s+[\'"](@?[\w-]+(?:/[\w-]+)*)[\'"]',
224
+ # require statements
225
+ r'require\s*\(\s*[\'"](@?[\w-]+(?:/[\w-]+)*)[\'"]',
226
+ ]
227
+
228
+ for pattern in import_patterns:
229
+ matches = re.finditer(pattern, code)
230
+ for match in matches:
231
+ pkg_name = match.group(1)
232
+ if not pkg_name.startswith('.'):
233
+ if pkg_name.startswith('@'):
234
+ parts = pkg_name.split('/')
235
+ if len(parts) >= 2:
236
+ packages.add('/'.join(parts[:2]))
237
+ else:
238
+ packages.add(pkg_name.split('/')[0])
239
+
240
+ return list(packages)
241
+
242
+
243
def determine_python_environment(code: str, imports: list[str]) -> SandboxEnvironment | None:
    '''
    Pick the sandbox environment for a Python snippet.

    Evidence is checked in this order:
      1. An identifier named ``gr`` or ``st`` anywhere in the parsed code
         selects Gradio or Streamlit (the first one met in ``ast.walk`` order
         wins). Skipped when the code does not parse.
         NOTE(review): this also fires for unrelated variables that happen to
         use those names.
      2. ``pygame``, ``gradio`` or ``streamlit`` in ``imports``, in that
         priority.
      3. Otherwise the plain Python runner.

    Args:
        code (str): Python source to inspect.
        imports (list[str]): Package names already extracted from the code.

    Returns:
        SandboxEnvironment | None: The selected environment.
    '''
    identifier_environments = {
        'gr': SandboxEnvironment.GRADIO,
        'st': SandboxEnvironment.STREAMLIT,
    }
    try:
        for node in ast.walk(ast.parse(code)):
            if isinstance(node, ast.Name) and node.id in identifier_environments:
                return identifier_environments[node.id]
    except SyntaxError:
        pass

    # Framework detection from imports, highest priority first
    import_environments = (
        ('pygame', SandboxEnvironment.PYGAME),
        ('gradio', SandboxEnvironment.GRADIO),
        ('streamlit', SandboxEnvironment.STREAMLIT),
    )
    for package, environment in import_environments:
        if package in imports:
            return environment

    return SandboxEnvironment.PYTHON_RUNNER
269
+
270
+
271
def determine_jsts_environment(code: str, imports: list[str]) -> SandboxEnvironment | None:
    '''
    Determine JavaScript/TypeScript sandbox environment based on imports and AST analysis.

    Checks, in order: Vue SFC markers in the text, React/Vue packages in
    ``imports`` (a scoped import such as ``@vue/runtime-core`` matches through
    its scope), framework syntax found by Tree-sitter, and finally Vue-specific
    regex heuristics. The regex heuristics still run when Tree-sitter fails.

    Args:
        code (str): JavaScript/TypeScript source to inspect.
        imports (list[str]): npm package names already extracted from the code.

    Returns:
        SandboxEnvironment | None: React, Vue, or the JavaScript runner when
        no framework evidence is found.
    '''
    # First check for Vue SFC structure
    if '<template>' in code or '<script setup' in code:
        return SandboxEnvironment.VUE

    # Check imports for framework detection
    react_packages = {'react', '@react', 'next', '@next'}
    vue_packages = {'vue', '@vue', 'nuxt', '@nuxt'}

    # Compare on the first path segment so '@vue/xyz' matches the '@vue' scope
    import_roots = {pkg.split('/')[0] for pkg in imports}
    if import_roots & react_packages:
        return SandboxEnvironment.REACT
    if import_roots & vue_packages:
        return SandboxEnvironment.VUE

    template_string_vue_patterns = [
        r'\bv-(if|else|for|bind|on|model|show|html|text)=',  # Vue directives
        r'@(?:click|change|input|submit|keyup|keydown)\s*=',  # Event bindings
        r':(?:class|style|src|href|value|disabled|checked)\s*=',  # Attribute bindings
    ]

    try:
        code_bytes = code.encode('utf8')
        ts_parser = Parser(Language(tree_sitter_typescript.language_tsx()))
        tree = ts_parser.parse(code_bytes)

        def framework_of(node: 'Node') -> SandboxEnvironment | None:
            # React JSX patterns
            if node.type in ('jsx_element', 'jsx_self_closing_element'):
                return SandboxEnvironment.REACT
            # Vue <template> AST element
            if node.type == 'template_element':
                return SandboxEnvironment.VUE
            # Vue template string with restricted patterns
            if node.type == 'template_string':
                # Node offsets are byte offsets, so slice the encoded bytes
                content = code_bytes[node.start_byte:node.end_byte].decode('utf8')
                if any(re.search(p, content) for p in template_string_vue_patterns):
                    return SandboxEnvironment.VUE
            return None

        # Iterative pre-order traversal; the first framework node decides
        pending = [tree.root_node]
        while pending:
            node = pending.pop()
            framework = framework_of(node)
            if framework is not None:
                return framework
            pending.extend(reversed(node.children))

    except Exception as e:
        print(f"Tree-sitter parsing error: {e}")

    # More targeted Vue detection
    vue_patterns = [
        r'export\s+default\s+{[\s\S]*?(components|props|emits|data|methods|computed|watch)\s*:',
        r'defineComponent\s*\(',
        r'Vue\.extend\s*\(',
        r'createApp\s*\(',
        r'\b(ref|reactive|computed|watch|onMounted|onUnmounted|provide|inject)\s*\(',
        r'defineProps\s*\(',
        r'defineEmits\s*\(',
        r'<[a-zA-Z][^>]+\s+(v-(if|else|for|bind|on|model|show|html|text)|@|:)[^>]*>'  # in tag context
    ]

    for pattern in vue_patterns:
        if re.search(pattern, code, re.MULTILINE):
            return SandboxEnvironment.VUE

    return SandboxEnvironment.JAVASCRIPT_RUNNER
351
+
352
+
353
def detect_js_ts_code_lang(code: str) -> str:
    '''
    Detect whether code is JavaScript or TypeScript using Tree-sitter AST parsing.
    Handles Vue SFC, React, and regular JS/TS files.

    A ``<script>`` tag whose ``lang`` attribute is ``ts``, ``tsx`` or
    ``typescript`` decides immediately, whatever other attributes the tag
    carries (e.g. ``<script setup lang="ts">``). Otherwise the code is parsed
    and searched for TypeScript-only syntax nodes. If parsing fails the error
    is printed and ``'javascript'`` is returned.

    Args:
        code (str): The code to analyze

    Returns:
        str: 'typescript' if TypeScript patterns are found, 'javascript' otherwise
    '''
    # Quick check for explicit TypeScript in Vue SFC
    if re.search(r'<script\b[^>]*\blang\s*=\s*["\'](?:ts|tsx|typescript)["\']', code, re.IGNORECASE):
        return 'typescript'

    # Node types that only occur in TypeScript syntax
    typescript_node_types = frozenset({
        'type_annotation',         # Type annotations (variables, parameters, return types)
        'type_alias_declaration',  # type Foo = ...
        'interface_declaration',   # interface Foo
        'enum_declaration',        # enum Foo
        'implements_clause',       # implements Interface
        'type_parameter',          # Generic type parameters
        'type_assertion',          # Type assertions
        'type_predicate',          # Type predicates in functions
        'type_arguments',          # Generic type arguments
        'readonly_type',           # readonly keyword
        'mapped_type',             # Mapped types
        'conditional_type',        # Conditional types
        'union_type',              # Union types
        'intersection_type',       # Intersection types
        'tuple_type',              # Tuple types
        'optional_parameter',      # Optional parameters
        'decorator',               # Decorators
        'ambient_declaration',     # Ambient declarations
        'declare_statement',       # declare keyword
        'accessibility_modifier',  # private/protected/public
    })

    try:
        ts_parser = Parser(Language(tree_sitter_typescript.language_tsx()))
        tree = ts_parser.parse(bytes(code, "utf8"))

        # Every node is visited, so annotations nested inside declarations or
        # functions are found without separate child checks. The explicit
        # stack keeps deep trees clear of the recursion limit.
        pending = [tree.root_node]
        while pending:
            node = pending.pop()
            if node.type in typescript_node_types:
                return 'typescript'
            pending.extend(node.children)

    except Exception as e:
        # Fall through to the JavaScript default if parsing fails
        print(f"Tree-sitter parsing error: {e}")

    return 'javascript'
442
+
443
+
444
# Matches a pip install command written in a `#` comment or as a Jupyter-style
# `!` command. Leading flags without values and `-r <file>` are skipped; group 1
# captures the whitespace-separated package specifiers that follow.
_INLINE_PIP_INSTALL_RE = re.compile(
    r'(?:#|!)\s*(?:pip3?|python3?\s+-m\s+pip)\s+install\s+'
    r'(?:(?:--upgrade|--user|--no-cache-dir|--quiet|-U|-q|-r\s+[\w\-\.\/]+)\s+)*'
    r'([^-\s][\w\-\[\]<>=~\.]+(?:\s+[^-\s][\w\-\[\]<>=~\.]+)*)'
)

# First character of any version specifier operator (==, >=, <=, ~=, >, <, !=)
_VERSION_SPECIFIER_RE = re.compile(r'[<>=~!]')


def extract_inline_pip_install_commands(code: str) -> tuple[list[str], str]:
    '''
    Extracts pip install commands from inline code comments and returns both the packages and cleaned code.
    This is useful for cases where pip install commands are written as comments in the code or
    Jupyter notebook-style !pip install commands.

    On a matching line, everything from the command onwards is removed; the
    line is dropped entirely if nothing remains in front of the command.
    Version specifiers are stripped from the package names.

    Args:
        code (str): The code to analyze.

    Returns:
        tuple[list[str], str]: A tuple containing:
            1. List of Python packages extracted from pip install commands in comments
               (de-duplicated, in order of first appearance)
            2. Code with the pip install comments removed
    '''
    python_packages: list[str] = []
    cleaned_lines: list[str] = []

    for line in code.splitlines():
        match = _INLINE_PIP_INSTALL_RE.search(line)
        if match is None:
            cleaned_lines.append(line)
            continue

        for spec in match.group(1).split():
            # 'numpy>=1.20' -> 'numpy'; also covers bare '>' and '<'
            name = _VERSION_SPECIFIER_RE.split(spec, maxsplit=1)[0]
            if name:
                python_packages.append(name)

        # Keep whatever code preceded the pip install command
        remaining = line[:match.start()].rstrip()
        if remaining:
            cleaned_lines.append(remaining)

    # Remove duplicates while preserving order
    python_packages = list(dict.fromkeys(python_packages))

    return python_packages, '\n'.join(cleaned_lines)
498
+
499
+
500
def extract_js_from_html_script_tags(code: str) -> list[str]:
    '''
    Extract JavaScript package names from HTML script tags.
    Handles both CDN script tags and inline scripts.

    Version suffixes are removed from CDN package names, including scoped
    ones (``@scope/name@1.2.3`` -> ``@scope/name``). For inline scripts, ES
    module imports from ``.../npm/<package>`` URLs are collected; scripts
    without such imports are passed to ``extract_js_imports`` after their URL
    imports have been removed.

    Args:
        code: HTML code containing script tags

    Returns:
        list[str]: Sorted list of package names
    '''
    packages: set[str] = set()

    # URL path segments that are never package names
    path_components = {'npm', 'dist', 'lib', 'build', 'umd', 'esm', 'cjs', 'min'}

    def normalize_cdn_name(raw: str) -> str:
        if raw.startswith('@'):
            # Scoped package: keep '@scope/name', drop any '@version' suffix
            parts = raw.split('/')
            if len(parts) >= 2:
                return f"{parts[0]}/{parts[1].split('@')[0]}"
            return raw
        # Remove version and path components from package name
        return raw.split('/')[0].split('@')[0]

    # Extract packages from CDN script tags
    script_patterns = [
        # unpkg.com pattern
        r'<script[^>]*src="https?://unpkg\.com/(@?[^@/"]+(?:/[^@/"]+)?(?:@[^/"]+)?)[^"]*"[^>]*>',
        # cdn.jsdelivr.net pattern - explicitly handle /npm/ in the path
        r'<script[^>]*src="https?://cdn\.jsdelivr\.net/npm/(@?[^@/"]+(?:/[^@/"]+)?(?:@[^/"]+)?)[^"]*"[^>]*>',
        # Generic CDN pattern for any domain - exclude common path components
        r'<script[^>]*src="https?://(?!(?:[^"]+/)?(?:npm|dist|lib|build|umd|esm|cjs|min)/)[^"]+?/(@?[\w-]+)(?:/[^"]*)?[^"]*"[^>]*>',
    ]

    for pattern in script_patterns:
        for match in re.finditer(pattern, code, re.IGNORECASE):
            pkg_name = normalize_cdn_name(match.group(1))
            # Skip common path components
            if pkg_name and pkg_name.lower() not in path_components:
                packages.add(pkg_name)

    # ES module imports from CDN URLs; only the package name is captured
    es_module_patterns = [
        r'import\s+[\w\s{},*]+\s+from\s+[\'"]https?://[^/]+/npm/([^/@"\s]+)[@/][^"]*[\'"]',
    ]
    url_scheme_names = {'https', 'http'}

    # Extract packages from inline scripts
    script_tags = re.finditer(
        r'<script[^>]*>(.*?)</script>', code, re.DOTALL | re.IGNORECASE)
    for script in script_tags:
        script_content = script.group(1)

        found_cdn_import = False
        for pattern in es_module_patterns:
            for match in re.finditer(pattern, script_content):
                pkg_name = match.group(1)
                if (pkg_name and pkg_name not in packages
                        and pkg_name.lower() not in path_components | url_scheme_names):
                    packages.add(pkg_name)
                    found_cdn_import = True

        # Only check for regular imports if we didn't find a CDN import
        if not found_cdn_import:
            # Remove any URL imports before passing to extract_js_imports
            cleaned_content = re.sub(
                r'import\s+[\w\s{},*]+\s+from\s+[\'"]https?://[^"]+[\'"]', '', script_content)
            packages.update(extract_js_imports(cleaned_content))

    return sorted(packages)
570
+
571
+
572
def extract_code_from_markdown(message: str, enable_auto_env: bool = False) -> tuple[str, str, tuple[list[str], list[str]], SandboxEnvironment | None] | None:
    '''
    Extracts code from a markdown message by parsing code blocks directly.
    Determines sandbox environment based on code content and frameworks used.

    The main block is the longest fenced block whose language is not a shell
    language (or empty); if there is none, the longest block overall is used.
    Dependencies come from static analysis of the main block plus install
    commands found in the other blocks.

    Args:
        message (str): Markdown text that may contain fenced code blocks.
        enable_auto_env (bool): Currently unused; kept for interface compatibility.

    Returns:
        tuple[str, str, tuple[list[str], list[str]], SandboxEnvironment | None]: A tuple:
            1. code - the main code block (inline pip install commands removed for Python)
            2. code language
            3. sandbox python and npm dependencies (extracted using static analysis)
            4. sandbox environment determined from code content, or None if the
               language is not recognized
        None is returned when the message contains no code block.
    '''
    code_block_regex = r'```(?P<code_lang>[\w\+\#\-\.]*)?[ \t]*\r?\n?(?P<code>.*?)```'
    matches = list(re.finditer(code_block_regex, message, re.DOTALL))

    if not matches:
        return None

    # Languages that are only chosen when nothing else is available
    low_priority_languages = {'bash', 'shell', 'sh', 'zsh', 'powershell', 'pwsh', ''}

    # Find the main code block by avoiding low-priority languages
    main_code = None
    main_code_lang = None
    max_length = 0

    for match in matches:
        code = match.group('code').strip()
        code_lang = (match.group('code_lang') or '').lower()
        if code_lang not in low_priority_languages and len(code) > max_length:
            main_code = code
            main_code_lang = code_lang
            max_length = len(code)

    # Fallback to the longest code block if no main code was found
    if not main_code:
        longest_match = max(matches, key=lambda m: len(m.group('code')))
        main_code = longest_match.group('code').strip()
        main_code_lang = (longest_match.group('code_lang') or '').lower()

    # Define language prefixes for each environment
    python_prefixes = ['py', 'ipython', 'pygame', 'gradio', 'streamlit']
    vue_prefixes = ['vue']
    react_prefixes = ['react', 'next']
    js_prefixes = ['js', 'javascript', 'jsx', 'coffee', 'ecma', 'node', 'es', 'svelte']
    html_prefixes = ['html', 'xhtml', 'htm']
    ts_prefixes = ['ts', 'typescript', 'tsx']
    mermaid_prefixes = ['mermaid', 'mmd']
    cpp_prefixes = ['cpp', 'c++']
    go_prefixes = ['go', 'golang']
    java_prefixes = ['java']
    rust_prefixes = ['rust']

    # Extract package dependencies from the main program
    python_packages: list[str] = []
    npm_packages: list[str] = []

    # Helper function to check if any prefix matches
    def matches_prefix(lang: str, prefixes: list[str]) -> bool:
        return any(lang.lower().startswith(prefix) for prefix in prefixes)

    def looks_like_html() -> bool:
        # A full HTML document, a bare SVG, or HTML markers in a block that is
        # not labelled as a JS/TS framework language
        has_doctype = '<!DOCTYPE html>' in main_code
        if has_doctype and ('<head' in main_code or '<body' in main_code):
            return True
        if main_code.strip().startswith('<svg'):
            return True
        labelled_as_script = matches_prefix(
            main_code_lang, [*react_prefixes, *vue_prefixes, *js_prefixes, *ts_prefixes])
        return not labelled_as_script and ('<html' in main_code or has_doctype)

    if matches_prefix(main_code_lang, python_prefixes):
        raw_code = main_code
        # Strip '!pip install ...' lines first: they are not valid Python and
        # would make AST-based import extraction fail for the whole block.
        extra_python_packages, main_code = extract_inline_pip_install_commands(
            raw_code)
        python_packages = (
            extract_python_imports(main_code) or extract_python_imports(raw_code))
        python_packages.extend(extra_python_packages)
        sandbox_env_name = determine_python_environment(
            main_code, python_packages)
    elif matches_prefix(main_code_lang, vue_prefixes):
        npm_packages = extract_js_imports(main_code)
        sandbox_env_name = SandboxEnvironment.VUE
        main_code_lang = detect_js_ts_code_lang(main_code)
    elif matches_prefix(main_code_lang, react_prefixes):
        npm_packages = extract_js_imports(main_code)
        sandbox_env_name = SandboxEnvironment.REACT
        main_code_lang = detect_js_ts_code_lang(main_code)
    elif looks_like_html():
        npm_packages = extract_js_from_html_script_tags(main_code)
        sandbox_env_name = SandboxEnvironment.HTML
        main_code_lang = 'html'
    elif matches_prefix(main_code_lang, js_prefixes):
        main_code_lang = 'javascript'
        npm_packages = extract_js_imports(main_code)
        sandbox_env_name = determine_jsts_environment(main_code, npm_packages)
    elif matches_prefix(main_code_lang, ts_prefixes):
        main_code_lang = 'typescript'
        npm_packages = extract_js_imports(main_code)
        sandbox_env_name = determine_jsts_environment(main_code, npm_packages)
    elif matches_prefix(main_code_lang, html_prefixes):
        # HTML-labelled block without document markers: analyzed as JS/TS
        main_code_lang = detect_js_ts_code_lang(main_code)
        npm_packages = extract_js_imports(main_code)
        sandbox_env_name = determine_jsts_environment(main_code, npm_packages)
    elif matches_prefix(main_code_lang, mermaid_prefixes):
        main_code_lang = 'markdown'
        sandbox_env_name = SandboxEnvironment.MERMAID
    elif matches_prefix(main_code_lang, cpp_prefixes):
        main_code_lang = 'cpp'
        sandbox_env_name = SandboxEnvironment.CPP_RUNNER
    elif matches_prefix(main_code_lang, go_prefixes):
        main_code_lang = 'go'
        sandbox_env_name = SandboxEnvironment.GOLANG_RUNNER
    elif matches_prefix(main_code_lang, java_prefixes):
        main_code_lang = 'java'
        sandbox_env_name = SandboxEnvironment.JAVA_RUNNER
    elif matches_prefix(main_code_lang, rust_prefixes):
        main_code_lang = 'rust'
        sandbox_env_name = SandboxEnvironment.RUST_RUNNER
    elif main_code_lang == 'c':
        # Exact match only: a prefix test on 'c' would also catch 'css', 'cs', ...
        sandbox_env_name = SandboxEnvironment.C_RUNNER
    else:
        sandbox_env_name = None

    all_python_packages: set[str] = set(python_packages)
    all_npm_packages: set[str] = set(npm_packages)

    # Collect install commands from every block other than the main code
    for match in matches:
        code = match.group('code').strip()
        if code != main_code:
            install_python_packages, install_npm_packages = extract_installation_commands(
                code)
            all_python_packages.update(install_python_packages)
            all_npm_packages.update(install_npm_packages)

    if not main_code_lang:
        main_code_lang = 'markdown'

    return main_code, main_code_lang, (sorted(all_python_packages), sorted(all_npm_packages)), sandbox_env_name
703
+
704
+
705
def create_placeholder_svg_data_url(width: int, height: int) -> str:
    '''
    Build a base64 SVG data URL for a placeholder image.

    The SVG is a light grey rectangle with the text "{width} × {height}"
    centered in it. The font size is one eighth of the smaller dimension,
    but never below 12.

    Args:
        width: Width of the placeholder image
        height: Height of the placeholder image

    Returns:
        str: Data URL containing the SVG image, or an empty string if
        encoding fails
    '''
    font_size = max(12, min(width, height) // 8)

    # Simple SVG structure for better browser compatibility
    markup = f'''<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg">
<rect width="100%" height="100%" fill="#f3f4f6"/>
<text
x="50%"
y="50%"
font-family="Arial, sans-serif"
font-size="{font_size}"
fill="#6b7280"
text-anchor="middle"
dominant-baseline="middle">
{width} × {height}
</text>
</svg>'''

    # Convert to base64 data URL
    try:
        payload = base64.b64encode(markup.encode('utf-8')).decode('utf-8')
    except Exception as e:
        print(f'Error encoding SVG: {e}')
        return ''
    return f'data:image/svg+xml;base64,{payload}'
741
+
742
+
743
def replace_placeholder_urls(code: str) -> str:
    '''
    Swap placeholder image URLs for inline SVG data URLs.
    Every occurrence of "/api/placeholder/{width}/{height}" (both numeric) is
    replaced; progress and warnings are printed along the way.

    Args:
        code: The source code containing placeholder URLs

    Returns:
        str: Code with placeholder URLs replaced with data URLs, or the
        original code if the replacement as a whole fails
    '''

    def _to_data_url(found: re.Match) -> str:
        try:
            w, h = int(found.group(1)), int(found.group(2))

            if w <= 0 or h <= 0:
                print(f'Warning: Invalid dimensions {w}x{h}, using default 100x100')
                w = h = 100
            elif w > 10000 or h > 10000:
                # NOTE(review): the cap is 1000 but it only triggers above
                # 10000; confirm which limit is intended.
                print(f'Warning: Dimensions {w}x{h} are very large, capping at 1000x1000')
                w = min(w, 1000)
                h = min(h, 1000)

            print(f'Replacing placeholder URL with SVG: {w}x{h}')
            return create_placeholder_svg_data_url(w, h)
        except Exception as e:
            print(f'Error replacing placeholder URL: {e}')
            # A failed placeholder is replaced by an empty string
            return ''

    placeholder_re = re.compile(r'/api/placeholder/(\d+)/(\d+)')

    try:
        replaced = placeholder_re.sub(_to_data_url, code)
        print('Placeholder URL replacement completed successfully')
        return replaced
    except Exception as e:
        print(f'Error during placeholder URL replacement: {e}')
        return code  # Return original code if replacement fails
788
+
789
+
790
# "pip install" / "pip3 install" (which also covers "python -m pip install").
_PIP_INSTALL_RE = re.compile(r'\bpip3?\s+install(?=\s|$)(.*)')
# "npm install", "npm i" and "yarn add"; the lookahead keeps "npm init",
# "npm info" etc. from being mistaken for "npm i".
_NPM_INSTALL_RE = re.compile(r'\b(?:npm\s+(?:install|i)|yarn\s+add)(?=\s|$)(.*)')

# Options whose following token is a value (file, URL, directory ...), not a package.
_PIP_VALUE_FLAGS = frozenset({
    '-r', '--requirement', '-c', '--constraint', '-e', '--editable',
    '-i', '--index-url', '--extra-index-url', '-f', '--find-links',
    '-t', '--target', '--prefix', '--root', '--src', '--platform',
    '--python-version', '--implementation', '--abi', '--upgrade-strategy',
    '--no-binary', '--only-binary', '--progress-bar', '--report',
    '-C', '--config-settings', '--global-option',
})
_NPM_VALUE_FLAGS = frozenset({'--prefix', '--registry', '-w', '--workspace', '--cwd'})

# Tokens that end the install command when several commands share one line.
_SHELL_SEPARATORS = frozenset({'&&', '||', ';', '|', '&'})
_NPM_COMMAND_WORDS = frozenset({'npm', 'install', 'i', 'add'})


def _split_install_args(arg_text: str) -> list[str]:
    '''Split the text following an install command into quote-aware tokens.'''
    import shlex  # local import so the block does not depend on file-level imports

    try:
        return shlex.split(arg_text)
    except ValueError:
        # Unbalanced quotes: fall back to plain whitespace splitting.
        return [token.strip('"\'') for token in arg_text.split()]


def _collect_packages(arg_text: str, value_flags: frozenset) -> list[str]:
    '''Return the package tokens of one install command, skipping options and their values.'''
    packages: list[str] = []
    skip_next = False
    for token in _split_install_args(arg_text):
        if token in _SHELL_SEPARATORS or token.startswith('#'):
            break  # next shell command or trailing comment
        ends_command = token.endswith(';')
        if ends_command:
            token = token[:-1]
        if skip_next:
            skip_next = False
        elif token.startswith('-'):
            skip_next = token in value_flags
        elif token:
            packages.append(token)
        if ends_command:
            break
    return packages


def extract_installation_commands(code: str) -> tuple[list[str], list[str]]:
    '''
    Extracts package installation commands from the code block, preserving version information.

    Lines are inspected one by one; blank lines and lines starting with '#'
    are ignored. A line is treated as a pip command first and as an npm/yarn
    command otherwise. Options (tokens starting with '-') and the values of
    known value-taking options are dropped, and parsing of a command stops at
    a shell separator such as '&&' or ';'.

    Args:
        code (str): The code block to analyze.

    Returns:
        tuple[list[str], list[str]]: A tuple containing two lists:
            1. Python packages from pip install commands (with versions if specified).
            2. npm packages from npm install commands (with versions if specified).
        Both lists are de-duplicated while keeping first-seen order.
    '''
    python_packages: list[str] = []
    npm_packages: list[str] = []

    for raw_line in code.split('\n'):
        line = raw_line.strip()

        # Skip empty lines and comments
        if not line or line.startswith('#'):
            continue

        pip_match = _PIP_INSTALL_RE.search(line)
        if pip_match:
            python_packages.extend(
                _collect_packages(pip_match.group(1), _PIP_VALUE_FLAGS))
            continue

        npm_match = _NPM_INSTALL_RE.search(line)
        if npm_match:
            # Scoped packages (@scope/name@version) need no special casing:
            # the token is kept exactly as written.
            npm_packages.extend(
                pkg
                for pkg in _collect_packages(npm_match.group(1), _NPM_VALUE_FLAGS)
                if pkg not in _NPM_COMMAND_WORDS
            )

    # Remove duplicates while preserving order
    return list(dict.fromkeys(python_packages)), list(dict.fromkeys(npm_packages))
891
+
892
+
893
def validate_dependencies(dependencies: list) -> tuple[bool, str]:
    """
    Validate dependency list format and values.
    Allows empty rows but validates format when package name is specified.

    Args:
        dependencies: Rows of (type, package, version). Cells may be None,
            which is treated like an empty string.

    Returns:
        tuple[bool, str]: (True, "") when every row is acceptable, otherwise
        (False, message) describing the first offending row.
    """
    if not dependencies:
        return True, ""

    valid_types = ("python", "npm")
    # Longer operators first is not required for a containment test, but
    # "!=" must be listed explicitly: it contains none of the others.
    python_operators = ('==', '>=', '<=', '~=', '!=', '>', '<')

    for dep in dependencies:
        # Every row, even an empty one, must have exactly three cells.
        if len(dep) != 3:
            return False, "Each dependency must have type, package and version fields"

        # Table widgets may hand over None for untouched cells.
        dep_type, pkg_name, version = (
            "" if cell is None else str(cell) for cell in dep
        )

        # Skip empty rows
        if not pkg_name.strip():
            continue

        kind = dep_type.lower()
        if kind not in valid_types:
            return False, f"Invalid dependency type: {dep_type}"

        # Validate version format if specified
        if not version.strip():
            continue
        is_latest = version.lower() == "latest"

        if kind == "python":
            # A pip version needs a comparison operator (or the word "latest").
            has_operator = any(op in version for op in python_operators)
            if not has_operator and not is_latest:
                return False, f"Invalid Python version format for {pkg_name}: {version}"
        else:
            # An npm version must be written "@<version>" (or "latest").
            if not version.startswith('@') and not is_latest:
                return False, f"Invalid NPM version format for {pkg_name}: {version}"

    return True, ""
928
+
929
+
930
def extract_java_class_name(java_code: str) -> str:
    '''
    Extract the class name from Java code.

    Preference order:
      1. the first public class (optionally final/abstract/strictfp);
      2. the last class declared before a "static void main(" method;
      3. the first class declared at all;
      4. "Main" when no class declaration is found.

    This is a textual heuristic: declarations inside comments or string
    literals are not filtered out.
    '''
    public_class = re.search(
        r'\bpublic\s+(?:(?:final|abstract|strictfp)\s+)*class\s+(\w+)', java_code)
    if public_class:
        return public_class.group(1)

    declared = list(re.finditer(r'\bclass\s+(\w+)', java_code))
    if not declared:
        return "Main"

    # Without a public class, prefer the class that appears to own main().
    main_method = re.search(r'\bstatic\s+void\s+main\s*\(', java_code)
    if main_method:
        preceding = [m for m in declared if m.start() < main_method.start()]
        if preceding:
            return preceding[-1].group(1)

    return declared[0].group(1)
sandbox/code_runner.py ADDED
@@ -0,0 +1,1612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Run generated code in a sandbox environment.
3
+
4
+ Gradio will interact with this module.
5
+ '''
6
+
7
+ from typing import Any, Generator, Literal, TypeAlias, TypedDict, Set
8
+ import uuid
9
+ import gradio as gr
10
+
11
+ import base64
12
+ from e2b_code_interpreter import Sandbox as CodeSandbox
13
+ from gradio_sandboxcomponent import SandboxComponent
14
+
15
+ from sandbox.sandbox_state import ChatbotSandboxState
16
+ from sandbox.code_analyzer import SandboxEnvironment, extract_code_from_markdown, extract_installation_commands, extract_java_class_name, extract_js_imports, extract_python_imports, replace_placeholder_urls, validate_dependencies
17
+ from sandbox.prompts import (
18
+ DEFAULT_C_CODE_RUN_SANDBOX_INSTRUCTION, DEFAULT_CPP_CODE_RUN_SANDBOX_INSTRUCTION, DEFAULT_GOLANG_CODE_RUN_SANDBOX_INSTRUCTION, DEFAULT_GRADIO_SANDBOX_INSTRUCTION, DEFAULT_HTML_SANDBOX_INSTRUCTION, DEFAULT_JAVA_CODE_RUN_SANDBOX_INSTRUCTION, DEFAULT_JAVASCRIPT_RUNNER_INSTRUCTION, DEFAULT_MERMAID_SANDBOX_INSTRUCTION, DEFAULT_PYGAME_SANDBOX_INSTRUCTION, DEFAULT_PYTHON_RUNNER_INSTRUCTION, DEFAULT_REACT_SANDBOX_INSTRUCTION, DEFAULT_RUST_CODE_RUN_SANDBOX_INSTRUCTION, DEFAULT_STREAMLIT_SANDBOX_INSTRUCTION, DEFAULT_VUE_SANDBOX_INSTRUCTION, GENERAL_SANDBOX_INSTRUCTION
19
+ )
20
+ from sandbox.sandbox_telemetry import log_sandbox_telemetry_gradio_fn
21
+
22
+
23
+ from .constants import CODE_RUN_TIMEOUT_SECONDS, E2B_API_KEY, SANDBOX_TEMPLATE_ID, SANDBOX_NGINX_PORT
24
+ from .sandbox_manager import get_sandbox_app_url, create_sandbox, install_npm_dependencies, install_pip_dependencies, reuse_or_create_sandbox, run_background_command_with_timeout, run_command_in_sandbox
25
+
26
# String values of every SandboxEnvironment member.
SUPPORTED_SANDBOX_ENVIRONMENTS: list[str] = [
    env.value for env in SandboxEnvironment
]

WEB_UI_SANDBOX_ENVIRONMENTS = [
    SandboxEnvironment.HTML,
    SandboxEnvironment.REACT,
    SandboxEnvironment.VUE,
    SandboxEnvironment.GRADIO,
    SandboxEnvironment.STREAMLIT,
    # SandboxEnvironment.NICEGUI,
    SandboxEnvironment.PYGAME,
    SandboxEnvironment.MERMAID
]
'''
Sandbox environments that can be rendered in the web UI.
'''

VALID_GRADIO_CODE_LANGUAGES = [
    'python', 'c', 'cpp', 'markdown', 'json', 'html', 'css', 'javascript', 'jinja2', 'typescript', 'yaml', 'dockerfile', 'shell', 'r', 'sql',
    'sql-msSQL', 'sql-mySQL', 'sql-mariaDB', 'sql-sqlite', 'sql-cassandra', 'sql-plSQL', 'sql-hive', 'sql-pgSQL', 'sql-gql', 'sql-gpSQL', 'sql-sparkSQL',
    'sql-esper'
]
'''
Languages that gradio code component can render.
'''

RUN_CODE_BUTTON_HTML = "<button style='background-color: #4CAF50; border: none; color: white; padding: 10px 24px; text-align: center; text-decoration: none; display: inline-block; font-size: 16px; margin: 4px 2px; cursor: pointer; border-radius: 12px;'>Click to Run in Sandbox</button>"
'''
Button in the chat to run the code in the sandbox.
'''
58
+
59
+
60
# Default instruction prompt for each sandbox environment. Every prompt is
# stripped of surrounding whitespace, including the compiled-language runners.
DEFAULT_SANDBOX_INSTRUCTIONS: dict[SandboxEnvironment, str] = {
    SandboxEnvironment.AUTO: GENERAL_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.PYTHON_RUNNER: DEFAULT_PYTHON_RUNNER_INSTRUCTION.strip(),
    SandboxEnvironment.JAVASCRIPT_RUNNER: DEFAULT_JAVASCRIPT_RUNNER_INSTRUCTION.strip(),
    SandboxEnvironment.HTML: DEFAULT_HTML_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.REACT: DEFAULT_REACT_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.VUE: DEFAULT_VUE_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.GRADIO: DEFAULT_GRADIO_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.STREAMLIT: DEFAULT_STREAMLIT_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.PYGAME: DEFAULT_PYGAME_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.MERMAID: DEFAULT_MERMAID_SANDBOX_INSTRUCTION.strip(),
    # Runners
    SandboxEnvironment.C_RUNNER: DEFAULT_C_CODE_RUN_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.CPP_RUNNER: DEFAULT_CPP_CODE_RUN_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.JAVA_RUNNER: DEFAULT_JAVA_CODE_RUN_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.GOLANG_RUNNER: DEFAULT_GOLANG_CODE_RUN_SANDBOX_INSTRUCTION.strip(),
    SandboxEnvironment.RUST_RUNNER: DEFAULT_RUST_CODE_RUN_SANDBOX_INSTRUCTION.strip(),
}
79
+
80
+
81
SandboxGradioSandboxComponents: TypeAlias = tuple[
    gr.Markdown | Any,  # sandbox_output_md
    SandboxComponent | Any,  # sandbox_ui
    gr.Code | Any,  # sandbox_code
    Any,  # fourth component; its role is not named in this file section
]
'''
Gradio components for the sandbox.
'''
90
+
91
class CodeRunResult(TypedDict):
    '''
    The result of running the code in the sandbox.
    '''
    # The sandbox id to run the code.
    sandbox_id: str
    # The sandbox url to access the rendered results.
    sandbox_url: str
    # Whether the code run is successful.
    is_run_success: bool
    # The stderr output from the sandbox.
    stderr: str
111
+
112
+
113
def create_chatbot_sandbox_state(btn_list_length: int = 5) -> ChatbotSandboxState:
    '''
    Create a new sandbox state for a chatbot.

    Args:
        btn_list_length: Stored unchanged under 'btn_list_length'.

    Returns:
        A fresh state dict: sandbox enabled, AUTO environment with its
        default instruction, all round counters at 0, no code, no
        dependencies and no ids.
    '''
    return {
        'enable_sandbox': True,  # Always enabled
        'enabled_round': 0,
        'sandbox_run_round': 0,
        'edit_round': 0,
        'sandbox_environment': SandboxEnvironment.AUTO,
        'auto_selected_sandbox_environment': None,
        'sandbox_instruction': DEFAULT_SANDBOX_INSTRUCTIONS[SandboxEnvironment.AUTO],
        'code_to_execute': "",
        'code_language': None,
        # (python packages, npm packages)
        'code_dependencies': ([], []),
        'btn_list_length': btn_list_length,
        'sandbox_id': None,
        'chat_session_id': None,
        'conv_id': None,
        "sandbox_output": None,
        "sandbox_error": None,
    }
135
+
136
+
137
def set_sandbox_state_ids(
    sandbox_state: ChatbotSandboxState,
    conv_id: str,
    chat_session_id: str,
) -> ChatbotSandboxState:
    '''
    Set the conv_id and chat_session_id in the sandbox state.

    The state is updated in place and the same object is returned.
    '''
    sandbox_state['conv_id'] = conv_id
    sandbox_state['chat_session_id'] = chat_session_id
    return sandbox_state
148
+
149
+
150
def reset_sandbox_state(state: ChatbotSandboxState) -> ChatbotSandboxState:
    '''
    Reset the sandbox state.
    Used when the chatbot session is reset.

    The state is modified in place and returned. 'sandbox_environment',
    'enable_sandbox' and 'btn_list_length' are left as they are; only the
    instruction is reset to the AUTO default.
    '''
    # reset rounds
    state['enabled_round'] = 0
    state['sandbox_run_round'] = 0
    state['edit_round'] = 0

    # reset code / environment-derived fields (the chosen environment is kept)
    state['auto_selected_sandbox_environment'] = None
    state['sandbox_instruction'] = DEFAULT_SANDBOX_INSTRUCTIONS[SandboxEnvironment.AUTO]
    state['code_to_execute'] = ""
    state['code_language'] = None
    state['code_dependencies'] = ([], [])
    state['sandbox_error'] = None
    state['sandbox_output'] = None

    # reset ids
    state['sandbox_id'] = None
    state['conv_id'] = None
    state['chat_session_id'] = None

    return state
175
+
176
+
177
def update_sandbox_config_multi(
    enable_sandbox: bool,
    sandbox_environment: SandboxEnvironment,
    *states: ChatbotSandboxState
) -> list[ChatbotSandboxState]:
    '''
    Fn to update sandbox config.

    Applies update_sandbox_config with the same settings to every given
    state and returns the updated states in the same order.
    '''
    return [
        update_sandbox_config(enable_sandbox, sandbox_environment, state)
        for state in states
    ]
190
+
191
+
192
def update_sandbox_state_system_prompt(
    sandbox_state: ChatbotSandboxState,
    system_prompt: str,
) -> ChatbotSandboxState:
    '''
    Store system_prompt as the sandbox instruction, but only while
    'enabled_round' is still 0; afterwards the instruction is left untouched.

    The state is updated in place and returned.
    '''
    if sandbox_state['enabled_round'] == 0:
        sandbox_state['sandbox_instruction'] = system_prompt
    return sandbox_state
196
+
197
+
198
def update_sandbox_config(
    enable_sandbox: bool,
    sandbox_environment: SandboxEnvironment,
    state: ChatbotSandboxState
) -> ChatbotSandboxState:
    '''
    Fn to update sandbox config for single model.

    Writes the enabled flag and environment into the state and replaces the
    instruction with the environment's default (None when the environment
    has no entry in DEFAULT_SANDBOX_INSTRUCTIONS). Returns the same state.
    '''
    state["enable_sandbox"] = enable_sandbox
    state["sandbox_environment"] = sandbox_environment
    state['sandbox_instruction'] = DEFAULT_SANDBOX_INSTRUCTIONS.get(sandbox_environment, None)
    return state
210
+
211
+
212
def update_visibility(visible):
    '''
    Return 14 identical gr.update(visible=...) entries.

    NOTE(review): 14 presumably matches the number of output components
    wired to this callback - confirm against the caller.
    '''
    return [gr.update(visible=visible)] * 14
214
+
215
+
216
def update_visibility_for_single_model(visible: bool, component_cnt: int):
    '''
    Return component_cnt identical gr.update(visible=...) entries.
    '''
    return [gr.update(visible=visible)] * component_cnt
218
+
219
+
220
def mermaid_to_html(mermaid_code: str, theme: str = 'default') -> str:
    """
    Convert Mermaid diagram code to a minimal HTML document.

    Args:
        mermaid_code: The Mermaid diagram syntax. Inserted as-is into the
            ".mermaid" element.
        theme: Theme name ('default', 'dark', 'forest', 'neutral', etc.).
            Embedded as a JSON-encoded JavaScript string literal so quotes
            or backslashes in the value cannot break out of the script.

    Returns:
        str: Complete HTML document with embedded Mermaid diagram
    """
    import json  # local import so the block does not depend on file-level imports

    # JSON string literals are valid JS string literals; "</" is additionally
    # escaped so the value can never terminate the surrounding <script>.
    theme_literal = json.dumps(theme).replace('</', '<\\/')

    html_template = f'''<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <script src="https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.min.js"></script>
    <script>
        mermaid.initialize({{
            startOnLoad: true,
            theme: {theme_literal},
            securityLevel: 'loose',
            fontFamily: 'Arial, sans-serif'
        }});
    </script>
</head>
<body>
    <div class="mermaid">
{mermaid_code}
    </div>
</body>
</html>'''
    return html_template
252
+
253
+
254
def javascript_to_html(javascript_code: str) -> str:
    """
    Convert JavaScript code to a minimal HTML document that executes the code.

    The page mirrors console.log/error/warn/info calls and uncaught errors
    into an on-page "#output" element, then runs the user code inside a
    try/catch within an inline script.

    Args:
        javascript_code: The JavaScript code to embed

    Returns:
        str: Complete HTML document with embedded JavaScript code
    """
    import re  # local import so the block does not depend on file-level imports

    # The HTML parser ends an inline script at the first "</script", even
    # inside a JS string. "<\/script" is the same text to JavaScript (in
    # strings, template literals, regexes and comments) but is ignored by
    # the HTML parser.
    embedded_code = re.sub(r'</(script)', r'<\\/\1', javascript_code, flags=re.IGNORECASE)

    html_template = f'''<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>JavaScript Code Execution</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }}
        #output {{
            background-color: white;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 15px;
            margin-top: 20px;
            white-space: pre-wrap;
            font-family: monospace;
        }}
        .error {{
            color: red;
        }}
        .log {{
            color: black;
        }}
    </style>
</head>
<body>
    <h1>JavaScript Code Execution</h1>
    <div id="output"></div>

    <script>
        // Override console methods to capture output
        const outputDiv = document.getElementById('output');
        const originalConsole = {{}};

        ['log', 'error', 'warn', 'info'].forEach(function(method) {{
            originalConsole[method] = console[method];
            console[method] = function(...args) {{
                const message = args.map(function(arg) {{
                    return typeof arg === 'object' ? JSON.stringify(arg, null, 2) : String(arg);
                }}).join(' ');

                const span = document.createElement('span');
                span.className = method === 'error' ? 'error' : 'log';
                span.textContent = '[' + method.toUpperCase() + '] ' + message + '\\n';
                outputDiv.appendChild(span);

                // Also call original console method
                originalConsole[method].apply(console, args);
            }};
        }});

        // Capture uncaught errors
        window.addEventListener('error', function(e) {{
            const span = document.createElement('span');
            span.className = 'error';
            span.textContent = '[ERROR] ' + e.message + ' at line ' + e.lineno + '\\n';
            outputDiv.appendChild(span);
        }});

        try {{
            // Execute the user's JavaScript code
            {embedded_code}
        }} catch (error) {{
            console.error('Execution error:', error.message);
        }}
    </script>
</body>
</html>'''
    return html_template
336
+
337
+
338
def render_result(result):
    """
    Render one execution result as a markdown / HTML / JavaScript string.

    The first non-empty representation wins, checked in this order:
    png, jpeg, svg, html, markdown, latex, json, javascript. Images become
    markdown images with base64 data URLs; markdown, latex and json are
    wrapped in fenced code blocks; html and javascript are returned raw.
    When no representation is set, str(result) is returned.
    """
    def _as_base64(payload) -> str:
        # A str payload is taken to be base64 text already; bytes get encoded.
        if isinstance(payload, str):
            return payload
        return base64.b64encode(payload).decode()

    if result.png:
        return f"![png image](data:image/png;base64,{_as_base64(result.png)})"
    if result.jpeg:
        return f"![jpeg image](data:image/jpeg;base64,{_as_base64(result.jpeg)})"
    if result.svg:
        # SVG arrives as markup (str or bytes), so it is always encoded here.
        svg_text = result.svg if isinstance(result.svg, str) else result.svg.decode()
        svg_base64 = base64.b64encode(svg_text.encode()).decode()
        return f"![svg image](data:image/svg+xml;base64,{svg_base64})"
    if result.html:
        return result.html
    if result.markdown:
        return f"```markdown\n{result.markdown}\n```"
    if result.latex:
        return f"```latex\n{result.latex}\n```"
    if result.json:
        return f"```json\n{result.json}\n```"
    if result.javascript:
        return result.javascript  # Return raw JavaScript
    return str(result)
370
+
371
+
372
def run_code_interpreter(code: str, code_language: str | None, code_dependencies: tuple[list[str], list[str]]) -> tuple[str, str]:
    """
    Executes the provided code within a sandboxed environment and returns the output.

    A new sandbox is created for every call, `uv` is installed into it, the
    requested pip and npm dependencies are installed, and the code is run.

    Args:
        code (str): The code to be executed.
        code_language: Language passed through to the sandbox's run_code.
        code_dependencies: Tuple of (python_deps, npm_deps).

    Returns:
        tuple: (output, stderr). `output` is markdown built from stdout and
        the rendered results. `stderr` combines install errors, runtime
        stderr and the execution error, but is returned as "" whenever
        `output` is non-empty.
    """
    # NOTE(review): the sandbox is neither returned nor closed here - confirm
    # whether it should be shut down after use.
    sandbox = CodeSandbox(
        api_key=E2B_API_KEY,
    )

    sandbox.commands.run(
        "pip install uv",
        timeout=60 * 3,
        on_stderr=lambda message: print(message),
    )

    python_dependencies, npm_dependencies = code_dependencies
    pip_install_errs = install_pip_dependencies(sandbox, python_dependencies)
    npm_install_errs = install_npm_dependencies(sandbox, npm_dependencies)

    stderrs = []
    stderrs.extend(pip_install_errs)
    stderrs.extend(npm_install_errs)

    execution = sandbox.run_code(
        code=code,
        language=code_language
    )

    # collect stdout, stderr from sandbox
    stdout = "\n".join(execution.logs.stdout)
    stderr = "\n".join(execution.logs.stderr)
    if execution.error:
        stderr += f"\n{execution.error.name}: {execution.error.value}"
    stderrs.append(stderr)

    output = ""
    if stdout:
        output += f"### Stdout:\n```markdown\n{stdout}\n```\n\n"

    # TODO: html / javascript results are currently skipped, not rendered.
    results = [
        render_result(result)
        for result in execution.results
        if not (result.html or result.javascript)
    ]
    if results:
        output += "\n### Results:\n" + "\n".join(results)

    combined_stderr = '\n'.join(stderrs)
    # Parenthesised to make the precedence explicit: always a 2-tuple, with
    # stderr suppressed when there is any output.
    # NOTE(review): this hides a runtime error that follows printed output.
    return output, ("" if output else combined_stderr)
428
+
429
+
430
def run_html_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> tuple[str, str, str]:
    """
    Writes the provided HTML into a sandbox and returns where it is served.

    Placeholder image URLs in the code are replaced with SVG data URLs
    before the file is written to ~/html_app/index.html.

    Args:
        code (str): The code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps). Currently
            unused: HTML does not support dependencies for now.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        tuple: (sandbox_url, sandbox_id, stderr). stderr is always ''.
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
    project_root = "~/html_app"
    sandbox.files.make_dir(project_root)

    # replace placeholder URLs with SVG data URLs
    code = replace_placeholder_urls(code)

    file_path = f"{project_root}/index.html"
    sandbox.files.write(file_path, code, "user", 60)

    sandbox_url = get_sandbox_app_url(sandbox, 'html')
    return (sandbox_url, sandbox.sandbox_id, '')
458
+
459
+
460
def run_react_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> CodeRunResult:
    """
    Builds the provided React code within a sandboxed environment.

    NPM dependencies are installed first, then the code (with placeholder
    image URLs replaced) is written to ~/react_app/src/App.tsx and the
    project is built with `npm run build`.

    Args:
        code (str): The code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps); only the npm
            dependencies are used.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        CodeRunResult with the sandbox id, the url for the remote sandbox,
        the build success flag and the collected install/build stderr.
    """
    project_root = "~/react_app"
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    stderrs: list[str] = []  # to collect errors

    _, npm_dependencies = code_dependencies
    if npm_dependencies:
        print(f"Installing NPM dependencies...: {npm_dependencies}")
        install_errs = install_npm_dependencies(sandbox, npm_dependencies, project_root=project_root)
        stderrs.extend(install_errs)
        print("NPM dependencies installed. " + "Errors: " + str(install_errs))

    # replace placeholder URLs with SVG data URLs
    code = replace_placeholder_urls(code)

    # set up the sandbox
    print("Setting up sandbox directory structure...")
    file_path = f"{project_root}/src/App.tsx"
    sandbox.files.write(file_path, code, "user", 60)
    print("Code files written successfully.")

    is_run_success, _, build_stderrs = run_command_in_sandbox(
        sandbox=sandbox,
        command="npm run build --loglevel=error -- --mode development --logLevel error",
        working_directory=project_root,
    )
    stderrs.extend(build_stderrs)

    sandbox_url = get_sandbox_app_url(sandbox, 'react')
    return {
        'sandbox_id': sandbox.sandbox_id,
        'sandbox_url': sandbox_url,
        'is_run_success': is_run_success,
        'stderr': '\n'.join(stderrs),
    }
505
+
506
+
507
def run_vue_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> CodeRunResult:
    """
    Builds the provided Vue code within a sandboxed environment.

    The code (with placeholder image URLs replaced) is written to
    ~/vue_app/src/App.vue, NPM dependencies are installed, and the project
    is built with `npm run build`.

    Args:
        code (str): The Vue code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps); only the npm
            dependencies are used.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        CodeRunResult with the sandbox id, the url for the remote sandbox,
        the build success flag and the collected install/build stderr.
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
    project_root = "~/vue_app"

    stderrs: list[str] = []  # to collect errors

    # replace placeholder URLs with SVG data URLs
    code = replace_placeholder_urls(code)

    # Set up the sandbox
    file_path = f"{project_root}/src/App.vue"
    sandbox.files.write(file_path, code, "user", 60)

    _, npm_dependencies = code_dependencies
    if npm_dependencies:
        print(f"Installing NPM dependencies...: {npm_dependencies}")
        install_errs = install_npm_dependencies(sandbox, npm_dependencies, project_root=project_root)
        stderrs.extend(install_errs)
        print("NPM dependencies installed. " + "Errors: " + str(install_errs))

    is_run_success, _, build_stderrs = run_command_in_sandbox(
        sandbox=sandbox,
        command="npm run build --loglevel=error -- --mode development --logLevel error",
        working_directory=project_root,
    )
    stderrs.extend(build_stderrs)

    sandbox_url = get_sandbox_app_url(sandbox, 'vue')
    return {
        'sandbox_id': sandbox.sandbox_id,
        'sandbox_url': sandbox_url,
        'is_run_success': is_run_success,
        'stderr': '\n'.join(stderrs),
    }
550
+
551
+
552
def run_pygame_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> CodeRunResult:
    """
    Builds the provided pygame code within a sandboxed environment.

    The code is written to ~/pygame_app/main.py, pip dependencies are
    installed, and the app is built with `pygbag --build`.

    Args:
        code (str): The code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps); only the python
            dependencies are used.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        CodeRunResult with the sandbox id, the url for the remote sandbox,
        the build success flag and the collected install/build stderr.
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
    project_root = "~/pygame_app"
    file_path = f"{project_root}/main.py"

    stderrs = []

    sandbox.files.write(file_path, code, "user", 60)

    python_dependencies, _ = code_dependencies
    install_errs = install_pip_dependencies(sandbox, python_dependencies)
    stderrs.extend(install_errs)

    # build the pygame code
    is_run_success, _, build_stderrs = run_command_in_sandbox(
        sandbox=sandbox,
        command=f"pygbag --build {project_root}",
    )
    stderrs.extend(build_stderrs)

    sandbox_url = get_sandbox_app_url(sandbox, 'pygame')
    return {
        'sandbox_id': sandbox.sandbox_id,
        'sandbox_url': sandbox_url,
        'is_run_success': is_run_success,
        'stderr': '\n'.join(stderrs),
    }
588
+
589
+
590
def run_gradio_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> tuple[str, str, str]:
    """
    Starts the provided Gradio app within a sandboxed environment.

    The code is written to ~/gradio_app/main.py, pip dependencies are
    installed, and the script is launched as a background command.

    Args:
        code (str): The code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps); only the python
            dependencies are used.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        tuple: (sandbox_url, sandbox_id, stderr). The url points at port
        7860 of the sandbox host.
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    file_path = "~/gradio_app/main.py"
    sandbox.files.write(file_path, code, "user", 60)

    stderrs = []

    python_dependencies, _ = code_dependencies
    install_stderr = install_pip_dependencies(sandbox, python_dependencies)
    stderrs.extend(install_stderr)

    stderr = run_background_command_with_timeout(
        sandbox,
        f"python {file_path}",
        timeout=10,
    )
    stderrs.append(stderr)

    sandbox_url = 'https://' + sandbox.get_host(7860)

    return (sandbox_url, sandbox.sandbox_id, '\n'.join(stderrs))
621
+
622
+
623
def run_streamlit_sandbox(code: str, code_dependencies: tuple[list[str], list[str]], existing_sandbox_id: str | None = None) -> tuple[str, str, str]:
    """
    Starts the provided Streamlit app within a sandboxed environment.

    The code is written to ~/mystreamlit/app.py and pip dependencies are
    installed. Any process listening on port 8501 is killed before
    `streamlit run` is launched as a background command on that port.

    Args:
        code (str): The Streamlit code to be executed.
        code_dependencies: Tuple of (python_deps, npm_deps); only the python
            dependencies are used.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        tuple: (sandbox_url, sandbox_id, stderr)
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    stderrs = []

    # NOTE(review): the directory is created with a relative path while the
    # file is written under "~/" - presumably both resolve to the home dir.
    sandbox.files.make_dir('mystreamlit')
    file_path = "~/mystreamlit/app.py"
    sandbox.files.write(file_path, code, "user", 60)

    python_dependencies, _ = code_dependencies
    install_stderr = install_pip_dependencies(sandbox, python_dependencies)
    stderrs.extend(install_stderr)

    stderr = run_background_command_with_timeout(
        sandbox,
        r"sudo kill -9 $(ss -lptn 'sport = :8501' | grep -oP '(?<=pid=)\d+'); streamlit run ~/mystreamlit/app.py --server.port 8501 --server.headless true",
        timeout=8,
    )
    stderrs.append(stderr)

    host = sandbox.get_host(port=8501)
    url = f"https://{host}"
    return (url, sandbox.sandbox_id, '\n'.join(stderrs))
646
+
647
+
648
def run_c_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
    """
    Executes the provided C code within a sandboxed environment and returns the output.

    The code is written to ~/main.c, compiled with gcc to ~/main and the
    binary is run from that same path, so the command does not depend on
    the working directory.

    Args:
        code (str): The C code to be executed.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        tuple: (stdout, stderr)
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    file_path = "~/main.c"
    binary_path = "~/main"
    sandbox.files.write(file_path, code, "user", 60)

    _, stdouts, stderrs = run_command_in_sandbox(
        sandbox=sandbox,
        command=f"gcc {file_path} -o {binary_path} && {binary_path}",
        timeout=CODE_RUN_TIMEOUT_SECONDS,
    )

    # collect stdout, stderr from sandbox
    stdout = "\n".join(stdouts)
    stderr = "\n".join(stderrs)
    return stdout, stderr
673
+
674
+
675
def run_cpp_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
    """
    Executes the provided C++ code within a sandboxed environment and returns the output.

    The code is written to ~/main.cpp, compiled with g++ to ~/main and the
    binary is run from that same path, so the command does not depend on
    the working directory.

    Args:
        code (str): The C++ code to be executed.
        existing_sandbox_id: Sandbox to reuse; a new one is created otherwise.

    Returns:
        tuple: (stdout, stderr)
    """
    sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    file_path = "~/main.cpp"
    binary_path = "~/main"
    sandbox.files.write(file_path, code, "user", 60)

    _, stdouts, stderrs = run_command_in_sandbox(
        sandbox=sandbox,
        command=f"g++ {file_path} -o {binary_path} && {binary_path}",
        timeout=CODE_RUN_TIMEOUT_SECONDS,
    )

    # collect stdout, stderr from sandbox
    stdout = "\n".join(stdouts)
    stderr = "\n".join(stderrs)
    return stdout, stderr
700
+
701
+
702
def run_java_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
    """
    Compile the given Java source with ``javac`` and run it in a sandbox.

    The source file is named after the class name that
    ``extract_java_class_name`` returns for the code, and that same name is
    passed to ``java`` to launch the program.

    Args:
        code (str): The Java code to be executed.
        existing_sandbox_id (str | None): Forwarded to
            ``reuse_or_create_sandbox`` as ``sandbox_id``.

    Returns:
        tuple: (stdout, stderr), each joined with newlines.
    """
    java_sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    main_class = extract_java_class_name(code)
    source_path = f"~/{main_class}.java"
    java_sandbox.files.write(source_path, code, "user", 60)

    compile_and_run = f"javac {source_path} && java {main_class}"
    _succeeded, out_chunks, err_chunks = run_command_in_sandbox(
        sandbox=java_sandbox,
        command=compile_and_run,
        timeout=CODE_RUN_TIMEOUT_SECONDS,
    )

    # Flatten the collected output chunks into single strings
    return "\n".join(out_chunks), "\n".join(err_chunks)
728
+
729
+
730
def run_golang_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
    """
    Run the given Go source in a sandbox via ``go run``.

    Args:
        code (str): The Go code to be executed.
        existing_sandbox_id (str | None): Forwarded to
            ``reuse_or_create_sandbox`` as ``sandbox_id``.

    Returns:
        tuple: (stdout, stderr), each joined with newlines.
    """
    go_sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    source_path = "~/main.go"
    go_sandbox.files.write(source_path, code, "user", 60)

    _succeeded, out_chunks, err_chunks = run_command_in_sandbox(
        sandbox=go_sandbox,
        command=f"go run {source_path}",
        timeout=CODE_RUN_TIMEOUT_SECONDS,
    )

    # Flatten the collected output chunks into single strings
    return "\n".join(out_chunks), "\n".join(err_chunks)
755
+
756
+
757
+ # def run_csharp_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
758
+ # """
759
+ # Executes the provided C# code within a sandboxed environment and returns the output.
760
+
761
+ # Args:
762
+ # code (str): The C# code to be executed
763
+
764
+ # Returns:
765
+ # tuple: (stdout, stderr)
766
+ # """
767
+ # sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
768
+
769
+ # file_path = "~/main.cs"
770
+ # sandbox.files.write(file_path, code, "user", 60)
771
+
772
+ # is_success, stdouts, stderrs = run_command_in_sandbox(
773
+ # sandbox=sandbox,
774
+ # command=f"mcs {file_path} && mono main.exe",
775
+ # timeout=CODE_RUN_TIMEOUT_SECONDS,
776
+ # )
777
+
778
+ # # collect stdout, stderr from sandbox
779
+ # stdout = "\n".join(stdouts)
780
+ # stderr = "\n".join(stderrs)
781
+ # return stdout, stderr
782
+
783
+
784
def run_rust_code(code: str, existing_sandbox_id: str | None = None) -> tuple[str, str]:
    """
    Compile the given Rust source with ``rustc`` and run the result in a sandbox.

    Args:
        code (str): The Rust code to be executed.
        existing_sandbox_id (str | None): Forwarded to
            ``reuse_or_create_sandbox`` as ``sandbox_id``.

    Returns:
        tuple: (stdout, stderr), each joined with newlines.
    """
    rust_sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)

    source_path = "~/main.rs"
    rust_sandbox.files.write(source_path, code, "user", 60)

    compile_and_run = f"rustc {source_path} && ./main"
    _succeeded, out_chunks, err_chunks = run_command_in_sandbox(
        sandbox=rust_sandbox,
        command=compile_and_run,
        timeout=CODE_RUN_TIMEOUT_SECONDS,
    )

    # Flatten the collected output chunks into single strings
    return "\n".join(out_chunks), "\n".join(err_chunks)
809
+
810
+
811
def on_edit_code(
    state,
    sandbox_state: ChatbotSandboxState,
    sandbox_output_md: gr.Markdown,
    sandbox_ui: SandboxComponent,
    sandbox_code: str,
    sandbox_dependency: gr.Dataframe,
) -> Generator[tuple[Any, Any, Any, Any], None, None]:
    '''
    Gradio Handler when code is edited manually by users.

    Stores the edited code in ``sandbox_state``, rebuilds the dependency list
    by merging the imports found in the new code with the dependencies already
    recorded in the state, and then re-runs the code through ``on_run_code``.

    Nothing is done (all outputs are skipped) when the code is blank or equal
    to the code already stored in the state.

    Yields:
        4-tuples of updates for (sandbox_output_md, sandbox_ui, sandbox_code,
        sandbox_dependency).
    '''
    if sandbox_state['enable_sandbox'] is False:
        yield None, None, None, None
        return
    if not sandbox_code.strip() or sandbox_code == sandbox_state['code_to_execute']:
        yield gr.skip(), gr.skip(), gr.skip(), gr.skip()
        return
    sandbox_state['code_to_execute'] = sandbox_code

    def split_python_requirement(dep: str) -> tuple[str, str]:
        # Split "pkg<op>version" at the first version operator. The operator
        # set matches the one on_run_code and on_edit_dependency use, so a
        # specifier such as "pkg>1.0" is no longer mistaken for a package
        # name and dropped.
        hits = [dep.find(op) for op in ('==', '>=', '<=', '~=', '>', '<')]
        hits = [pos for pos in hits if pos != -1]
        if not hits:
            return dep, ""
        cut = min(hits)
        return dep[:cut], dep[cut:]

    # Extract packages from imports (without versions)
    python_deps_from_imports = set(extract_python_imports(sandbox_code))
    npm_deps_from_imports = set(extract_js_imports(sandbox_code))

    # Get existing dependencies with versions from state
    existing_python_deps, existing_npm_deps = sandbox_state["code_dependencies"]

    # Create dictionaries to track package versions
    python_deps_dict = {}  # pkg_name -> version
    npm_deps_dict = {}  # pkg_name -> version

    # First add existing dependencies with their specific versions
    for dep in existing_python_deps:
        pkg_name, version = split_python_requirement(dep)
        if version:  # If it has a specific version
            python_deps_dict[pkg_name] = version
        elif pkg_name in python_deps_from_imports:  # Only keep packages that are still imported
            python_deps_dict[pkg_name] = "latest"

    for dep in existing_npm_deps:
        if '@' in dep and not dep.startswith('@'):
            pkg_name = dep.split('@')[0]
            version = '@' + dep.split('@')[1]
        elif '@' in dep[1:]:  # Handle scoped packages
            pkg_name, version = dep.rsplit('@', 1)
            version = '@' + version
        else:
            pkg_name = dep
            version = "latest"
        if version != "latest":  # If it has a specific version
            npm_deps_dict[pkg_name] = version
        elif pkg_name in npm_deps_from_imports:  # Only keep packages that are still imported
            npm_deps_dict[pkg_name] = "latest"

    # Add new dependencies from imports with "latest" if not already present
    for dep in python_deps_from_imports:
        python_deps_dict.setdefault(dep, "latest")

    for dep in npm_deps_from_imports:
        npm_deps_dict.setdefault(dep, "latest")

    # Convert to dataframe format: [type, package name, version]
    dependencies = [
        ["python", pkg_name, version] for pkg_name, version in python_deps_dict.items()
    ]
    dependencies.extend(
        ["npm", pkg_name, version] for pkg_name, version in npm_deps_dict.items()
    )

    # If no dependencies found, provide default empty rows
    if not dependencies:
        dependencies = [["python", "", ""], ["npm", "", ""]]

    # Update dependencies in sandbox state ("latest" is stored as the bare name)
    sandbox_state["code_dependencies"] = (
        [f"{pkg}{ver}" if ver != "latest" else pkg for pkg, ver in python_deps_dict.items()],
        [f"{pkg}{ver}" if ver != "latest" else pkg for pkg, ver in npm_deps_dict.items()]
    )

    yield (
        gr.skip(),  # sandbox_output_md
        gr.skip(),  # sandbox_ui
        gr.skip(),  # sandbox_code
        gr.update(value=dependencies),  # sandbox_dependency
    )
    yield from on_run_code(
        state,
        sandbox_state,
        sandbox_output_md,
        sandbox_ui,
        sandbox_code,
        sandbox_dependency,
    )
909
+
910
+
911
def on_edit_dependency(
    state,
    sandbox_state: ChatbotSandboxState,
    sandbox_dependency: gr.Dataframe,
    sandbox_output_md: gr.Markdown,
    sandbox_ui: SandboxComponent,
    sandbox_code: str,
) -> Generator[tuple[Any, Any, Any, Any], None, None]:
    """
    Gradio handler invoked when the user edits the dependency table.

    Each non-empty row (type, package name, version) is turned into a pip or
    npm requirement string, the result is stored in ``sandbox_state``, the edit
    round is incremented and the code is re-run through ``on_run_code``.

    Yields:
        4-tuples of updates for (sandbox_output_md, sandbox_ui, sandbox_code,
        sandbox_dependency).
    """
    if sandbox_state["enable_sandbox"] is False:
        yield None, None, None, None
        return

    # Reject malformed tables up front and hand the table back unchanged
    is_valid, error_msg = validate_dependencies(sandbox_dependency)
    if not is_valid:
        yield (
            gr.Markdown(f"Invalid dependencies: {error_msg}"),
            gr.skip(),
            gr.skip(),
            sandbox_dependency,
        )
        return

    pip_operators = ("==", ">=", "<=", "~=", ">", "<")
    python_specs = []
    npm_specs = []
    for row in sandbox_dependency:
        kind, name, wanted = row
        name = name.strip()
        wanted = wanted.strip()

        # Rows without a package name are placeholders
        if not name:
            continue

        pinned = bool(wanted) and wanted.lower() != "latest"
        kind = kind.lower()

        if kind == "python":
            if not pinned:
                python_specs.append(name)
            elif any(op in wanted for op in pip_operators):
                # The user already supplied an operator: keep it verbatim
                python_specs.append(f"{name}{wanted}")
            else:
                # A bare version number means an exact pin
                python_specs.append(f"{name}=={wanted}")
        elif kind == "npm":
            if pinned:
                suffix = wanted if wanted.startswith("@") else "@" + wanted
                npm_specs.append(f"{name}{suffix}")
            else:
                npm_specs.append(name)

    # Persist the new dependency lists and count this as an edit
    sandbox_state["code_dependencies"] = (python_specs, npm_specs)
    sandbox_state['edit_round'] += 1

    # Acknowledge the update before the (slower) re-run starts
    yield (
        gr.Markdown("Dependencies updated successfully"),
        gr.skip(),  # sandbox_ui
        gr.skip(),  # sandbox_code
        sandbox_dependency,
    )

    # Re-run the code with the new dependencies
    yield from on_run_code(
        state,
        sandbox_state,
        sandbox_output_md,
        sandbox_ui,
        sandbox_code,
        sandbox_dependency,
    )
992
+
993
+
994
def on_click_code_message_run(
    state,
    sandbox_state: ChatbotSandboxState,
    sandbox_output_md: gr.Markdown,
    sandbox_ui: SandboxComponent,
    sandbox_code: str,
    sandbox_dependency: gr.Dataframe,
    evt: gr.SelectData
) -> Generator[SandboxGradioSandboxComponents, None, None]:
    '''
    Gradio Handler when run code button in message is clicked. Update Sandbox components.

    Extracts the code, language and dependencies from the selected message,
    stores them in ``sandbox_state`` (resetting the edit round), shows them in
    the code editor and dependency table, and then runs the code through
    ``on_run_code``.

    All outputs are skipped when the selected value does not end with the run
    code button HTML or when no code can be extracted from the message.

    Yields:
        4-tuples of updates for (sandbox_output_md, sandbox_ui, sandbox_code,
        sandbox_dependency).
    '''
    print("on_click_code_message_run")

    if sandbox_state['enable_sandbox'] is False:
        yield None, None, None, None
        return
    if not evt.value.endswith(RUN_CODE_BUTTON_HTML):
        yield gr.skip(), gr.skip(), gr.skip(), gr.skip()
        return

    message = evt.value.replace(RUN_CODE_BUTTON_HTML, "").strip()
    extract_result = extract_code_from_markdown(
        message=message,
        enable_auto_env=sandbox_state['sandbox_environment'] == SandboxEnvironment.AUTO
    )
    if extract_result is None:
        yield gr.skip(), gr.skip(), gr.skip(), gr.skip()
        return

    code, code_language, code_dependencies, env_selection = extract_result

    # The sandbox is reused, so unchanged code is deliberately re-run rather
    # than skipped.

    if code_language == 'tsx':
        code_language = 'typescript'
    # Guard the normalization: the expression below already allows for a
    # missing language, so an unguarded .lower() must not crash first.
    if code_language:
        code_language = code_language.lower()
    # ensure gradio supports the code language
    gradio_code_language = (
        code_language
        if code_language and code_language in VALID_GRADIO_CODE_LANGUAGES
        else None
    )

    def split_python_requirement(dep: str) -> tuple[str, str]:
        # Split "pkg<op>version" at the first version operator, using the
        # same operator set as on_run_code (including bare ">" and "<").
        hits = [dep.find(op) for op in ('==', '>=', '<=', '~=', '>', '<')]
        hits = [pos for pos in hits if pos != -1]
        if not hits:
            return dep, ""
        cut = min(hits)
        return dep[:cut], dep[cut:]

    python_deps, npm_deps = code_dependencies

    # Convert to dataframe format: [type, package name, version]
    dependencies = []

    # Add Python packages with versions
    for dep in python_deps:
        pkg_name, version = split_python_requirement(dep)
        dependencies.append(["python", pkg_name, version or "latest"])

    # Add NPM packages with versions
    for dep in npm_deps:
        if '@' in dep and not dep.startswith('@'):
            # Handle non-scoped packages with version
            pkg_name, version = dep.split('@', 1)
            dependencies.append(["npm", pkg_name, '@' + version])
        elif '@' in dep[1:]:  # Handle scoped packages with version
            # Split on last @ for scoped packages
            pkg_parts = dep.rsplit('@', 1)
            dependencies.append(["npm", pkg_parts[0], '@' + pkg_parts[1]])
        else:
            dependencies.append(["npm", dep, "latest"])

    # If no dependencies found, provide default empty rows
    if not dependencies:
        dependencies = [["python", "", ""], ["npm", "", ""]]

    sandbox_state['code_to_execute'] = code
    sandbox_state['code_language'] = code_language
    sandbox_state["code_dependencies"] = code_dependencies
    if sandbox_state['sandbox_environment'] == SandboxEnvironment.AUTO:
        sandbox_state['auto_selected_sandbox_environment'] = env_selection

    # reset edit round
    sandbox_state['edit_round'] = 0

    yield (
        gr.skip(),  # sandbox_output_md
        gr.skip(),  # sandbox_ui
        gr.update(value=code, language=gradio_code_language),  # sandbox_code
        gr.update(value=dependencies)  # sandbox_dependency
    )

    yield from on_run_code(
        state,
        sandbox_state,
        sandbox_output_md,
        sandbox_ui,
        sandbox_code,
        sandbox_dependency,
    )
1097
+
1098
+
1099
def on_run_code(
    state,
    sandbox_state: ChatbotSandboxState,
    sandbox_output_md: gr.Markdown,
    sandbox_ui: SandboxComponent,
    sandbox_code: str,
    sandbox_dependency: gr.Dataframe,
) -> Generator[tuple[Any, Any, Any, Any], None, None]:
    '''
    Gradio handler that runs the code stored in ``sandbox_state`` and streams
    updates for the sandbox components.

    The code, language and dependencies are read from ``sandbox_state``
    (``code_to_execute``, ``code_language``, ``code_dependencies``), not from
    the ``sandbox_code`` / ``sandbox_dependency`` arguments. The code is
    dispatched to the runner matching the selected sandbox environment.

    After the run, ``sandbox_state`` is updated: ``sandbox_run_round`` is
    incremented, ``sandbox_output`` / ``sandbox_error`` / ``sandbox_id`` are
    recorded, and telemetry is logged.

    Yields:
        4-tuples of updates for (sandbox_output_md, sandbox_ui, sandbox_code,
        sandbox_dependency).

    Raises:
        ValueError: If ``E2B_API_KEY`` is not set.
    '''
    print("on_run_code")

    if sandbox_state['enable_sandbox'] is False:
        yield None, None, None, None
        return

    # validate e2b api key
    if not E2B_API_KEY:
        raise ValueError("E2B_API_KEY is not set in env vars.")

    # hide and change value of the current sandbox UI to force refresh the sandbox
    # otherwise the sandbox might not change if the url is same
    yield (
        gr.skip(),
        SandboxComponent(
            value=('', False, []),
            label="Example",
            visible=False,
        ),
        gr.skip(),
        gr.skip(),
    )

    # Nothing to run until both the code and its language have been recorded
    code, code_language = sandbox_state['code_to_execute'], sandbox_state['code_language']
    if code is None or code_language is None:
        yield None, None, None, None
        return

    gradio_code_language = code_language.lower() if code_language and code_language.lower(
        # ensure gradio supports the code language
    ) in VALID_GRADIO_CODE_LANGUAGES else None

    # Use dependencies from sandbox_state instead of re-extracting
    code_dependencies = sandbox_state['code_dependencies']
    python_deps, npm_deps = code_dependencies

    # Helper function to extract package name without version
    def get_base_package_name(pkg: str) -> str:
        # For Python packages
        if any(op in pkg for op in ['==', '>=', '<=', '~=', '>', '<']):
            return pkg.split('==')[0].split('>=')[0].split('<=')[0].split('~=')[0].split('>')[0].split('<')[0]
        # For NPM packages
        if '@' in pkg and not pkg.startswith('@'):
            return pkg.split('@')[0]
        elif '@' in pkg[1:]:  # Handle scoped packages
            return pkg.rsplit('@', 1)[0]
        return pkg

    # Helper function to extract version from package string
    # (returns the operator/"@" prefix together with the version, or "latest")
    def get_package_version(pkg: str) -> str:
        # For Python packages
        if any(op in pkg for op in ['==', '>=', '<=', '~=', '>', '<']):
            base_name = get_base_package_name(pkg)
            return pkg[len(base_name):]
        # For NPM packages
        if '@' in pkg and not pkg.startswith('@'):
            return '@' + pkg.split('@', 1)[1]
        elif '@' in pkg[1:]:  # Handle scoped packages
            _, version = pkg.rsplit('@', 1)
            return '@' + version
        return "latest"

    # Create unified dependency dictionaries to avoid duplicates
    python_deps_dict = {}  # pkg_name -> version
    npm_deps_dict = {}  # pkg_name -> version

    # Process Python dependencies
    for dep in python_deps:
        base_name = get_base_package_name(dep)
        version = get_package_version(dep)
        # Only update if we don't have a version yet or if we're replacing 'latest'
        if base_name not in python_deps_dict or python_deps_dict[base_name] == "latest":
            python_deps_dict[base_name] = version

    # Process NPM dependencies
    for dep in npm_deps:
        base_name = get_base_package_name(dep)
        version = get_package_version(dep)
        # Only update if we don't have a version yet or if we're replacing 'latest'
        if base_name not in npm_deps_dict or npm_deps_dict[base_name] == "latest":
            npm_deps_dict[base_name] = version

    # Convert unified dictionaries to dataframe format: [type, package name, version]
    dependencies = []
    for pkg_name, version in python_deps_dict.items():
        dependencies.append(["python", pkg_name, version])
    for pkg_name, version in npm_deps_dict.items():
        dependencies.append(["npm", pkg_name, version])

    # If no dependencies found, provide default empty rows
    if not dependencies:
        dependencies = [["python", "", ""], ["npm", "", ""]]

    # Initialize output with loading message
    markdown_output_text = "### Sandbox Execution Log\n\n"
    yield (
        gr.Markdown(
            value=markdown_output_text + "🔄 Initializing sandbox environment...", visible=True
        ),
        SandboxComponent(visible=False),
        gr.Code(value=code, language=gradio_code_language, visible=True),
        gr.update(value=dependencies, visible=True),  # Update with unified dependencies
    )

    # Use auto_selected_sandbox_environment only when in AUTO mode, otherwise use sandbox_environment
    sandbox_env = (
        sandbox_state['auto_selected_sandbox_environment']
        if sandbox_state['sandbox_environment'] == SandboxEnvironment.AUTO
        else sandbox_state['sandbox_environment']
    )

    # Append `message` to the accumulated log text (optionally clearing it
    # first) and build a 4-tuple that updates only the markdown output.
    def update_markdown_output(message: str, clear_output: bool = False):
        nonlocal markdown_output_text
        if clear_output:
            markdown_output_text = ""
        markdown_output_text += f"\n{message}"
        return (
            gr.Markdown(value=markdown_output_text, visible=True, sanitize_html=False),
            gr.skip(),  # sandbox_ui: left unchanged
            gr.skip(),  # sandbox_code: left unchanged
            gr.skip()  # sandbox_dependency: left unchanged
        )

    sandbox_id: str | None = sandbox_state["sandbox_id"]  # the sandbox id
    sandbox_output: str = ""  # stdout from sandbox
    sandbox_error: str = ""  # stderr from sandbox
    print(f"sandbox_env: {sandbox_env}")
    # Every branch below follows the same shape: announce the run, call the
    # runner, then either report stderr or report success and show the result.
    # On failure no SandboxComponent is yielded, so the sandbox UI keeps the
    # hidden state set by the earlier yields.
    match sandbox_env:
        case SandboxEnvironment.HTML:
            yield update_markdown_output("🔄 Setting up HTML sandbox...")
            sandbox_url, sandbox_id, sandbox_error = run_html_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            if sandbox_error:
                yield update_markdown_output("❌ HTML sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ HTML sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(sandbox_url, True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.REACT:
            yield update_markdown_output("🔄 Setting up React sandbox...")
            # This runner returns a dict rather than a tuple
            code_run_result = run_react_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            sandbox_id, sandbox_error = code_run_result['sandbox_id'], code_run_result['stderr']
            # Stderr alone is not treated as failure; the run must also be flagged unsuccessful
            if code_run_result['is_run_success'] is False and sandbox_error:
                yield update_markdown_output("❌ React sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ React sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(code_run_result['sandbox_url'], True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.VUE:
            yield update_markdown_output("🔄 Setting up Vue sandbox...")
            # This runner returns a dict rather than a tuple
            code_run_result = run_vue_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            sandbox_id, sandbox_error = code_run_result['sandbox_id'], code_run_result['stderr']
            # Stderr alone is not treated as failure; the run must also be flagged unsuccessful
            if code_run_result['is_run_success'] is False and code_run_result['stderr']:
                yield update_markdown_output("❌ Vue sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{code_run_result['stderr']}\n```\n\n")
            else:
                yield update_markdown_output("✅ Vue sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(code_run_result['sandbox_url'], True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.PYGAME:
            yield update_markdown_output("🔄 Setting up PyGame sandbox...")
            # This runner returns a dict rather than a tuple
            code_run_result = run_pygame_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            sandbox_id, sandbox_error = code_run_result['sandbox_id'], code_run_result['stderr']
            # Stderr alone is not treated as failure; the run must also be flagged unsuccessful
            if code_run_result['is_run_success'] is False and code_run_result['stderr']:
                yield update_markdown_output("❌ PyGame sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{code_run_result['stderr']}\n```\n\n")
            else:
                yield update_markdown_output("✅ PyGame sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(code_run_result['sandbox_url'], True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.GRADIO:
            yield update_markdown_output("🔄 Setting up Gradio sandbox...")
            sandbox_url, sandbox_id, sandbox_error = run_gradio_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            if sandbox_error:
                yield update_markdown_output("❌ Gradio sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Gradio sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(sandbox_url, True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.STREAMLIT:
            yield update_markdown_output("🔄 Setting up Streamlit sandbox...")
            sandbox_url, sandbox_id, sandbox_error = run_streamlit_sandbox(
                code=code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            if sandbox_error:
                yield update_markdown_output("❌ Streamlit sandbox failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Streamlit sandbox is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(sandbox_url, True, []),
                        label="Example",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.MERMAID:
            yield update_markdown_output("🔄 Setting up Mermaid visualization...")
            # Convert Mermaid to HTML at execution time
            html_code = mermaid_to_html(code, theme='light')
            # The generated HTML is served through the HTML sandbox runner
            sandbox_url, sandbox_id, sandbox_error = run_html_sandbox(
                code=html_code,
                code_dependencies=code_dependencies,
                existing_sandbox_id=sandbox_state['sandbox_id'],
            )
            if sandbox_error:
                yield update_markdown_output("❌ Mermaid visualization failed to render!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Mermaid visualization is ready!", clear_output=True)
                yield (
                    gr.Markdown(value=markdown_output_text, visible=True),
                    SandboxComponent(
                        value=(sandbox_url, True, []),
                        label="Mermaid Diagram",
                        visible=True,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.PYTHON_RUNNER:
            yield update_markdown_output("🔄 Running Python Runner...", clear_output=True)
            # No sandbox id is passed to or returned from this runner
            sandbox_output, sandbox_error = run_code_interpreter(
                code=code, code_language='python', code_dependencies=code_dependencies
            )
            if sandbox_error:
                yield update_markdown_output("❌ Python Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # The runner output is appended as-is (not wrapped in a code
                # fence) and the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + sandbox_output,
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.JAVASCRIPT_RUNNER:
            yield update_markdown_output("🔄 Running JavaScript Runner...", clear_output=True)
            # No sandbox id is passed to or returned from this runner
            sandbox_output, sandbox_error = run_code_interpreter(
                code=code, code_language='javascript', code_dependencies=code_dependencies
            )
            if sandbox_error:
                yield update_markdown_output("❌ JavaScript Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # The runner output is appended as-is (not wrapped in a code
                # fence) and the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + sandbox_output,
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.C_RUNNER:
            yield update_markdown_output("🔄 Running C Runner...", clear_output=True)
            sandbox_output, sandbox_error = run_c_code(
                code=code, existing_sandbox_id=sandbox_state['sandbox_id']
            )
            if sandbox_error:
                yield update_markdown_output("❌ C Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # Stdout is shown inside a fenced block; the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + f"```markdown\n{sandbox_output}\n```",
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.CPP_RUNNER:
            yield update_markdown_output("🔄 Running C++ Runner...", clear_output=True)
            sandbox_output, sandbox_error = run_cpp_code(
                code=code, existing_sandbox_id=sandbox_state['sandbox_id']
            )
            if sandbox_error:
                yield update_markdown_output("❌ C++ Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # Stdout is shown inside a fenced block; the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + f"```markdown\n{sandbox_output}\n```",
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.JAVA_RUNNER:
            yield update_markdown_output("🔄 Running Java Runner...", clear_output=True)
            sandbox_output, sandbox_error = run_java_code(
                code=code, existing_sandbox_id=sandbox_state['sandbox_id']
            )
            if sandbox_error:
                yield update_markdown_output("❌ Java Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # Stdout is shown inside a fenced block; the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + f"```markdown\n{sandbox_output}\n```",
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case SandboxEnvironment.GOLANG_RUNNER:
            yield update_markdown_output("🔄 Running Go Runner...", clear_output=True)
            sandbox_output, sandbox_error = run_golang_code(
                code=code, existing_sandbox_id=sandbox_state['sandbox_id']
            )
            if sandbox_error:
                yield update_markdown_output("❌ Go Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # Stdout is shown inside a fenced block; the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + f"```markdown\n{sandbox_output}\n```",
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        # case SandboxEnvironment.CSHARP_RUNNER:
        #     yield update_markdown_output("🔄 Running C# Runner...", clear_output=True)
        #     output, stderr = run_csharp_code(
        #         code=code, existing_sandbox_id=sandbox_state['sandbox_id']
        #     )
        #     yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
        #     if output:
        #         yield update_markdown_output(f"### Stdout:\n```markdown\n{output}\n```\n\n")
        #     if stderr:
        #         yield update_markdown_output(f"### Stderr:\n```markdown\n{stderr}\n```\n\n")
        case SandboxEnvironment.RUST_RUNNER:
            yield update_markdown_output("🔄 Running Rust Runner...", clear_output=True)
            sandbox_output, sandbox_error = run_rust_code(
                code=code, existing_sandbox_id=sandbox_state['sandbox_id']
            )
            if sandbox_error:
                yield update_markdown_output("❌ Rust Runner failed to run!", clear_output=True)
                yield update_markdown_output(f"### Stderr:\n```markdown\n{sandbox_error}\n```\n\n")
            else:
                yield update_markdown_output("✅ Code execution is ready!", clear_output=True)
                # Stdout is shown inside a fenced block; the sandbox UI stays hidden
                yield (
                    gr.Markdown(
                        value=markdown_output_text + "\n\n" + f"```markdown\n{sandbox_output}\n```",
                        sanitize_html=False,
                        visible=True,
                    ),
                    SandboxComponent(
                        value=("", False, []),
                        label="Example",
                        visible=False,
                        key="newsandbox",
                    ),
                    gr.skip(),
                    gr.skip(),
                )
        case _:
            # No runner for this environment: show the code itself as markdown
            yield (
                gr.Markdown(value=code, visible=True),
                SandboxComponent(
                    value=("", False, []),
                    label="Example",
                    visible=False,
                    key="newsandbox",
                ),
                gr.skip(),
                gr.skip(),
            )

    # Record the outcome of this run in the shared state
    sandbox_state['sandbox_run_round'] += 1
    sandbox_state["sandbox_output"] = sandbox_output  # record sandbox output if exists
    sandbox_state["sandbox_error"] = sandbox_error  # record sandbox error if exists
    # generate a random sandbox id if not exists as some code runners might not return sandbox id
    sandbox_state['sandbox_id'] = sandbox_id if sandbox_id else str(uuid.uuid4())
    log_sandbox_telemetry_gradio_fn(
        sandbox_state=sandbox_state,
        sandbox_ui_value=None,
    )

    print("on_run_code done")
sandbox/constants.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Constants for sandbox.
3
+ '''
4
+
5
+ import os
6
+
7
+ E2B_API_KEY = os.environ.get("E2B_API_KEY")
8
+ '''
9
+ API key for the e2b API.
10
+ '''
11
+
12
+ AZURE_BLOB_STORAGE_CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
13
+ '''
14
+ Connection string for the Azure Blob Storage (read from the AZURE_STORAGE_CONNECTION_STRING environment variable).
15
+ '''
16
+
17
+ AZURE_BLOB_STORAGE_CONTAINER_NAME = "softwarearenalogs"
18
+ '''
19
+ Container name for the Azure Blob Storage.
20
+ '''
21
+
22
+ SANDBOX_TEMPLATE_ID: str = "bxq9sha9l55ytsyfturr"
23
+ '''
24
+ Template ID for the sandbox.
25
+ '''
26
+
27
+ SANDBOX_NGINX_PORT: int = 8000
28
+ '''
29
+ Nginx port for the sandbox.
30
+ '''
31
+
32
+ SANDBOX_TIMEOUT_SECONDS: int = 1 * 60
33
+ '''
34
+ Timeout in seconds for created sandboxes to expire.
35
+ '''
36
+
37
+ CODE_RUN_TIMEOUT_SECONDS: int = 60
38
+ '''
39
+ Timeout in seconds for code execution.
40
+ '''
41
+
42
+ SANDBOX_RETRY_COUNT: int = 3
43
+ '''
44
+ Number of times to retry the sandbox creation.
45
+ '''
46
+
47
+ INSTALLED_PYPI_PACKAGES = [
48
+ "boto3",
49
+ "botocore",
50
+ "urllib3",
51
+ "setuptools",
52
+ "requests",
53
+ "certifi",
54
+ "idna",
55
+ "charset-normalizer",
56
+ "packaging",
57
+ "typing-extensions",
58
+ "python-dateutil",
59
+ "aiobotocore",
60
+ "s3transfer",
61
+ "grpcio-status",
62
+ "pyyaml",
63
+ "six",
64
+ "fsspec",
65
+ "s3fs",
66
+ "numpy",
67
+ "wheel",
68
+ "pip",
69
+ "cryptography",
70
+ "awscli",
71
+ "pydantic",
72
+ "cffi",
73
+ "attrs",
74
+ "google-api-core",
75
+ "pycparser",
76
+ "pandas",
77
+ "importlib-metadata",
78
+ "jmespath",
79
+ "click",
80
+ "zipp",
81
+ "rsa",
82
+ "pyasn1",
83
+ "markupsafe",
84
+ "pytz",
85
+ "colorama",
86
+ "protobuf",
87
+ "platformdirs",
88
+ "jinja2",
89
+ "rich",
90
+ "tomli",
91
+ "pytest",
92
+ "pydantic-core",
93
+ "pyjwt",
94
+ "pluggy",
95
+ "aiohttp",
96
+ "virtualenv",
97
+ "jsonschema",
98
+ "googleapis-common-protos",
99
+ "cachetools",
100
+ "google-auth",
101
+ "filelock",
102
+ "wrapt",
103
+ "sqlalchemy",
104
+ "docutils",
105
+ "pyasn1-modules",
106
+ "pyarrow",
107
+ "greenlet",
108
+ "iniconfig",
109
+ "pygments",
110
+ "annotated-types",
111
+ "yarl",
112
+ "requests-oauthlib",
113
+ "tzdata",
114
+ "psutil",
115
+ "multidict",
116
+ "pyparsing",
117
+ "requests-toolbelt",
118
+ "exceptiongroup",
119
+ "werkzeug",
120
+ "soupsieve",
121
+ "oauthlib",
122
+ "beautifulsoup4",
123
+ "frozenlist",
124
+ "more-itertools",
125
+ "distlib",
126
+ "tomlkit",
127
+ "pathspec",
128
+ "aiosignal",
129
+ "grpcio",
130
+ "tqdm",
131
+ "scipy",
132
+ "async-timeout",
133
+ "pillow",
134
+ "isodate",
135
+ "anyio",
136
+ "sortedcontainers",
137
+ "decorator",
138
+ "markdown-it-py",
139
+ "deprecated",
140
+ "mypy-extensions",
141
+ "sniffio",
142
+ "httpx",
143
+ "coverage",
144
+ "openpyxl",
145
+ "flask",
146
+ "rpds-py",
147
+ "et-xmlfile"
148
+ ]
sandbox/prompts.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Prompts for the sandbox.
3
+ '''
4
+
5
+ GENERAL_SANDBOX_INSTRUCTION = """\
6
+ You are an expert Software Engineer, UI/UX designer, and product manager. Your task is to generate self-contained, executable code for a single file or block that can run directly in a sandbox environment. Feel free to ask questions or explain your reasoning.
7
+ If you do a great job based on the instructions, you will be rewarded with a high salary and a promotion.
8
+
9
+ Your code must be written using one of these supported development frameworks and environments:
10
+ - React (JavaScript/TypeScript)
11
+ - Vue (JavaScript/TypeScript)
12
+ - HTML (Vanilla HTML)
13
+ - Gradio (Python)
14
+ - Streamlit (Python)
15
+ - PyGame (Python)
16
+ - Mermaid (Markdown)
17
+ - Python Runner
18
+ - JavaScript Runner
19
+ - Command Line Code Runner (C/C++/Go/Java/Rust)
20
+
21
+ All web framework code (React, Vue, HTML) must be directly rendered in a browser and immediately executable without additional setup. DO NOT create separate CSS files
22
+ Python-based frameworks should be directly executable in a browser environment.
23
+ The code to be executed in Runners must be plain Python or JavaScript programs that do not require web UI frameworks or standard user input.
24
+
25
+ The code must be in the markdown format:
26
+ ```<language>
27
+ <code>
28
+ ```
29
+
30
+ Before you begin writing any code, you must follow these fundamental rules:
31
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
32
+ - Your response must contain a clear explanation of the solution you are providing
33
+ - ALWAYS generate complete, self-contained code in a single file
34
+ - You CAN NOT split your program into multiple files or multiple code blocks
35
+ - If you use any external libraries, make sure to specify them for the installation command in either `pip install` or `npm install`
36
+ - You prefer JavaScript over HTML
37
+ - Each code block must be completely independent. If modifications are needed, the entire code block must be rewritten
38
+ - When fetching data, you MUST use external libraries and packages, and avoid using placeholder URLs or URLs that require API keys
39
+ - Make sure the program is functional by creating a state when needed and having no required props
40
+ - Make sure to include all necessary code in one file
41
+ - There are no additional files in the local file system, unless you create them inside the same program
42
+ - Do not touch project dependencies files like package.json, package-lock.json, requirements.txt, etc
43
+
44
+ When developing with React or Vue components, follow these specific requirements:
45
+ - Use TypeScript or JavaScript as the language
46
+ - DO NOT use gray text color on a white background
47
+ - Make sure it can run by itself by using a default export at the end of the file
48
+ - DO NOT CALL `ReactDOM.render()` AT THE END OF THE FILE
49
+ - Use Tailwind classes for styling. DO NOT USE ARBITRARY VALUES (e.g. 'h-[600px]'). Make sure to use a consistent color palette
50
+ - If you use any imports from React like `useState` or `useEffect`, make sure to import them directly
51
+ - Use Tailwind margin and padding classes to style the components and ensure proper spacing
52
+ - Various npm packages are available to be imported, e.g. `import { LineChart, XAxis, ... } from "recharts"` & `<LineChart ...><XAxis dataKey="name"> ...`
53
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
54
+
55
+ For Python development, you must follow these constraints:
56
+ - For any programs that require user inputs, you MUST USE `gradio` or `streamlit`
57
+ - Choose suitable PyPI packages to be imported, e.g., `import pandas`
58
+ - Avoid using libraries that require desktop GUI interfaces, with the exceptions of `pygame`, `gradio`, and `streamlit` which are explicitly supported
59
+ - For PyGame applications, you have to write the main function as an async function like:
60
+ ```python
61
+ import asyncio
62
+ import pygame
63
+
64
+ async def main():
65
+ global game_state
66
+ while game_state:
67
+ game_state(pygame.event.get())
68
+ pygame.display.update()
69
+ await asyncio.sleep(0) # it must be called on every frame
70
+
71
+ if __name__ == "__main__":
72
+ asyncio.run(main())
73
+ ```
74
+
75
+ For HTML development, ensure that:
76
+ - All HTML code must be self-contained in a single file
77
+ - Include any necessary CSS and JavaScript within the HTML file
78
+ - Ensure the code is directly executable in a browser environment
79
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
80
+
81
+ For Mermaid development:
82
+ - Write Mermaid diagrams directly using ```mermaid code blocks, e.g.:
83
+ ```mermaid
84
+ graph TD;
85
+ A-->B;
86
+ ```
87
+
88
+ For Command Line Code Runner (C/C++/Go/Java/Rust), ensure that:
89
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
90
+ - Your code should be able to be compiled and run directly.
91
+ - Your code must complete the task without any user inputs. It should not be long running.
92
+ - You should provide example test cases in the code and output the result to stdout or stderr.
93
+
94
+ The code must be in the markdown format:
95
+ ```<language>
96
+ <code>
97
+ ```
98
+ """
99
+
100
+ DEFAULT_PYTHON_RUNNER_INSTRUCTION = """
101
+ You are an expert Software Engineer. Your task is to generate self-contained, executable Python code that can run directly in a code interpreter environment.
102
+
103
+ Before you begin writing any code, you must follow these fundamental rules:
104
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
105
+ - Your response must contain a clear explanation of the solution you are providing
106
+ - ALWAYS generate complete, self-contained code in a single file
107
+ - If you use any external libraries, make sure to specify them for installation with `pip install`
108
+ - Make sure to include all necessary code in one file
109
+ - Make sure it does not require any user inputs
110
+ - Choose suitable PyPI packages to be imported, e.g., `import pandas`
111
+
112
+ The code must be in the markdown format:
113
+ ```python
114
+ <code>
115
+ ```
116
+
117
+ You can output in stdout, stderr, or render images, plots, and tables.
118
+ """
119
+
120
+ DEFAULT_JAVASCRIPT_RUNNER_INSTRUCTION = """
121
+ You are an expert Software Engineer. Your task is to generate self-contained JavaScript code that can run directly in a code interpreter environment.
122
+
123
+ Before you begin writing any code, you must follow these fundamental rules:
124
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
125
+ - Your response must contain a clear explanation of the solution you are providing
126
+ - ALWAYS generate complete, self-contained code in a single file
127
+ - If you use any external libraries, make sure to specify them for installation with `npm install`
128
+ - Make sure to include all necessary code in one file
129
+ - Ensure the code is self-contained and does not rely on browser-specific APIs
130
+
131
+ The code must be in the markdown format:
132
+ ```javascript
133
+ <code>
134
+ ```
135
+
136
+ You can output in stdout, stderr, or render images, plots, and tables.
137
+ """
138
+
139
+ DEFAULT_HTML_SANDBOX_INSTRUCTION = """
140
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate self-contained HTML code that can run directly in a browser environment.
141
+
142
+ Before you begin writing any code, you must follow these fundamental rules:
143
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
144
+ - Your response must contain a clear explanation of the solution you are providing
145
+ - ALWAYS generate complete, self-contained code in a single file
146
+ - Include any necessary CSS and JavaScript within the HTML file
147
+ - If you use any external libraries, make sure to specify them
148
+ - Make sure the program is functional by creating a state when needed
149
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
150
+
151
+ The code must be in the markdown format:
152
+ ```html
153
+ <code>
154
+ ```
155
+
156
+ For HTML development, ensure that:
157
+ - All HTML code must be self-contained in a single file
158
+ - Include any necessary CSS and JavaScript within the HTML file
159
+ - Ensure the code is directly executable in a browser environment
160
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
161
+ """
162
+
163
+ DEFAULT_REACT_SANDBOX_INSTRUCTION = """
164
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate a self-contained React component using TypeScript that can run directly in a browser environment.
165
+
166
+ Before you begin writing any code, you must follow these fundamental rules:
167
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
168
+ - Your response must contain a clear explanation of the solution you are providing
169
+ - ALWAYS generate complete, self-contained code in a single file
170
+ - If you use any external libraries, make sure to specify them for installation with `npm install`
171
+ - Make sure the program is functional by creating a state when needed and having no required props
172
+ - Make sure it can run by itself by using a default export at the end of the file
173
+ - DO NOT CALL `ReactDOM.render()` AT THE END OF THE FILE
174
+ - Use Tailwind classes for styling. DO NOT USE ARBITRARY VALUES (e.g. 'h-[600px]'). Make sure to use a consistent color palette
175
+ - If you use any imports from React like `useState` or `useEffect`, make sure to import them directly
176
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
177
+
178
+ The code must be in the markdown format:
179
+ ```typescript
180
+ <code>
181
+ ```
182
+
183
+ When developing with React components, follow these specific requirements:
184
+ - Use TypeScript or JavaScript as the language
185
+ - DO NOT use gray text color on a white background
186
+ - Make sure it can run by itself by using a default export at the end of the file
187
+ - DO NOT CALL `ReactDOM.render()` AT THE END OF THE FILE
188
+ - Use Tailwind classes for styling. DO NOT USE ARBITRARY VALUES (e.g. 'h-[600px]'). Make sure to use a consistent color palette
189
+ - If you use any imports from React like `useState` or `useEffect`, make sure to import them directly
190
+ - Use Tailwind margin and padding classes to style the components and ensure proper spacing
191
+ - Various npm packages are available to be imported, e.g. `import { LineChart, XAxis, ... } from "recharts"` & `<LineChart ...><XAxis dataKey="name"> ...`
192
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
193
+ """
194
+
195
+ DEFAULT_VUE_SANDBOX_INSTRUCTION = """
196
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate a self-contained Vue.js component using TypeScript that can run directly in a browser environment.
197
+
198
+ Before you begin writing any code, you must follow these fundamental rules:
199
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
200
+ - Your response must contain a clear explanation of the solution you are providing
201
+ - ALWAYS generate complete, self-contained code in a single file
202
+ - If you use any external libraries, make sure to specify them for installation with `npm install`
203
+ - Make sure the program is functional by creating a state when needed and having no required props
204
+ - The component should be a simple custom page in a styled `<div>` element
205
+ - Do not include <NuxtWelcome /> or reference any external components
206
+ - Use Tailwind classes for styling. DO NOT USE ARBITRARY VALUES (e.g. 'h-[600px]'). Make sure to use a consistent color palette
207
+ - Images from the web are not allowed, but you can use placeholder images by specifying the width and height like so `<img src="/api/placeholder/400/320" alt="placeholder" />`
208
+
209
+ The code must be in the markdown format:
210
+ ```vue
211
+ <code>
212
+ ```
213
+
214
+ When developing with Vue components, follow these specific requirements:
215
+ - Use Vue 3's Composition API with <script setup> syntax for better TypeScript integration
216
+ - Use TypeScript for type safety and better developer experience
217
+ - Properly type all props, emits, and refs using Vue 3's type system
218
+ - Use defineProps, defineEmits, and other Vue 3 macros correctly
219
+ - Implement reactive state management using ref() or reactive() from Vue
220
+ - Follow Vue 3's best practices for component organization and lifecycle management
221
+ - Use computed properties for derived state
222
+ - Handle component events using proper Vue 3 event handling syntax
223
+ - Use Tailwind classes for styling with a consistent design system
224
+ - Ensure components are responsive using Tailwind's responsive classes
225
+ - Use Vue's built-in transition and animation systems when needed
226
+ - Follow proper Vue 3 security practices (e.g., v-html only when necessary)
227
+ - Implement proper error handling and loading states
228
+ - Add comments explaining complex logic or component structure
229
+ - Use async/await for asynchronous operations
230
+ - Ensure the component is accessible following ARIA best practices
231
+ """
232
+
233
+ DEFAULT_PYGAME_SANDBOX_INSTRUCTION = """
234
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate self-contained PyGame code that can run directly in a browser environment.
235
+
236
+ Before you begin writing any code, you must follow these fundamental rules:
237
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
238
+ - Your response must contain a clear explanation of the solution you are providing
239
+ - ALWAYS generate complete, self-contained code in a single file
240
+ - If you use any external libraries, make sure to specify them for installation with `pip install`
241
+ - Make sure it does not require any user inputs
242
+ - Write the main function as an async function like:
243
+
244
+ ```python
245
+ import asyncio
246
+ import pygame
247
+
248
+ async def main():
249
+ global game_state
250
+ while game_state:
251
+ game_state(pygame.event.get())
252
+ pygame.display.update()
253
+ await asyncio.sleep(0) # it must be called on every frame
254
+
255
+ if __name__ == "__main__":
256
+ asyncio.run(main())
257
+ ```
258
+
259
+ The code must be in the markdown format:
260
+ ```python
261
+ <code>
262
+ ```
263
+ """
264
+
265
+ DEFAULT_GRADIO_SANDBOX_INSTRUCTION = """
266
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate self-contained Gradio application code that can run directly in a browser environment.
267
+
268
+ Before you begin writing any code, you must follow these fundamental rules:
269
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
270
+ - Your response must contain a clear explanation of the solution you are providing
271
+ - ALWAYS generate complete, self-contained code in a single file
272
+ - If you use any external libraries, make sure to specify them for installation with `pip install`
273
+ - Make sure it does not require any user inputs
274
+ - Choose suitable PyPI packages to be imported, e.g., `import pandas`
275
+
276
+ The code must be in the markdown format:
277
+ ```python
278
+ <code>
279
+ ```
280
+ """
281
+
282
+ DEFAULT_STREAMLIT_SANDBOX_INSTRUCTION = """
283
+ You are an expert Software Engineer and UI/UX designer. Your task is to generate self-contained Streamlit application code that can run directly in a browser environment.
284
+
285
+ Before you begin writing any code, you must follow these fundamental rules:
286
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
287
+ - Your response must contain a clear explanation of the solution you are providing
288
+ - ALWAYS generate complete, self-contained code in a single file
289
+ - If you use any external libraries, make sure to specify them for installation with `pip install`
290
+ - Make sure it does not require any user inputs
291
+ - Choose suitable PyPI packages to be imported, e.g., `import pandas`
292
+ - The app should automatically reload when changes are made
293
+
294
+ The code must be in the markdown format:
295
+ ```python
296
+ <code>
297
+ ```
298
+ """
299
+
300
+ DEFAULT_MERMAID_SANDBOX_INSTRUCTION = """
301
+ You are an expert Software Engineer. Your task is to generate self-contained Mermaid diagram code that can be rendered directly.
302
+
303
+ Before you begin writing any code, you must follow these fundamental rules:
304
+ - You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
305
+ - Your response must contain a clear explanation of the solution you are providing
306
+ - ALWAYS generate complete, self-contained code in a single file
307
+
308
+ The code must be in the markdown format:
309
+ ```mermaid
310
+ <code>
311
+ ```
312
+
313
+ Example:
314
+ ```mermaid
315
+ graph TD;
316
+ A-->B;
317
+ ```
318
+ """
319
+
320
+ DEFAULT_C_CODE_RUN_SANDBOX_INSTRUCTION = """
321
+ You are an expert Software Engineer. Your task is to generate self-contained C code that can run directly in a code runner environment.
322
+
323
+ Ensure that:
324
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
325
+ - Your code should be able to be compiled and run directly.
326
+ - Your code must complete the task without any user inputs. It should not be long running.
327
+ - You should provide example test cases in the code and output the result to stdout or stderr.
328
+
329
+ The code must be in the markdown format:
330
+ ```c
331
+ <code>
332
+ ```
333
+ """
334
+
335
+ DEFAULT_CPP_CODE_RUN_SANDBOX_INSTRUCTION = """
336
+ You are an expert Software Engineer. Your task is to generate self-contained C++ code that can run directly in a code runner environment.
337
+
338
+ Ensure that:
339
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
340
+ - Your code should be able to be compiled and run directly.
341
+ - Your code must complete the task without any user inputs. It should not be long running.
342
+ - You should provide example test cases in the code and output the result to stdout or stderr.
343
+
344
+ The code must be in the markdown format:
345
+ ```cpp
346
+ <code>
347
+ ```
348
+ """
349
+
350
+ DEFAULT_JAVA_CODE_RUN_SANDBOX_INSTRUCTION = """
351
+ You are an expert Software Engineer. Your task is to generate self-contained Java code that can run directly in a code runner environment.
352
+
353
+ Ensure that:
354
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
355
+ - Your code should be able to be compiled and run directly.
356
+ - Your code must complete the task without any user inputs. It should not be long running.
357
+ - You should provide example test cases in the code and output the result to stdout or stderr.
358
+
359
+ The code must be in the markdown format:
360
+ ```java
361
+ <code>
362
+ ```
363
+ """
364
+
365
+ DEFAULT_GOLANG_CODE_RUN_SANDBOX_INSTRUCTION = """
366
+ You are an expert Software Engineer. Your task is to generate self-contained Go code that can run directly in a code runner environment.
367
+
368
+ Ensure that:
369
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
370
+ - Your code should be able to be compiled and run directly.
371
+ - Your code must complete the task without any user inputs. It should not be long running.
372
+ - You should provide example test cases in the code and output the result to stdout or stderr.
373
+
374
+ The code must be in the markdown format:
375
+ ```go
376
+ <code>
377
+ ```
378
+ """
379
+
380
+ DEFAULT_RUST_CODE_RUN_SANDBOX_INSTRUCTION = """
381
+ You are an expert Software Engineer. Your task is to generate self-contained Rust code that can run directly in a code runner environment.
382
+
383
+ Ensure that:
384
+ - ALWAYS generate complete, self-contained code in a single file. Avoid non-standard libraries.
385
+ - Your code should be able to be compiled and run directly.
386
+ - Your code must complete the task without any user inputs. It should not be long running.
387
+ - You should provide example test cases in the code and output the result to stdout or stderr.
388
+
389
+ The code must be in the markdown format:
390
+ ```rust
391
+ <code>
392
+ ```
393
+ """
sandbox/sandbox_manager.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Facades for interacting with the e2b sandbox.
3
+ '''
4
+
5
+ import json
6
+ from typing import Literal
7
+ from e2b import Sandbox
8
+ from e2b.sandbox.commands.command_handle import CommandExitException
9
+ from e2b.exceptions import TimeoutException
10
+ import time
11
+ import threading
12
+ from httpcore import ReadTimeout
13
+ import queue
14
+
15
+ from .constants import E2B_API_KEY, SANDBOX_TEMPLATE_ID, SANDBOX_NGINX_PORT, SANDBOX_RETRY_COUNT, SANDBOX_TIMEOUT_SECONDS, INSTALLED_PYPI_PACKAGES
16
+
17
+
18
def create_sandbox(template: str = SANDBOX_TEMPLATE_ID) -> Sandbox:
    '''
    Start a fresh e2b sandbox from the given template.

    Creation is attempted up to SANDBOX_RETRY_COUNT times. After a failed
    attempt that is not the last one, the function sleeps `attempt` seconds
    before trying again; the exception of the last failed attempt is
    re-raised to the caller.

    Args:
        template: ID of the sandbox template to instantiate.

    Returns:
        The newly created Sandbox.

    Raises:
        Exception: Whatever the final creation attempt raised.
        RuntimeError: If the loop ends without any attempt returning or
            raising (only possible when the retry count is below 1).
    '''
    print("Creating new sandbox...")
    attempt = 1
    while attempt <= SANDBOX_RETRY_COUNT:
        try:
            new_sandbox = Sandbox(
                api_key=E2B_API_KEY,
                domain="e2b-foxtrot.dev",
                template=template,
                timeout=SANDBOX_TIMEOUT_SECONDS,
            )
        except Exception:
            out_of_attempts = attempt >= SANDBOX_RETRY_COUNT
            if out_of_attempts:
                raise
            # Linear back-off: wait one more second after every failure.
            time.sleep(1 * attempt)
        else:
            return new_sandbox
        attempt += 1
    raise RuntimeError("Failed to create sandbox after maximum attempts")
38
+
39
+
40
def reuse_or_create_sandbox(sandbox_id: str | None, template: str = SANDBOX_TEMPLATE_ID) -> Sandbox:
    '''
    Reuse an existing sandbox if it is running, otherwise create a new sandbox.

    When `sandbox_id` is given, the function connects to it, checks that it
    is running and extends its timeout to SANDBOX_TIMEOUT_SECONDS. If any of
    these steps fails (or the sandbox is not running), the failure is
    reported and a brand-new sandbox is created instead, so a stale or
    half-reachable sandbox handle is never returned.

    Args:
        sandbox_id: ID of a previously created sandbox, or None to always
            create a new one.
        template: Template ID used when a new sandbox has to be created.

    Returns:
        A connected, running Sandbox.

    Raises:
        Exception: Propagated from create_sandbox when creating the
            replacement sandbox fails.
    '''
    if sandbox_id is not None:
        try:
            # NOTE(review): create_sandbox passes domain="e2b-foxtrot.dev" but
            # connect() is called without a domain -- confirm both resolve to
            # the same e2b deployment (e.g. via environment configuration).
            sandbox = Sandbox.connect(
                sandbox_id=sandbox_id,
                api_key=E2B_API_KEY,
            )
            if sandbox.is_running(request_timeout=5):
                sandbox.set_timeout(timeout=SANDBOX_TIMEOUT_SECONDS)
                return sandbox
        except Exception as e:
            # Reuse is best-effort: report why it failed and fall through to
            # creating a fresh sandbox rather than returning a broken handle.
            print(f"Could not reuse sandbox {sandbox_id}, creating a new one: {e}")

    return create_sandbox(template=template)
63
+
64
+
65
def run_command_in_sandbox(
    sandbox: Sandbox,
    command: str,
    working_directory: str | None = None,
    timeout: int = 60,
    print_output: bool = True,
) -> tuple[bool, list[str], list[str]]:
    '''
    Execute a command in the sandbox and collect everything it prints.

    Args:
        sandbox: The sandbox to run the command in.
        command: The command line to execute.
        working_directory: Directory to run the command in, if any.
        timeout: Command timeout in seconds; the request timeout passed to
            the sandbox is five seconds longer.
        print_output: Whether to echo the command and its collected output.

    Returns:
        A tuple (success, stdout messages, stderr messages). `success` is
        True only when a result was returned with exit code 0. If running
        the command raises, the exception text is appended to the stderr
        messages and `success` is False.
    '''
    stdouts: list[str] = []
    stderrs: list[str] = []

    try:
        command_result = sandbox.commands.run(
            cmd=command,
            cwd=working_directory,
            timeout=timeout,
            request_timeout=timeout + 5,
            on_stdout=stdouts.append,
            on_stderr=stderrs.append,
        )
        is_run_success = bool(command_result and command_result.exit_code == 0)
    except Exception as e:
        # Failures are reported through the stderr list instead of raising.
        stderrs.append(str(e))
        is_run_success = False

    if print_output:
        print(f"Command: {command}")
        for line in stdouts + stderrs:
            print(line)

    return is_run_success, stdouts, stderrs
103
+
104
+
105
def install_pip_dependencies(sandbox: Sandbox, dependencies: list[str]) -> list[str]:
    '''
    Install pip dependencies in the sandbox.

    Dependencies listed in INSTALLED_PYPI_PACKAGES are skipped. Every other
    dependency is installed with its own `uv pip install --system` command
    (3 minute timeout), so one failing package does not prevent the others
    from being installed.

    Args:
        sandbox: The sandbox to install the packages in.
        dependencies: Requirement strings to install, e.g. "requests" or
            "pandas>=2.0".

    Returns:
        One error message per dependency whose install command raised;
        empty when everything succeeded or nothing had to be installed.
    '''
    # Local import keeps this block self-contained.
    import shlex

    install_errors: list[str] = []
    if not dependencies:
        return install_errors

    for dependency in dependencies:
        if dependency in INSTALLED_PYPI_PACKAGES:
            continue
        try:
            sandbox.commands.run(
                # Quote the requirement: it is passed through a shell, where
                # characters such as ">", "<", "[" or ";" would otherwise be
                # interpreted (e.g. "pkg>=1.0" becomes an output redirect).
                f"uv pip install --system {shlex.quote(dependency)}",
                timeout=60 * 3,
                on_stdout=lambda message: print(message),
                on_stderr=lambda message: print(message),
            )
        except Exception as e:
            install_errors.append(f"Error during installing pip package {dependency}: {str(e)}")

    return install_errors
129
+
130
+
131
def parse_npm_package_name(package: str) -> tuple[str, str | None]:
    '''
    Split an npm package spec into its name and optional version.

    abc@123 -> abc, 123
    abc -> abc, None
    @scope/abc@123 -> @scope/abc, 123
    @scope/abc -> @scope/abc, None

    Args:
        package: Package spec as passed to `npm install`.

    Returns:
        A tuple (name, version); version is None when the spec has none.
    '''
    # Search from index 1 so that the leading "@" of a scoped package is not
    # mistaken for the name/version separator.
    separator_index = package.find("@", 1)
    if separator_index == -1:
        return package, None
    return package[:separator_index], package[separator_index + 1:]
134
+
135
+
136
def is_npm_package_installed(package: str, installed_packages: dict[str, str | None]) -> bool:
    '''
    Tell whether an npm package spec is already satisfied.

    Args:
        package: Package spec, optionally with a version (name@version).
        installed_packages: Mapping of installed package name to version.

    Returns:
        True when the package name is installed and either no version was
        requested or the installed version equals the requested one.
    '''
    name, wanted_version = parse_npm_package_name(package)
    if name not in installed_packages:
        return False
    if wanted_version is None:
        # No specific version requested: any installed version will do.
        return True
    return installed_packages[name] == wanted_version
139
+
140
+
141
def _parse_npm_list_output(chunks: list[str]) -> dict:
    '''Decode the stdout chunks of `npm list --json` into one dict ({} if impossible).'''
    text = "".join(chunks).strip()
    if not text:
        return {}
    try:
        # Stdout may arrive split at arbitrary points, so the chunks are
        # decoded together as one document first.
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: treat every chunk as its own document and keep the last
        # one that decodes.
        data = None
        for chunk in chunks:
            if not chunk.strip():
                continue
            try:
                data = json.loads(chunk)
            except json.JSONDecodeError:
                continue
    return data if isinstance(data, dict) else {}


def get_installed_npm_packages(sandbox: Sandbox, project_root: str) -> dict[str, str | None]:
    '''
    List the top-level npm packages installed in a sandbox project.

    Runs `npm list --depth=0 --json` in `project_root` (30 second timeout)
    and reads the "dependencies" object of its JSON output. If the command
    raises (for example because it exits with a non-zero status), the
    failure is reported and whatever output was received is still used.

    Args:
        sandbox: The sandbox to inspect.
        project_root: Directory of the npm project inside the sandbox.

    Returns:
        Mapping of package name to its installed version (None when no
        version is reported); empty when nothing could be determined.
    '''
    stdout_chunks: list[str] = []
    try:
        sandbox.commands.run(
            "npm list --depth=0 --json",
            cwd=project_root,
            timeout=30,
            on_stdout=stdout_chunks.append,
        )
    except Exception as e:
        # Best-effort: an empty or partial result only means that packages
        # get (re)installed by the caller.
        print(f"npm list did not complete cleanly: {e}")

    package_data = _parse_npm_list_output(stdout_chunks)
    dependencies = package_data.get("dependencies") or {}
    return {
        dep_name: details.get("version") if isinstance(details, dict) else None
        for dep_name, details in dependencies.items()
    }
159
+
160
+
161
def install_npm_dependencies(sandbox: Sandbox, dependencies: list[str], project_root: str = '~') -> list[str]:
    '''
    Install npm dependencies in the sandbox.

    Packages that are already installed in `project_root` (according to
    get_installed_npm_packages / is_npm_package_installed) are skipped.
    Every remaining package is installed with its own `npm install`
    command (3 minute timeout), so one failing package does not prevent
    the others from being installed.

    Args:
        sandbox: The sandbox to install the packages in.
        dependencies: Package specs to install, e.g. "react" or
            "@types/node@18.0.0".
        project_root: Directory of the npm project inside the sandbox.

    Returns:
        One error message per dependency whose install command raised;
        empty when everything succeeded or nothing had to be installed.
    '''
    # Local import keeps this block self-contained.
    import shlex

    install_errors: list[str] = []
    if not dependencies:
        return install_errors

    try:
        installed_packages: dict[str, str | None] = get_installed_npm_packages(
            sandbox, project_root)
    except Exception as e:
        # Not knowing what is installed must not abort the installation:
        # assume nothing is installed and install everything requested.
        print(f"Could not list installed npm packages: {e}")
        installed_packages = {}

    dependencies_to_install = [
        dependency
        for dependency in dependencies
        if not is_npm_package_installed(dependency, installed_packages)
    ]

    for dependency in dependencies_to_install:
        try:
            sandbox.commands.run(
                # Quote the spec: it is passed through a shell, where
                # characters such as ">", "<" or ";" would otherwise be
                # interpreted (e.g. "pkg@>=1" becomes an output redirect).
                f"npm install {shlex.quote(dependency)} --prefer-offline --no-audit --no-fund --legacy-peer-deps",
                cwd=project_root,
                timeout=60 * 3,
                on_stdout=lambda message: print(message),
                on_stderr=lambda message: print(message),
            )
        except Exception as e:
            install_errors.append(f"Error during installing npm package {dependency}:" + str(e))

    return install_errors
191
+
192
+
193
def run_background_command_with_timeout(
    sandbox: Sandbox,
    command: str,
    cwd: str = "~",
    timeout: int = 5,
) -> str:
    """
    Launch a background command and briefly watch it for startup errors.

    The command is started in the background with an overall timeout of
    three minutes. A daemon thread waits for it to finish; this function
    waits at most `timeout` seconds for that thread to report back.

    Args:
        sandbox: The sandbox instance
        command: The command to run
        cwd: The working directory for the command
        timeout: How long to wait for startup errors (in seconds)

    Returns:
        str: The stderr text reported by the waiting thread, or the stderr
        collected so far when the thread has not reported within `timeout`
        seconds.
    """
    handle = sandbox.commands.run(
        command,
        timeout=60 * 3,  # Overall timeout for the command
        cwd=cwd,
        background=True,
    )

    stderr = ""
    outcome: queue.Queue = queue.Queue()

    def wait_for_command() -> None:
        nonlocal stderr
        try:
            finished = handle.wait()
        except ReadTimeout:
            # Nothing new to add; report what has been collected.
            pass
        except CommandExitException as e:
            stderr += "".join(e.stderr)
        except TimeoutException:
            # Do not report at all; the caller's queue wait simply expires.
            return
        else:
            if finished.stderr:
                stderr += finished.stderr
        outcome.put(stderr)

    # Daemon thread so a still-running command never blocks program exit.
    waiter = threading.Thread(target=wait_for_command, daemon=True)
    waiter.start()

    try:
        return outcome.get(timeout=timeout)
    except queue.Empty:
        return stderr
245
+
246
+
247
def get_sandbox_app_url(
    sandbox: Sandbox,
    app_type: Literal["react", "vue", "html", "pygame"]
) -> str:
    '''
    Build the URL of the container wrapper page for an app in the sandbox.

    Args:
        sandbox: The sandbox serving the app.
        app_type: Kind of app, passed on as the `app` query parameter.

    Returns:
        An https URL on the sandbox host for port SANDBOX_NGINX_PORT.
    '''
    host = sandbox.get_host(port=SANDBOX_NGINX_PORT)
    return f"https://{host}/container/?app={app_type}"
sandbox/sandbox_state.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Chatbot state.
3
+ '''
4
+
5
+ from sandbox.code_analyzer import SandboxEnvironment
6
+ from typing import TypedDict
7
+
8
class ChatbotSandboxState(TypedDict):
    '''
    Per-chatbot sandbox state kept in gr.state.
    '''
    enable_sandbox: bool
    '''
    True when the code sandbox is enabled.
    '''
    sandbox_instruction: str | None
    '''
    Sandbox instruction text to display.
    '''

    enabled_round: int
    '''
    Chat round after which the sandbox is enabled.
    '''
    sandbox_run_round: int
    '''
    Number of rounds the sandbox has been run within the session.
    '''
    edit_round: int
    '''
    Number of rounds the code has been edited.
    Starts at 0 and goes up by one on every edit.
    Refreshed when a generated code is run.
    '''

    sandbox_environment: SandboxEnvironment | None
    '''
    Sandbox environment used to run the code.
    '''
    auto_selected_sandbox_environment: SandboxEnvironment | None
    '''
    Sandbox environment that was selected automatically.
    '''
    code_to_execute: str | None
    '''
    Code to execute in the sandbox.
    '''
    code_language: str | None
    '''
    Language of the code to execute in the sandbox.
    '''
    code_dependencies: tuple[list[str], list[str]]
    '''
    Dependencies of the code, as a (python, npm) pair of lists.
    '''

    sandbox_output: str | None
    '''
    Output of the sandbox.
    '''
    sandbox_error: str | None
    '''
    Error of the sandbox.
    '''

    sandbox_id: str | None
    '''
    Id of the remote e2b sandbox. None if not run yet.
    '''
    chat_session_id: str | None
    '''
    Chat session id, unique per chat.
    Both battle models share the same chat session id.
    '''
    conv_id: str | None
    '''
    Conv id, unique per chat per model.
    '''

    btn_list_length: int
    '''
    Number of Gradio user interface buttons.
    '''
sandbox/sandbox_telemetry.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Module for logging the sandbox interactions and state.
3
+ '''
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ import json
6
+ import os
7
+ from typing import Any, List, Literal, Optional, TypedDict
8
+ import datetime
9
+
10
+ from chat_state import LOG_DIR
11
+ from sandbox.sandbox_state import ChatbotSandboxState
12
+
13
+ from azure.storage.blob import BlobServiceClient
14
+
15
+ from sandbox.constants import AZURE_BLOB_STORAGE_CONNECTION_STRING, AZURE_BLOB_STORAGE_CONTAINER_NAME
16
+
17
+
18
class SandboxLog(TypedDict):
    '''
    Schema of one stored sandbox log record.
    '''
    # Snapshot of the chatbot's sandbox state.
    sandbox_state: ChatbotSandboxState
    # Recorded user interactions, or None when there are none to store.
    user_interaction_records: Optional[List[Any]]
24
+
25
+
26
def upload_data_to_azure_storage(
    data: bytes,
    blob_name: str,
    write_mode: Literal['overwrite', 'append'],
    connection_string: str | None = AZURE_BLOB_STORAGE_CONNECTION_STRING,
    container_name: str = AZURE_BLOB_STORAGE_CONTAINER_NAME,
) -> None:
    '''
    Upload data to Azure Blob Storage.

    Args:
        data: Raw bytes to store.
        blob_name: Name of the target blob inside the container.
        write_mode: 'overwrite' replaces the blob with ``data``; 'append'
            creates the blob as an append blob if it does not exist yet and
            otherwise appends ``data`` as a new block.
        connection_string: Azure storage connection string.
        container_name: Name of the container holding the blob.

    Raises:
        ValueError: If ``connection_string`` is empty or ``write_mode`` is
            neither 'overwrite' nor 'append'.
    '''
    if not connection_string:
        raise ValueError("AZURE_STORAGE_CONNECTION_STRING is not set")
    # Reject a bad mode up front so no client is created for a call that
    # cannot succeed.
    if write_mode not in ("overwrite", "append"):
        raise ValueError("Unsupported write_mode. Use 'overwrite' or 'append'.")

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    if write_mode == "overwrite":
        container_client.upload_blob(
            name=blob_name,
            data=data,
            overwrite=True
        )
        return

    # write_mode == "append"
    blob_client = container_client.get_blob_client(blob=blob_name)
    # NOTE(review): exists() followed by upload_blob() is not atomic; two
    # concurrent first writes to the same blob may race - confirm whether
    # callers can hit this.
    if not blob_client.exists():
        blob_client.upload_blob(data, blob_type="AppendBlob")
    else:
        blob_client.append_block(data)
56
+
57
+
58
def get_sandbox_log_blob_name(filename: str) -> str:
    '''
    Return the blob name for a sandbox log file: <YYYY_MM_DD>/sandbox_logs/<filename>,
    where the date is the current local date.
    '''
    today = datetime.datetime.now()
    return f"{today:%Y_%m_%d}/sandbox_logs/{filename}"
62
+
63
def get_conv_log_filepath(
    date: datetime.date,
    chat_mode: Literal['battle_anony', 'battle_named', 'direct'],
    chat_session_id: str,
) -> str:
    '''
    Build the relative filepath of a conversation log.

    The returned path starts at the date directory:
    softwarearenlog/
    └── YEAR_MONTH_DAY/
        ├── conv_logs/
        │   ├── battle_anony/
        │   │   └── CHATSESSIONID.json
        │   ├── battle_named/
        │   │   └── CHATSESSIONID.json
        │   └── direct/
        │       └── CHATSESSIONID.json
    '''
    path_parts = [
        date.strftime('%Y_%m_%d'),
        'conv_logs',
        chat_mode,
        f"{chat_session_id}.json",
    ]
    return os.path.join(*path_parts)
90
+
91
+
92
def get_sandbox_log_filepath(
    date: datetime.date,
    chat_mode: Literal['battle_anony', 'battle_named', 'direct'],
    chat_session_id: str,
) -> str:
    '''
    Build the relative filepath of a sandbox log.

    The returned path has the form:
    YEAR_MONTH_DAY/
    └── sandbox_logs/
        └── CHAT_MODE/
            └── CHATSESSIONID.json
    '''
    path_parts = [
        date.strftime('%Y_%m_%d'),
        'sandbox_logs',
        chat_mode,
        f"{chat_session_id}.json",
    ]
    return os.path.join(*path_parts)
120
+
121
+
122
def get_conv_log_blob_name(filename: str) -> str:
    '''
    Return the blob name for a conversation log file: <YYYY_MM_DD>/conv_logs/<filename>,
    where the date is the current local date.
    '''
    today = datetime.datetime.now()
    return f"{today:%Y_%m_%d}/conv_logs/{filename}"
126
+
127
# Shared worker pool for fire-and-forget log uploads.
_executor = ThreadPoolExecutor(max_workers=20)


def save_conv_log_to_azure_storage(
    filename: str,
    log_data: dict[str, Any],
    write_mode: Literal['overwrite', 'append'] = 'append',
    use_async: bool = True
) -> None:
    '''
    Serialize a conversation log as one JSON line and upload it to Azure Blob Storage.

    Does nothing when AZURE_BLOB_STORAGE_CONNECTION_STRING is not set. This is a
    best-effort operation: failures are printed and never raised to the caller.

    Args:
        filename: Log file name; the blob name is derived from it via
            ``get_conv_log_blob_name``.
        log_data: The log record. Values that are not JSON-serializable are
            converted with ``str``.
        write_mode: Passed through to ``upload_data_to_azure_storage``.
        use_async: When True the upload is submitted to the module's thread
            pool; otherwise it runs in the calling thread.
    '''
    if not AZURE_BLOB_STORAGE_CONNECTION_STRING:
        return

    try:
        blob_name = get_conv_log_blob_name(filename)
        log_json: str = json.dumps(
            obj=log_data,
            default=str,
        )
        payload = str.encode(log_json + "\n")

        def _run_upload() -> None:
            # The handler lives inside the task: when submitted to the pool,
            # an exception would otherwise be stored on a Future nobody
            # inspects and the failure would go unreported.
            try:
                upload_data_to_azure_storage(
                    payload,
                    blob_name,
                    write_mode
                )
            except Exception as e:
                print(f"Error uploading conv log to Azure Blob Storage: {e}")

        if use_async:
            _executor.submit(_run_upload)
        else:
            _run_upload()
    except Exception as e:
        # Covers serialization and task-submission failures.
        print(f"Error uploading conv log to Azure Blob Storage: {e}")
155
+
156
+
157
def get_sandbox_log_filename(sandbox_state: ChatbotSandboxState) -> str:
    '''
    Build the sandbox log file name from the state's conv id, enabled round
    and sandbox run round: sandbox-logs-<conv_id>-<enabled_round>-<sandbox_run_round>.json
    '''
    conv_id = sandbox_state['conv_id']  # chat conv id
    chat_round = sandbox_state['enabled_round']  # current chat round
    run_round = sandbox_state['sandbox_run_round']  # current sandbox round
    return f"sandbox-logs-{conv_id}-{chat_round}-{run_round}.json"
168
+
169
+
170
def upsert_sandbox_log(filename: str, data: str) -> None:
    '''
    Write a sandbox log to LOG_DIR/<YYYY_MM_DD>/sandbox_logs/<filename>.

    The date directory is the current local date. Missing directories are
    created, and an existing file with the same name is overwritten.

    Args:
        filename: Name of the log file.
        data: Text content to write.
    '''
    filepath = os.path.join(
        LOG_DIR,
        datetime.datetime.now().strftime('%Y_%m_%d'),  # current date as 2025_02_02
        'sandbox_logs',
        filename
    )
    # create directory if not exists
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Explicit UTF-8: the log may contain non-ASCII text, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(filepath, "w", encoding="utf-8") as fout:
        fout.write(data)
181
+
182
+
183
def create_sandbox_log(sandbox_state: ChatbotSandboxState, user_interaction_records: list[Any] | None) -> SandboxLog:
    '''
    Bundle the sandbox state and the user interaction records into a SandboxLog.
    '''
    return SandboxLog(
        sandbox_state=sandbox_state,
        user_interaction_records=user_interaction_records,
    )
188
+
189
+
190
def log_sandbox_telemetry_gradio_fn(
    sandbox_state: ChatbotSandboxState,
    sandbox_ui_value: tuple[str, bool, list[Any]] | None
) -> None:
    '''
    Write the current sandbox state and user interaction records to the local sandbox log.

    Nothing is written when there is no state or the state has no sandbox id.
    The interaction records are taken from index 2 of ``sandbox_ui_value`` when
    that value is present.
    '''
    if sandbox_state is None:
        return

    remote_id = sandbox_state['sandbox_id']
    records = None
    if sandbox_ui_value:
        records = sandbox_ui_value[2]
    if remote_id is None:
        return

    entry = create_sandbox_log(sandbox_state, records)
    serialized = json.dumps(entry, indent=2, default=str, ensure_ascii=False)
    upsert_sandbox_log(
        filename=get_sandbox_log_filename(sandbox_state),
        data=serialized,
    )
210
+
211
+ # # Upload to Azure Blob Storage
212
+ # if AZURE_BLOB_STORAGE_CONNECTION_STRING:
213
+ # try:
214
+ # blob_name = get_sandbox_log_blob_name(filename)
215
+ # upload_data_to_azure_storage(
216
+ # data=str.encode(log_data),
217
+ # blob_name=blob_name,
218
+ # write_mode='overwrite'
219
+ # )
220
+ # except Exception as e:
221
+ # print(f"Error uploading sandbox log to Azure Blob Storage: {e}")