xhluca
committed on
Commit
·
b915816
1
Parent(s):
972a7b5
fix issues
Browse files
- demo.py +14 -3
- requirements.txt +2 -1
demo.py
CHANGED
|
@@ -399,8 +399,19 @@ def get_judgment_path(base_judgments_dir, benchmark, agent, judge, task_id):
|
|
| 399 |
return judgment_path
|
| 400 |
|
| 401 |
|
| 402 |
-
def list_benchmarks():
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
|
| 406 |
def list_agents(base_traj_dir, benchmark):
|
|
@@ -477,7 +488,7 @@ with gr.Blocks(title="AgentRewardBench Demo") as demo, gr.Row():
|
|
| 477 |
with gr.Column(scale=4):
|
| 478 |
benchmark_default = "WebArena"
|
| 479 |
benchmark_dd = gr.Dropdown(
|
| 480 |
-
label="Benchmark", choices=list_benchmarks(), value=benchmark_default
|
| 481 |
)
|
| 482 |
|
| 483 |
agents = list_agents(base_traj_dir, benchmark_default)
|
|
|
|
| 399 |
return judgment_path
|
| 400 |
|
| 401 |
|
| 402 |
+
def list_benchmarks(base_traj_dir):
    """Return the sorted benchmark names that have an entry under *base_traj_dir*.

    Every value of the module-level ``benchmarks_dict`` is a candidate. A
    candidate is kept only when the path built from ``base_traj_dir`` and
    ``benchmarks_inverse[name]`` exists once resolved.

    Args:
        base_traj_dir: Root directory that is searched for one entry per
            benchmark (presumably the trajectories folder; confirm against
            the caller).

    Returns:
        A new list of the kept benchmark names, in ascending order.
    """
    available = [
        name
        for name in benchmarks_dict.values()
        # Keep only benchmarks whose entry is actually present on disk.
        if Path(base_traj_dir, benchmarks_inverse[name]).resolve().exists()
    ]
    return sorted(available)
|
| 414 |
+
|
| 415 |
|
| 416 |
|
| 417 |
def list_agents(base_traj_dir, benchmark):
|
|
|
|
| 488 |
with gr.Column(scale=4):
|
| 489 |
benchmark_default = "WebArena"
|
| 490 |
benchmark_dd = gr.Dropdown(
|
| 491 |
+
label="Benchmark", choices=list_benchmarks(base_traj_dir), value=benchmark_default
|
| 492 |
)
|
| 493 |
|
| 494 |
agents = list_agents(base_traj_dir, benchmark_default)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
tqdm
|
| 2 |
orjson
|
| 3 |
Pillow
|
| 4 |
-
pyparsing
|
|
|
|
|
|
| 1 |
tqdm
|
| 2 |
orjson
|
| 3 |
Pillow
|
| 4 |
+
pyparsing
|
| 5 |
+
gradio
|