Spaces:
Runtime error
Runtime error
jjyang77
committed on
Commit
·
bb636ca
1
Parent(s):
25db7e9
add local_evaluator and some cleanup
Browse files- Dockerfile +2 -0
- README.md +8 -2
- local_evaluator.py +94 -0
- prod.sh +1 -1
Dockerfile
CHANGED
|
@@ -20,6 +20,8 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser
|
|
| 20 |
|
| 21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
| 22 |
|
|
|
|
|
|
|
| 23 |
COPY . .
|
| 24 |
|
| 25 |
WORKDIR /
|
|
|
|
| 20 |
|
| 21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
| 22 |
|
| 23 |
+
RUN python -m nltk.downloader punkt
|
| 24 |
+
|
| 25 |
COPY . .
|
| 26 |
|
| 27 |
WORKDIR /
|
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: OE Eval Bcb Evaluator
|
| 3 |
emoji: 🐢
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: pink
|
|
@@ -7,4 +7,10 @@ sdk: docker
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OE Eval Bcb Evaluator lite
|
| 3 |
emoji: 🐢
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: pink
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# For local testing
|
| 11 |
+
Build the docker image for the BCB eval env.
|
| 12 |
+
There is a `scikit-image` wheel that takes a long time to build ...
|
| 13 |
+
|
| 14 |
+
Run the container with a data volume mounted that contains your generated code solutions, and map a host port to the container's port 7860.
|
| 15 |
+
|
| 16 |
+
|
local_evaluator.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import asyncio
|
| 3 |
+
import argparse
|
| 4 |
+
import httpx
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
# Command-line interface for the local evaluator script.
_parser = argparse.ArgumentParser()

# Path of the JSONL file holding the generated solutions. It is required:
# without it the script would later try to open ``None`` and fail with an
# unhelpful TypeError instead of a usage message.
_parser.add_argument(
    "--filename",
    type=str,
    required=True,
    help="filename like data/codgen-...jsonl",
)
# Optional execution endpoint. When omitted, the value stays ``None`` and the
# client picks its own default URL.
_parser.add_argument(
    "--remoteapi",
    type=str,
    help="remote execution API if not running local eval",
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped rather than raising ``json.JSONDecodeError``.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        # json.loads tolerates surrounding whitespace, so only the emptiness
        # check needs the stripped form.
        return [json.loads(line) for line in file if line.strip()]
|
| 16 |
+
|
| 17 |
+
def save_jsonl(filename, data):
    """Write each item of *data* to *filename* as one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        filename: Destination path.
        data: Iterable of JSON-serialisable objects.

    Returns:
        The *filename* that was written, unchanged.
    """
    with open(filename, "w") as out:
        # One serialised record followed by a newline, in input order.
        out.writelines(json.dumps(record) + "\n" for record in data)
    return filename
|
| 23 |
+
|
| 24 |
+
async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> "tuple[Optional[List[dict]], Optional[float]]":
    """Send samples to the OE-Eval BigCodeBench execution API and score them.

    ``samples_data`` is POSTed as the JSON body; the remaining keyword
    arguments (except ``execute_api``) are forwarded unchanged as query
    parameters.

    The response JSON is expected to contain an ``"eval"`` mapping whose
    values are lists of per-sample dicts with ``"solution"``, ``"status"``
    and ``"details"`` keys. Each of those dicts is renamed in place:

    * ``solution`` -> ``tested_completion``
    * ``status``   -> ``passed`` (``True`` only when the status is ``"pass"``)
    * ``details``  -> ``exec_result``

    Args:
        samples_data: Samples to evaluate, sent as the request body.
        calibrate, parallel, min_time_limit, max_as_limit, max_data_limit,
            max_stack_limit, no_gt: Forwarded as query parameters.
        execute_api: Endpoint URL; defaults to
            ``http://localhost:9000/evaluate/`` when ``None``.

    Returns:
        ``(check_results, pass_at_1)`` where ``check_results`` is the flat
        list of renamed per-sample dicts and ``pass_at_1`` is the fraction of
        them that passed, or ``(None, None)`` if the response held no samples.

    Raises:
        httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
        httpx.HTTPError: On transport failures or when the timeout elapses.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    params = {
        "calibrate": calibrate,
        "parallel": parallel,
        "min_time_limit": min_time_limit,
        "max_as_limit": max_as_limit,
        "max_data_limit": max_data_limit,
        "max_stack_limit": max_stack_limit,
        "no_gt": no_gt,
    }
    # Even for the Full BCB dataset, total execution time should not exceed
    # 5-10 min unless many instances of generated codes are particularly
    # mal-formed or slow. (per instance exec timeout is 30 sec)
    total_timeout = 900

    async with httpx.AsyncClient() as client:
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Fail loudly on an error status instead of tripping later on a
        # missing "eval" key or an undecodable error page.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)

    if not check_results:
        # Nothing was evaluated, so a pass rate is undefined.
        return None, None

    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1
|
| 72 |
+
|
| 73 |
+
def evaluate(sample_file, execute_api: Optional[str] = None):
    """Evaluate the samples stored in *sample_file* and report pass@1.

    The file is loaded with ``load_jsonl`` and handed to
    ``call_oe_eval_bcb_client`` with calibration on, ``parallel=-1`` and a
    minimum time limit of 30. The resulting pass@1 value is printed.

    Args:
        sample_file: Path of the JSONL file with the samples to evaluate.
        execute_api: Endpoint URL passed through to the client; ``None``
            lets the client use its default.

    Returns:
        The per-sample results returned by the client (its first tuple
        element).
    """
    samples = load_jsonl(sample_file)
    request = call_oe_eval_bcb_client(
        samples_data=samples,
        calibrate=True,
        parallel=-1,
        min_time_limit=30,
        execute_api=execute_api,
    )
    # Drive the coroutine to completion on a fresh event loop.
    results, pass_at_1 = asyncio.run(request)
    print("pass@1:", pass_at_1)
    return results
|
| 86 |
+
|
| 87 |
+
def main():
    """Command-line entry point: evaluate a samples file and save the results.

    Parses ``--filename`` and ``--remoteapi`` with the module-level
    ``_parser``, runs ``evaluate`` on them and writes the per-sample results
    to ``data/eval_results.jsonl``. When the evaluation yields no results
    nothing is written.
    """
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    if results is None:
        # evaluate() hands back None when the service reported no samples;
        # passing that on to save_jsonl would raise TypeError while iterating.
        print("No evaluation results returned; nothing written.")
        return
    save_jsonl("data/eval_results.jsonl", results)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
prod.sh
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
exec \
|
| 3 |
gunicorn \
|
| 4 |
-k uvicorn.workers.UvicornWorker \
|
| 5 |
-
--workers
|
| 6 |
--timeout 0 \
|
| 7 |
--bind 0.0.0.0:7860 \
|
| 8 |
--enable-stdio-inheritance \
|
|
|
|
| 2 |
exec \
|
| 3 |
gunicorn \
|
| 4 |
-k uvicorn.workers.UvicornWorker \
|
| 5 |
+
--workers 8 \
|
| 6 |
--timeout 0 \
|
| 7 |
--bind 0.0.0.0:7860 \
|
| 8 |
--enable-stdio-inheritance \
|