import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx

_parser = argparse.ArgumentParser()
_parser.add_argument("--filename", type=str, help="filename like data/codegen-...jsonl")
_parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")


def load_jsonl(filename):
    """Read a JSONL file into a list of dicts, one per line."""
    with open(filename, "r") as file:
        return [json.loads(line.strip()) for line in file]


def save_jsonl(filename, data):
    """Write a list of dicts to a JSONL file, one JSON object per line."""
    with open(filename, "w") as file:
        for d in data:
            file.write(json.dumps(d))
            file.write("\n")
    return filename
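
# Note: each line of the input JSONL is expected to be one sample dict that the
# remote execution service understands. The exact schema is defined by the
# server; a record following the usual BigCodeBench convention (field names
# here are an assumption, not confirmed by this script) might look like:
#
#   {"task_id": "BigCodeBench/0", "solution": "import math\n\ndef task_func():\n    ..."}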
async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """
    Call the OE-Eval BigCodeBench remote code execution API and return the
    per-completion results together with the overall pass rate.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"
    async with httpx.AsyncClient() as client:
        params = {
            "calibrate": calibrate,
            "parallel": parallel,
            "min_time_limit": min_time_limit,
            "max_as_limit": max_as_limit,
            "max_data_limit": max_data_limit,
            "max_stack_limit": max_stack_limit,
            "no_gt": no_gt,
        }
        # Even for the full BCB dataset, total execution time should not exceed
        # 5-10 min unless many of the generated solutions are particularly
        # malformed or slow (the per-instance execution timeout is 30 sec).
        total_timeout = 900
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
    results = response.json()
    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            # Rename the API's fields to the schema used downstream.
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)
    if check_results:
        pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
        return check_results, pass_at_1
    return None, None
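
# A minimal sketch of calling the client directly from other code, assuming an
# execution service is already listening at the hypothetical endpoint below
# (host, port, and input path are illustrative):
#
#   samples = load_jsonl("data/samples.jsonl")
#   results, pass_at_1 = asyncio.run(
#       call_oe_eval_bcb_client(samples, execute_api="http://my-host:9000/evaluate/")
#   )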
def evaluate(sample_file, execute_api: Optional[str] = None):
    batched_code_test = load_jsonl(sample_file)
    results, pass_at_1 = asyncio.run(
        call_oe_eval_bcb_client(
            samples_data=batched_code_test,
            calibrate=True,
            parallel=-1,
            min_time_limit=30,
            execute_api=execute_api,
        )
    )
    # Fraction of tested completions that passed; with one completion per task
    # this is exactly pass@1.
    print("pass@1:", pass_at_1)
    return results
def main():
    args = _parser.parse_args()
    args_dict = vars(args)
    results = evaluate(args_dict["filename"], args_dict["remoteapi"])
    # Guard against the (None, None) case returned when no results came back.
    if results is not None:
        save_jsonl("data/eval_results.jsonl", results)


if __name__ == "__main__":
    main()
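
# Example invocation (the script name, input path, and endpoint are
# illustrative; substitute this file's actual name and your deployment):
#
#   python run_bcb_eval.py --filename data/eval_samples.jsonl --remoteapi http://localhost:9000/evaluate/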