Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # The MIT License | |
| # | |
| # Copyright (c) OpenAI (https://openai.com) | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software"), to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in | |
| # all copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| # THE SOFTWARE. | |
| import contextlib | |
| import faulthandler | |
| import tempfile | |
| import platform | |
| import itertools | |
| import io | |
| import os | |
| import sys | |
| import time | |
| import types | |
| import unittest | |
| import subprocess | |
| import signal | |
| import multiprocessing | |
| from multiprocessing import Value, Manager | |
| from typing import List, Tuple, Union | |
| import numpy as np | |
| TIMEOUT_LIMIT=120.0 # BCB default is 240.0 | |
| def swallow_subprocess_output(): | |
| """Context manager to swallow stdout and stderr for subprocesses.""" | |
| original_popen = subprocess.Popen | |
| original_run = subprocess.run | |
| def _popen_patch(*args, **kwargs): | |
| if 'capture_output' in kwargs and kwargs['capture_output']: | |
| # Avoid setting stdout or stderr if capture_output is True | |
| kwargs.pop('stdout', None) | |
| kwargs.pop('stderr', None) | |
| else: | |
| kwargs.setdefault('stdout', subprocess.PIPE) | |
| kwargs.setdefault('stderr', subprocess.PIPE) | |
| return original_popen(*args, **kwargs) | |
| def _run_patch(*args, **kwargs): | |
| if 'capture_output' in kwargs and kwargs['capture_output']: | |
| # Avoid setting stdout or stderr if capture_output is True | |
| kwargs.pop('stdout', None) | |
| kwargs.pop('stderr', None) | |
| else: | |
| kwargs.setdefault('stdout', subprocess.PIPE) | |
| kwargs.setdefault('stderr', subprocess.PIPE) | |
| return original_run(*args, **kwargs) | |
| subprocess.Popen = _popen_patch | |
| subprocess.run = _run_patch | |
| try: | |
| yield | |
| finally: | |
| subprocess.Popen = original_popen | |
| subprocess.run = original_run | |
| def swallow_io(): | |
| stream = WriteOnlyStringIO() | |
| with contextlib.redirect_stdout(stream): | |
| with contextlib.redirect_stderr(stream): | |
| with redirect_stdin(stream): | |
| with swallow_subprocess_output(): | |
| yield | |
| def time_limit(seconds: float): | |
| def signal_handler(signum, frame): | |
| raise TimeoutException("Timed out!") | |
| signal.setitimer(signal.ITIMER_REAL, seconds) | |
| signal.signal(signal.SIGALRM, signal_handler) | |
| try: | |
| yield | |
| finally: | |
| signal.setitimer(signal.ITIMER_REAL, 0) | |
| def create_tempdir(): | |
| with tempfile.TemporaryDirectory() as dirname: | |
| with chdir(dirname): | |
| yield dirname | |
| def chdir(root): | |
| if root == ".": | |
| yield | |
| return | |
| cwd = os.getcwd() | |
| os.chdir(root) | |
| try: | |
| yield | |
| except BaseException as exc: | |
| raise exc | |
| finally: | |
| os.chdir(cwd) | |
| def safe_environment(): | |
| # Save original functions | |
| original_kill = os.kill | |
| original_killpg = os.killpg | |
| original_system = os.system | |
| original_subprocess_call = subprocess.call | |
| original_subprocess_check_output = subprocess.check_output | |
| original_subprocess_run = subprocess.run | |
| original_subprocess_popen = subprocess.Popen | |
| original_os_popen = os.popen | |
| original_os_execv = os.execv | |
| original_os_execvp = os.execvp | |
| original_os_execvpe = os.execvpe | |
| current_pid = os.getpid() | |
| current_pgid = os.getpgid(current_pid) | |
| manager = multiprocessing.Manager() | |
| child_pids = manager.list() | |
| def safe_kill(pid, sig): | |
| try: | |
| pgid = os.getpgid(pid) | |
| if pid == current_pid or pid in child_pids: | |
| original_kill(pid, sig) | |
| else: | |
| print(f"Prevented attempt to kill PID {pid} with signal {sig}") | |
| except ProcessLookupError: | |
| pass | |
| def safe_killpg(pgid, sig): | |
| if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: | |
| original_killpg(pgid, sig) | |
| else: | |
| print(f"Prevented attempt to kill PGID {pgid} with signal {sig}") | |
| def safe_system(command): | |
| print(f"Intercepted system command: {command}") | |
| if 'kill' in command or 'killall' in command: | |
| return 0 # Simulate successful execution without doing anything | |
| return original_system(command) | |
| def safe_subprocess_call(command, *args, **kwargs): | |
| print(f"Intercepted subprocess call: {command}") | |
| if 'kill' in command or 'killall' in command: | |
| return 0 # Simulate successful execution without doing anything | |
| return original_subprocess_call(command, *args, **kwargs) | |
| def safe_subprocess_check_output(command, *args, **kwargs): | |
| print(f"Intercepted command: {command}") | |
| if 'ps' in command: | |
| return b"" # Simulate no processes found | |
| return original_subprocess_check_output(command, *args, **kwargs) | |
| def safe_subprocess_run(*args, **kwargs): | |
| print(f"Intercepted subprocess run command: {args}") | |
| if 'kill' in args[0] or 'killall' in args[0]: | |
| return subprocess.CompletedProcess(args, 0, b'', b'') # Simulate successful execution | |
| return original_subprocess_run(*args, **kwargs) | |
| class SafePopen(subprocess.Popen): | |
| def __init__(self, *args, **kwargs): | |
| print(f"Intercepted Popen command: {args}") | |
| kwargs['preexec_fn'] = os.setsid # Start the process in a new session | |
| super().__init__(*args, **kwargs) | |
| child_pids.append(self.pid) | |
| def communicate(self, *args, **kwargs): | |
| try: | |
| return super().communicate(*args, **kwargs) | |
| except subprocess.TimeoutExpired: | |
| print("Timeout expired, intercepted and returning None") | |
| return None, None | |
| def kill(self): | |
| print(f"Intercepted kill call for PID {self.pid}") | |
| safe_kill(self.pid, signal.SIGTERM) | |
| def terminate(self): | |
| print(f"Intercepted terminate call for PID {self.pid}") | |
| safe_kill(self.pid, signal.SIGTERM) | |
| def safe_os_popen(command): | |
| print(f"Intercepted os.popen command: {command}") | |
| if 'kill' in command or 'killall' in command: | |
| return os.popen('echo Intercepted') | |
| return original_os_popen(command) | |
| def safe_exec(*args, **kwargs): | |
| print(f"Intercepted exec command: {args}") | |
| # Override the risky functions with the safe versions | |
| os.kill = safe_kill | |
| os.killpg = safe_killpg | |
| os.system = safe_system | |
| subprocess.call = safe_subprocess_call | |
| subprocess.check_output = safe_subprocess_check_output | |
| subprocess.run = safe_subprocess_run | |
| subprocess.Popen = SafePopen | |
| os.popen = safe_os_popen | |
| os.execv = safe_exec | |
| os.execvp = safe_exec | |
| os.execvpe = safe_exec | |
| try: | |
| yield | |
| finally: | |
| for pid in child_pids: | |
| try: | |
| os.kill(pid, signal.SIGTERM) | |
| for _ in range(10): | |
| time.sleep(0.1) | |
| try: | |
| os.kill(pid, 0) | |
| except ProcessLookupError: | |
| break | |
| else: | |
| os.kill(pid, signal.SIGKILL) | |
| except ProcessLookupError: | |
| pass | |
| except Exception as e: | |
| print(f"Error handling process {pid}: {e}") | |
| os.kill = original_kill | |
| os.killpg = original_killpg | |
| os.system = original_system | |
| subprocess.call = original_subprocess_call | |
| subprocess.check_output = original_subprocess_check_output | |
| subprocess.run = original_subprocess_run | |
| subprocess.Popen = original_subprocess_popen | |
| os.popen = original_os_popen | |
| os.execv = original_os_execv | |
| os.execvp = original_os_execvp | |
| os.execvpe = original_os_execvpe | |
| class TimeoutException(Exception): | |
| pass | |
| class WriteOnlyStringIO(io.StringIO): | |
| """StringIO that throws an exception when it's read from""" | |
| def read(self, *args, **kwargs): | |
| raise IOError | |
| def readline(self, *args, **kwargs): | |
| raise IOError | |
| def readlines(self, *args, **kwargs): | |
| raise IOError | |
| def readable(self, *args, **kwargs): | |
| """Returns True if the IO object can be read.""" | |
| return False | |
| class redirect_stdin(contextlib._RedirectStream): # type: ignore | |
| _stream = "stdin" | |
| def reliability_guard(max_as_limit, max_data_limit, max_stack_limit): | |
| """ | |
| This disables various destructive functions and prevents the generated code | |
| from interfering with the test (e.g. fork bomb, killing other processes, | |
| removing filesystem files, etc.) | |
| WARNING | |
| This function is NOT a security sandbox. Untrusted code, including, model- | |
| generated code, should not be blindly executed outside of one. See the | |
| Codex paper for more information about OpenAI's code sandbox, and proceed | |
| with caution. | |
| """ | |
| import os | |
| import time | |
| from datetime import datetime | |
| os.environ['TZ'] = 'UTC' | |
| time.tzset() | |
| os.environ["OMP_NUM_THREADS"] = "1" | |
| os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" | |
| os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0" | |
| if max_as_limit and max_data_limit and max_stack_limit: | |
| import resource | |
| max_as_limit = max_as_limit * 1024 * 1024 | |
| max_data_limit = max_data_limit * 1024 * 1024 | |
| max_stack_limit = max_stack_limit * 1024 * 1024 | |
| resource.setrlimit( | |
| resource.RLIMIT_AS, (max_as_limit, max_as_limit) | |
| ) | |
| resource.setrlimit( | |
| resource.RLIMIT_DATA, (max_data_limit, max_data_limit) | |
| ) | |
| if not platform.uname().system == "Darwin": | |
| resource.setrlimit( | |
| resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit) | |
| ) | |
| faulthandler.disable() | |
| import builtins | |
| builtins.exit = None | |
| builtins.quit = None | |
| import matplotlib.pyplot as plt | |
| plt.close('all') | |
| # unbiased estimator from https://github.com/openai/human-eval | |
| def estimate_pass_at_k( | |
| num_samples: Union[int, List[int], np.ndarray], | |
| num_correct: Union[List[int], np.ndarray], | |
| k: int, | |
| ) -> np.ndarray: | |
| """ | |
| Estimates pass@k of each problem and returns them in an array. | |
| """ | |
| def estimator(n: int, c: int, k: int) -> float: | |
| """ | |
| Calculates 1 - comb(n - c, k) / comb(n, k). | |
| """ | |
| if n - c < k: | |
| return 1.0 | |
| return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) | |
| if isinstance(num_samples, int): | |
| num_samples_it = itertools.repeat(num_samples, len(num_correct)) | |
| else: | |
| assert len(num_samples) == len(num_correct) | |
| num_samples_it = iter(num_samples) | |
| return np.array( | |
| [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] | |
| ) | |
| PASS = "pass" | |
| FAIL = "fail" | |
| TIMEOUT = "timeout" | |
| _SUCCESS = 0 | |
| _FAILED = 1 | |
| _TIMEOUT = 2 | |
| _UNKNOWN = 3 | |
| _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} | |
| def is_floats(x) -> bool: | |
| # check if it is float; List[float]; Tuple[float] | |
| if isinstance(x, float): | |
| return True | |
| if isinstance(x, (list, tuple)): | |
| return all(isinstance(i, float) for i in x) | |
| if isinstance(x, np.ndarray): | |
| return x.dtype == np.float64 or x.dtype == np.float32 | |
| return False | |
| def unsafe_execute( | |
| entry_point: str, | |
| code: str, | |
| test_code: str, | |
| timeout: float, | |
| max_as_limit: float, | |
| max_data_limit: float, | |
| max_stack_limit: float, | |
| stat, # Value | |
| details, # Array | |
| ): | |
| with safe_environment(), create_tempdir(): | |
| # These system calls are needed when cleaning up tempdir. | |
| import os | |
| import shutil | |
| import builtins | |
| rmtree = shutil.rmtree | |
| rmdir = os.rmdir | |
| chdir = os.chdir | |
| # Disable functionalities that can make destructive changes to the test. | |
| reliability_guard(max_as_limit, max_data_limit, max_stack_limit) | |
| module_name = "__test__" | |
| new_module = types.ModuleType(module_name) | |
| # Set necessary attributes for the module | |
| new_module.__dict__.update({ | |
| '__builtins__': builtins, | |
| '__file__': f"{module_name}.py", | |
| '__package__': None, | |
| '__doc__': None, | |
| 'sys': sys, | |
| 'os': os, | |
| 'environ': os.environ, | |
| }) | |
| try: | |
| full_code = code + "\n" + test_code | |
| with swallow_io(): | |
| exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__) | |
| sys.modules[module_name] = new_module | |
| TestCases = getattr(new_module, 'TestCases') | |
| loader = unittest.TestLoader() | |
| suite = loader.loadTestsFromTestCase(TestCases) | |
| test_result = unittest.TestResult() | |
| start_time = time.time() | |
| with time_limit(timeout): | |
| suite.run(test_result) | |
| issues = test_result.failures + test_result.errors | |
| for test, trace in issues: | |
| details[test.id().split(".")[-1]] = trace | |
| stat.value = _SUCCESS | |
| except BaseException as e: | |
| details["ALL"] = str(e) | |
| stat.value = _FAILED | |
| # Needed for cleaning up. | |
| shutil.rmtree = rmtree | |
| os.rmdir = rmdir | |
| os.chdir = chdir | |
| def untrusted_check( | |
| code: str, | |
| test_code: str, | |
| entry_point: str, | |
| max_as_limit: float, | |
| max_data_limit: float, | |
| max_stack_limit: float, | |
| min_time_limit: float = 10, | |
| gt_time_limit: float = 60 | |
| ) -> Tuple[str, np.ndarray]: | |
| min_time_limit = max(min_time_limit, gt_time_limit) | |
| timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1 | |
| # shared memory objects | |
| stat = Value("i", _UNKNOWN) | |
| manager = Manager() | |
| details = manager.dict() | |
| p = multiprocessing.Process( | |
| target=unsafe_execute, | |
| args=( | |
| entry_point, | |
| code, | |
| test_code, | |
| timeout, | |
| max_as_limit, | |
| max_data_limit, | |
| max_stack_limit, | |
| stat, | |
| details, | |
| ), | |
| ) | |
| p.start() | |
| p.join(timeout=timeout+1) | |
| if p.is_alive(): | |
| p.terminate() | |
| time.sleep(0.1) | |
| if p.is_alive(): | |
| p.kill() | |
| time.sleep(0.1) | |
| stat = _mapping[stat.value] | |
| # convert details to a dict | |
| details = dict(details) | |
| if not stat: | |
| stat = TIMEOUT | |
| if stat == PASS: | |
| if details: | |
| stat = FAIL | |
| return stat, details | |
| def evaluate_files( | |
| files: List[str], | |
| inputs: List, | |
| entry_point: str, | |
| min_time_limit: float = 0.1, | |
| gt_time_limit_factor: float = 2.0, | |
| ) -> List[Tuple[str, List[bool]]]: | |
| ret = [] | |
| # sort files by the id in name (i.e., "../n.py") | |
| files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0])) | |
| for file in files: | |
| code = open(file, "r").read() | |
| stat, det = untrusted_check( | |
| code, | |
| inputs, | |
| entry_point, | |
| ) | |
| ret.append((stat, det.tolist())) | |
| return ret | |