Spaces:

Backup-bdg
/

OpenHands

Build error

App Files Files Community

OpenHands / openhands /runtime /browser /browser_env.py

Backup-bdg

Upload 964 files

51ff9e5 verified 5 months ago

raw

history blame

9.81 kB

	import atexit
	import json
	import multiprocessing
	import time
	import uuid

	import browsergym.core # noqa F401 (we register the openended task as a gym environment)
	import gymnasium as gym
	import html2text
	import tenacity
	from browsergym.utils.obs import flatten_dom_to_str, overlay_som

	from openhands.core.exceptions import BrowserInitException
	from openhands.core.logger import openhands_logger as logger
	from openhands.runtime.browser.base64 import image_to_png_base64_url
	from openhands.utils.shutdown_listener import should_continue, should_exit
	from openhands.utils.tenacity_stop import stop_if_should_exit

	# Special action strings that BrowserEnv.browser_process answers directly
	# (with the stored goal / the collected rewards) instead of passing them to
	# env.step().
	BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
	BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'


	class BrowserEnv:
	    """Run a BrowserGym gym environment in a separate process.

	    The parent process (agent side) and the child process (browser side)
	    exchange ``(request_id, payload)`` tuples over a ``multiprocessing`` pipe.
	    The reserved request ids ``'IS_ALIVE'`` and ``'SHUTDOWN'`` are control
	    messages; any other id is a browsing action whose reply carries the same
	    id.
	    """

	    def __init__(self, browsergym_eval_env: str | None = None):
	        """Start the browser process and register ``close`` to run at exit.

	        Args:
	            browsergym_eval_env: Name of a BrowserGym evaluation environment.
	                When given (non-empty), the instance runs in evaluation mode;
	                otherwise the open-ended browsing task is used.
	        """
	        self.html_text_converter = self.get_html_text_converter()
	        self.eval_dir = ''

	        # EVAL only: browsergym_eval_env must be provided for evaluation
	        self.browsergym_eval_env = browsergym_eval_env
	        self.eval_mode = bool(browsergym_eval_env)

	        # Initialize browser environment process
	        multiprocessing.set_start_method('spawn', force=True)
	        self.browser_side, self.agent_side = multiprocessing.Pipe()

	        self.init_browser()
	        atexit.register(self.close)

	    def get_html_text_converter(self) -> html2text.HTML2Text:
	        """Build the HTML-to-text converter used for page observations."""
	        html_text_converter = html2text.HTML2Text()
	        # keep links, drop images
	        html_text_converter.ignore_links = False
	        html_text_converter.ignore_images = True
	        # use alt text for images
	        html_text_converter.images_to_alt = True
	        # disable auto text wrapping
	        html_text_converter.body_width = 0
	        return html_text_converter

	    @tenacity.retry(
	        wait=tenacity.wait_fixed(1),
	        stop=tenacity.stop_after_attempt(5) | stop_if_should_exit(),
	        retry=tenacity.retry_if_exception_type(BrowserInitException),
	    )
	    def init_browser(self) -> None:
	        """Spawn the browser process and wait until it answers ``IS_ALIVE``.

	        Retried (1 s apart, up to 5 attempts or until shutdown is requested)
	        whenever ``BrowserInitException`` is raised.

	        Raises:
	            BrowserInitException: The process did not report alive in time.
	            Exception: Any error raised while creating/starting the process
	                is logged and re-raised unchanged.
	        """
	        logger.debug('Starting browser env...')

	        # A failed attempt ends in close(), which closes both pipe ends when
	        # the process was still running. Reusing closed connections would make
	        # the retry fail immediately, so start the attempt with a fresh pipe.
	        if self.agent_side.closed or self.browser_side.closed:
	            self.agent_side.close()
	            self.browser_side.close()
	            self.browser_side, self.agent_side = multiprocessing.Pipe()

	        try:
	            self.process = multiprocessing.Process(target=self.browser_process)
	            self.process.start()
	        except Exception as e:
	            logger.error(f'Failed to start browser process: {e}')
	            raise

	        if not self.check_alive(timeout=200):
	            self.close()
	            raise BrowserInitException('Failed to start browser environment.')

	    def _make_env(self):
	        """Create the gym environment for the current mode.

	        In evaluation mode the matching BrowserGym task package is imported
	        (which registers its tasks as gym environments) before ``gym.make``.

	        Raises:
	            ValueError: The evaluation environment name is not one of the
	                supported families (visualwebarena, webarena, miniwob).
	        """
	        if not self.eval_mode:
	            return gym.make(
	                'browsergym/openended',
	                task_kwargs={'start_url': 'about:blank', 'goal': 'PLACEHOLDER_GOAL'},
	                wait_for_user_message=False,
	                headless=True,
	                disable_env_checker=True,
	                tags_to_mark='all',
	            )

	        assert self.browsergym_eval_env is not None
	        logger.info('Initializing browser env for web browsing evaluation.')
	        if not self.browsergym_eval_env.startswith('browsergym/'):
	            self.browsergym_eval_env = 'browsergym/' + self.browsergym_eval_env
	        # 'visualwebarena' must be tested first: it also contains 'webarena'.
	        if 'visualwebarena' in self.browsergym_eval_env:
	            import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
	            import nltk

	            nltk.download('punkt_tab')
	        elif 'webarena' in self.browsergym_eval_env:
	            import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
	        elif 'miniwob' in self.browsergym_eval_env:
	            import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
	        else:
	            raise ValueError(
	                f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
	            )
	        return gym.make(self.browsergym_eval_env, tags_to_mark='all', timeout=100000)

	    def _record_eval_goal(self, obs: dict) -> None:
	        """Store the task goal text and any goal image URLs from ``obs``."""
	        self.eval_goal = obs['goal']
	        if 'goal_object' not in obs:
	            return
	        goal_object = obs['goal_object']
	        if len(goal_object) > 0:
	            self.eval_goal = goal_object[0]['text']
	        for message in goal_object:
	            if message['type'] != 'image_url':
	                continue
	            image_src = message['image_url']
	            # The image entry is either the URL itself or a dict holding it.
	            if isinstance(image_src, dict):
	                image_src = image_src['url']
	            self.goal_image_urls.append(image_src)

	    def _serialize_observation(self, obs: dict) -> dict:
	        """Add page text to ``obs`` and make it sendable over the pipe.

	        ``obs`` is modified in place and also returned.
	        """
	        # add text content of the page
	        html_str = flatten_dom_to_str(obs['dom_object'])
	        obs['text_content'] = self.html_text_converter.handle(html_str)
	        # make observation serializable; set_of_marks must be rendered from
	        # the raw screenshot before the screenshot itself is replaced.
	        obs['set_of_marks'] = image_to_png_base64_url(
	            overlay_som(obs['screenshot'], obs.get('extra_element_properties', {})),
	            add_data_prefix=True,
	        )
	        obs['screenshot'] = image_to_png_base64_url(
	            obs['screenshot'], add_data_prefix=True
	        )
	        # presumably numpy scalars/arrays; .item() yields plain Python values
	        obs['active_page_index'] = obs['active_page_index'].item()
	        obs['elapsed_time'] = obs['elapsed_time'].item()
	        return obs

	    def browser_process(self) -> None:
	        """Entry point of the child process: serve requests until shutdown.

	        Creates and resets the environment, then polls the browser side of the
	        pipe. Control messages and the two evaluation pseudo-actions are
	        answered directly; every other action is passed to ``env.step`` and
	        the processed observation is sent back under the request's id. The
	        environment is closed on every exit path of the serving loop.
	        """
	        env = self._make_env()
	        obs, _ = env.reset()

	        logger.info('Successfully called env.reset')
	        # EVAL ONLY: keep the goal and rewards in memory for evaluation
	        self.eval_goal = None
	        self.goal_image_urls = []
	        self.eval_rewards: list[float] = []
	        if self.eval_mode:
	            self._record_eval_goal(obs)
	            logger.debug(f'Browsing goal: {self.eval_goal}')
	        logger.info('Browser env started.')

	        try:
	            while should_continue():
	                if not self.browser_side.poll(timeout=0.01):
	                    continue
	                unique_request_id, action_data = self.browser_side.recv()

	                # shutdown the browser environment (closed in `finally`)
	                if unique_request_id == 'SHUTDOWN':
	                    logger.debug('SHUTDOWN recv, shutting down browser env...')
	                    return
	                if unique_request_id == 'IS_ALIVE':
	                    self.browser_side.send(('ALIVE', None))
	                    continue

	                action = action_data['action']

	                # EVAL ONLY: Get evaluation info
	                if action == BROWSER_EVAL_GET_GOAL_ACTION:
	                    self.browser_side.send(
	                        (
	                            unique_request_id,
	                            {
	                                'text_content': self.eval_goal,
	                                'image_content': self.goal_image_urls,
	                            },
	                        )
	                    )
	                    continue
	                if action == BROWSER_EVAL_GET_REWARDS_ACTION:
	                    self.browser_side.send(
	                        (
	                            unique_request_id,
	                            {'text_content': json.dumps(self.eval_rewards)},
	                        )
	                    )
	                    continue

	                obs, reward, _terminated, _truncated, _info = env.step(action)

	                # EVAL ONLY: Save the rewards for evaluation
	                if self.eval_mode:
	                    self.eval_rewards.append(reward)

	                self.browser_side.send(
	                    (unique_request_id, self._serialize_observation(obs))
	                )
	        except KeyboardInterrupt:
	            logger.debug('Browser env process interrupted by user.')
	        finally:
	            # Best effort: a failing close must not mask the original outcome.
	            try:
	                env.close()
	            except Exception as e:
	                logger.warning(f'Error while closing browser env: {e}')

	    def step(self, action_str: str, timeout: float = 100) -> dict:
	        """Execute an action in the browser environment and return the observation.

	        Args:
	            action_str: Action string forwarded to the browser process.
	            timeout: Maximum number of seconds to wait for the reply.

	        Returns:
	            The observation sent back for this request, as a dict.

	        Raises:
	            TimeoutError: No matching reply arrived within ``timeout`` seconds,
	                or shutdown was requested while waiting.
	        """
	        unique_request_id = str(uuid.uuid4())
	        self.agent_side.send((unique_request_id, {'action': action_str}))
	        start_time = time.time()
	        while True:
	            if should_exit() or time.time() - start_time > timeout:
	                raise TimeoutError('Browser environment took too long to respond.')
	            if self.agent_side.poll(timeout=0.01):
	                response_id, obs = self.agent_side.recv()
	                # Replies carrying a different id are discarded.
	                if response_id == unique_request_id:
	                    return dict(obs)

	    def check_alive(self, timeout: float = 60) -> bool:
	        """Ping the browser process and report whether it answered ``ALIVE``.

	        Args:
	            timeout: Seconds to wait for a reply.

	        Returns:
	            True if the first reply received is ``'ALIVE'``; False if nothing
	            arrives in time or the reply has a different id.
	        """
	        self.agent_side.send(('IS_ALIVE', None))
	        if self.agent_side.poll(timeout=timeout):
	            response_id, _ = self.agent_side.recv()
	            if response_id == 'ALIVE':
	                return True
	            logger.debug(f'Browser env is not alive. Response ID: {response_id}')
	        return False

	    def close(self) -> None:
	        """Stop the browser process, escalating from SHUTDOWN to terminate/kill.

	        Does nothing if the process is no longer running. Errors during
	        shutdown are logged, not raised.
	        """
	        if not self.process.is_alive():
	            return
	        try:
	            self.agent_side.send(('SHUTDOWN', None))
	            self.process.join(5)  # Wait for the process to terminate
	            if self.process.is_alive():
	                logger.error(
	                    'Browser process did not terminate, forcefully terminating...'
	                )
	                self.process.terminate()
	                self.process.join(5)  # Wait for the process to terminate
	                if self.process.is_alive():
	                    self.process.kill()
	                    self.process.join(5)  # Wait for the process to terminate
	            self.agent_side.close()
	            self.browser_side.close()
	        except Exception as e:
	            logger.error(f'Encountered an error when closing browser env: {e}')