# run_data_measurements.py -- command-line runner for the Data Measurements Tool.
| import argparse | |
| import json | |
| from dotenv import load_dotenv | |
| import plotly | |
| import shutil | |
| import smtplib | |
| import ssl | |
| import sys | |
| import textwrap | |
| from data_measurements import dataset_statistics | |
| from data_measurements.zipf import zipf | |
| from huggingface_hub import create_repo, Repository, hf_api | |
| from os import getenv | |
| from os.path import exists, join as pjoin | |
| from pathlib import Path | |
| import utils | |
| from utils import dataset_utils | |
# Module-level logger for this script, configured by the project's shared helper.
logs = utils.prepare_logging(__file__)
def load_or_prepare_widgets(ds_args, show_embeddings=False,
                            show_perplexities=False, use_cache=False):
    """Prepare every measurement backing the app's widgets.

    Args:
        ds_args: Keyword arguments forwarded to
            ``dataset_statistics.DatasetStatisticsCacheClass``.
        show_embeddings: When True, also compute the embeddings widget.
        show_perplexities: When True, also compute the text perplexities widget.
        use_cache: Whether previously cached results may be reused.
    """
    dstats = dataset_statistics.DatasetStatisticsCacheClass(
        **ds_args, use_cache=use_cache)
    # (enabled, preparation callable) pairs, in the same order the widgets
    # were originally prepared; the slow ones are gated on their flags.
    widget_steps = [
        (True, dstats.load_or_prepare_dset_peek),          # Header widget
        (True, dstats.load_or_prepare_general_stats),      # General stats widget
        (True, dstats.load_or_prepare_labels),             # Labels widget
        (True, dstats.load_or_prepare_text_lengths),       # Text lengths widget
        (show_embeddings, dstats.load_or_prepare_embeddings),           # Embeddings widget
        (show_perplexities, dstats.load_or_prepare_text_perplexities),  # Perplexities widget
        (True, dstats.load_or_prepare_text_duplicates),    # Text duplicates widget
        (True, dstats.load_or_prepare_npmi),               # nPMI widget
        (True, dstats.load_or_prepare_zipf),               # Zipf widget
    ]
    for enabled, prepare in widget_steps:
        if enabled:
            prepare()
def _log_result_files(fid_dict):
    # Helper: log a measurement's {name: filepath} result map; values may be
    # one level of nested dicts (as with the nPMI results).
    logs.info("If all went well, then results are in the following files:")
    for key, value in fid_dict.items():
        if isinstance(value, dict):
            logs.info("%s:", key)
            for key2, value2 in value.items():
                logs.info("\t%s: %s", key2, value2)
        else:
            logs.info("%s: %s", key, value)


def load_or_prepare(dataset_args, calculation=False, use_cache=False):
    """Compute one measurement (or all of them) and cache the results.

    Args:
        dataset_args: Keyword arguments for
            ``dataset_statistics.DatasetStatisticsCacheClass``.
        calculation: Which measurement to run: "general", "duplicates",
            "lengths", "labels", "npmi", "zipf", "embeddings", or
            "perplexities". Falsy runs everything except the slow
            embeddings and perplexities measurements.
        use_cache: Whether previously cached results may be reused.
    """
    # TODO: Catch error exceptions for each measurement, so that an error
    # for one measurement doesn't break the calculation of all of them.
    do_all = False
    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
                                                            use_cache=use_cache)
    # Tokenization and vocab are prerequisites shared by all measurements.
    logs.info("Tokenizing dataset.")
    dstats.load_or_prepare_tokenized_df()
    logs.info("Calculating vocab.")
    dstats.load_or_prepare_vocab()
    if not calculation:
        do_all = True
    if do_all or calculation == "general":
        logs.info("\n* Calculating general statistics.")
        dstats.load_or_prepare_general_stats()
        logs.info("Done!")
        logs.info("Basic text statistics now available at %s.",
                  dstats.general_stats_json_fid)
    if do_all or calculation == "duplicates":
        logs.info("\n* Calculating text duplicates.")
        dstats.load_or_prepare_text_duplicates()
        _log_result_files(dstats.duplicates_files)
    if do_all or calculation == "lengths":
        logs.info("\n* Calculating text lengths.")
        dstats.load_or_prepare_text_lengths()
        # Previously reported via print(); now uses the module logger for
        # consistency with the other measurements.
        _log_result_files(dstats.length_obj.get_filenames())
    if do_all or calculation == "labels":
        logs.info("\n* Calculating label statistics.")
        if dstats.label_field not in dstats.dset.features:
            logs.warning("No label field found.")
            logs.info("No label statistics to calculate.")
        else:
            dstats.load_or_prepare_labels()
            # (The original misnamed this dict `npmi_fid_dict`; these are
            # the label result files.)
            _log_result_files(dstats.label_files)
    if do_all or calculation == "npmi":
        logs.info("\n* Preparing nPMI.")
        dstats.load_or_prepare_npmi()
        _log_result_files(dstats.npmi_files)
    if do_all or calculation == "zipf":
        logs.info("\n* Preparing Zipf.")
        dstats.load_or_prepare_zipf()
        logs.info("Done!")
        zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids(
            dstats.dataset_cache_dir)
        logs.info("Zipf results now available at %s.", zipf_json_fid)
        logs.info("Figure saved to %s, with corresponding json at %s.",
                  zipf_fig_html_fid, zipf_fig_json_fid)
    # Don't do these until someone specifically asks for them -- they take awhile.
    if calculation == "embeddings":
        logs.info("\n* Preparing text embeddings.")
        dstats.load_or_prepare_embeddings()
    if calculation == "perplexities":
        logs.info("\n* Preparing text perplexities.")
        dstats.load_or_prepare_text_perplexities()
def pass_args_to_DMT(dset_name, dset_config, split_name, text_field, label_field,
                     label_names, calculation, dataset_cache_dir,
                     prepare_gui=False, use_cache=True):
    """Bundle the CLI arguments and dispatch to the right preparation path.

    Args:
        dset_name: Dataset name on the hub.
        dset_config: Dataset configuration name.
        split_name: Dataset split to measure.
        text_field: Column(s) handled as text.
        label_field: Column holding labels, if any.
        label_names: Human-readable label names.
        calculation: Which measurement to run (see ``load_or_prepare``).
        dataset_cache_dir: Local directory for cached results.
        prepare_gui: When True, prepare everything the GUI widgets need
            instead of a single calculation.
        use_cache: Whether previously cached results may be reused.
    """
    if not use_cache:
        logs.info("Not using any cache; starting afresh")
    dataset_args = dict(
        dset_name=dset_name,
        dset_config=dset_config,
        split_name=split_name,
        text_field=text_field,
        label_field=label_field,
        label_names=label_names,
        dataset_cache_dir=dataset_cache_dir,
    )
    if prepare_gui:
        load_or_prepare_widgets(dataset_args, use_cache=use_cache)
    else:
        load_or_prepare(dataset_args, calculation=calculation, use_cache=use_cache)
def set_defaults(args):
    """Fill in fallback values for CLI arguments the user left empty.

    Mutates ``args`` in place and returns it. Empty config/split/feature/
    label_field values fall back to "default"/"train"/"text"/"label",
    logging an informational line for each substitution.
    """
    fallbacks = (
        ("config", "default", "Config name not specified. Assuming it's 'default'."),
        ("split", "train", "Split name not specified. Assuming it's 'train'."),
        ("feature", "text", "Text column name not given. Assuming it's 'text'."),
        ("label_field", "label", "Label column name not given. Assuming it's 'label'."),
    )
    for attr, default, message in fallbacks:
        if not getattr(args, attr):
            setattr(args, attr, default)
            logs.info(message)
    return args
| def main(): | |
| parser = argparse.ArgumentParser( | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| description=textwrap.dedent( | |
| """ | |
| Example for hate speech18 dataset: | |
| python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text" | |
| Example for IMDB dataset: | |
| python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text" | |
| """ | |
| ), | |
| ) | |
| parser.add_argument( | |
| "-d", "--dataset", required=True, help="Name of dataset to prepare" | |
| ) | |
| parser.add_argument( | |
| "-c", "--config", required=False, default="", help="Dataset configuration to prepare" | |
| ) | |
| parser.add_argument( | |
| "-s", "--split", required=False, default="", type=str, | |
| help="Dataset split to prepare" | |
| ) | |
| parser.add_argument( | |
| "-f", | |
| "--feature", | |
| "-t", | |
| "--text-field", | |
| required=False, | |
| nargs="+", | |
| type=str, | |
| default="", | |
| help="Column to prepare (handled as text)", | |
| ) | |
| parser.add_argument( | |
| "-w", | |
| "--calculation", | |
| help="""What to calculate (defaults to everything except embeddings and perplexities).\n | |
| Options are:\n | |
| - `general` (for duplicate counts, missing values, length statistics.)\n | |
| - `duplicates` for duplicate counts\n | |
| - `lengths` for text length distribution\n | |
| - `labels` for label distribution\n | |
| - `embeddings` (Warning: Slow.)\n | |
| - `perplexities` (Warning: Slow.)\n | |
| - `npmi` for word associations\n | |
| - `zipf` for zipfian statistics | |
| """, | |
| ) | |
| parser.add_argument( | |
| "-l", | |
| "--label_field", | |
| type=str, | |
| required=False, | |
| default="", | |
| help="Field name for label column in dataset (Required if there is a label field that you want information about)", | |
| ) | |
| parser.add_argument('-n', '--label_names', nargs='+', default=[]) | |
| parser.add_argument( | |
| "--use_cache", | |
| default=False, | |
| required=False, | |
| action="store_true", | |
| help="Whether to use cached files (Optional)", | |
| ) | |
| parser.add_argument("--out_dir", default="cache_dir", | |
| help="Where to write out to.") | |
| parser.add_argument( | |
| "--overwrite_previous", | |
| default=False, | |
| required=False, | |
| action="store_true", | |
| help="Whether to overwrite a previous local cache for these same arguments (Optional)", | |
| ) | |
| parser.add_argument( | |
| "--email", | |
| default=None, | |
| help="An email that recieves a message about whether the computation was successful. If email is not None, then you must have EMAIL_PASSWORD=<your email password> for the sender email (data.measurements.tool@gmail.com) in a file named .env at the root of this repo.") | |
| parser.add_argument( | |
| "--push_cache_to_hub", | |
| default=False, | |
| required=False, | |
| action="store_true", | |
| help="Whether to push the cache to an organization on the hub. If you are using this option, you must have HUB_CACHE_ORGANIZATION=<the organization you've set up on the hub to store your cache> and HF_TOKEN=<your hf token> on separate lines in a file named .env at the root of this repo.", | |
| ) | |
| parser.add_argument("--prepare_GUI_data", default=False, required=False, | |
| action="store_true", | |
| help="Use this to process all of the stats used in the GUI.") | |
| parser.add_argument("--keep_local", default=True, required=False, | |
| action="store_true", | |
| help="Whether to save the data locally.") | |
| orig_args = parser.parse_args() | |
| args = set_defaults(orig_args) | |
| logs.info("Proceeding with the following arguments:") | |
| logs.info(args) | |
| # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi | |
| if args.email is not None: | |
| if Path(".env").is_file(): | |
| load_dotenv(".env") | |
| EMAIL_PASSWORD = getenv("EMAIL_PASSWORD") | |
| context = ssl.create_default_context() | |
| port = 465 | |
| server = smtplib.SMTP_SSL("smtp.gmail.com", port, context=context) | |
| server.login("data.measurements.tool@gmail.com", EMAIL_PASSWORD) | |
| dataset_cache_name, local_dataset_cache_dir = dataset_utils.get_cache_dir_naming(args.out_dir, args.dataset, args.config, args.split, args.feature) | |
| if not args.use_cache and exists(local_dataset_cache_dir): | |
| if args.overwrite_previous: | |
| shutil.rmtree(local_dataset_cache_dir) | |
| else: | |
| raise OSError("Cached results for this dataset already exist at %s. " | |
| "Delete it or use the --overwrite_previous argument." % local_dataset_cache_dir) | |
| # Initialize the local cache directory | |
| dataset_utils.make_path(local_dataset_cache_dir) | |
| # Initialize the repository | |
| # TODO: print out local or hub cache directory location. | |
| if args.push_cache_to_hub: | |
| repo = dataset_utils.initialize_cache_hub_repo(local_dataset_cache_dir, dataset_cache_name) | |
| # Run the measurements. | |
| try: | |
| pass_args_to_DMT( | |
| dset_name=args.dataset, | |
| dset_config=args.config, | |
| split_name=args.split, | |
| text_field=args.feature, | |
| label_field=args.label_field, | |
| label_names=args.label_names, | |
| calculation=args.calculation, | |
| dataset_cache_dir=local_dataset_cache_dir, | |
| prepare_gui=args.prepare_GUI_data, | |
| use_cache=args.use_cache, | |
| ) | |
| if args.push_cache_to_hub: | |
| repo.push_to_hub(commit_message="Added dataset cache.") | |
| computed_message = f"Data measurements have been computed for dataset" \ | |
| f" with these arguments: {args}." | |
| logs.info(computed_message) | |
| if args.email is not None: | |
| computed_message += "\nYou can return to the data measurements tool " \ | |
| "to view them." | |
| server.sendmail("data.measurements.tool@gmail.com", args.email, | |
| "Subject: Data Measurements Computed!\n\n" + computed_message) | |
| logs.info(computed_message) | |
| except Exception as e: | |
| logs.exception(e) | |
| error_message = f"An error occurred in computing data measurements " \ | |
| f"for dataset with arguments: {args}. " \ | |
| f"Feel free to make an issue here: " \ | |
| f"https://github.com/huggingface/data-measurements-tool/issues" | |
| if args.email is not None: | |
| server.sendmail("data.measurements.tool@gmail.com", args.email, | |
| "Subject: Data Measurements not Computed\n\n" + error_message) | |
| logs.warning("Data measurements not computed. ☹️") | |
| logs.warning(error_message) | |
| return | |
| if not args.keep_local: | |
| # Remove the dataset from local storage - we only want it stored on the hub. | |
| logs.warning("Deleting measurements data locally at %s" % local_dataset_cache_dir) | |
| shutil.rmtree(local_dataset_cache_dir) | |
| else: | |
| logs.info("Measurements made available locally at %s" % local_dataset_cache_dir) | |
| if __name__ == "__main__": | |
| main() | |