Spaces:

HuggingFaceM4
/

IDEFICS_Data_Measurement_Tool

Runtime error

App Files Files Community

IDEFICS_Data_Measurement_Tool / data_measurements /text_duplicates /text_duplicates.py

Ezi

Upload 312 files

46df0b6 over 2 years ago

raw

history blame

3.88 kB

	import evaluate
	import logging
	import os
	import pandas as pd
	import plotly.express as px
	import utils
	import utils.dataset_utils as ds_utils
	from collections import Counter
	from os.path import exists, isdir
	from os.path import join as pjoin

	TEXT = "text"
	# These are string constants defined in the evaluate library.
	# They may need to be updated if the evaluate library changes these strings
	DUPS_FRAC = "duplicate_fraction"
	# Evaluate calls the dictionary a "list"
	DUPS_DICT = "duplicates_dict"
	# This isn't in the evaluate measurement, but TODO to add that...
	# DUPS_SUM = "duplicate_sum"

	logs = utils.prepare_logging(__file__)

	class DMTHelper:
	"""Helper class for the Data Measurements Tool.
	This allows us to keep all variables and functions related to labels
	in one file.
	Does caching and using the evaluate library for computation.
	"""

	def __init__(self, dstats, load_only, save):
	# Input HuggingFace Dataset.
	self.dset = dstats.text_dset[TEXT]
	if self.dset is None:
	dstats.load_or_prepare_text_dset()
	self.dset = dstats.text_dset
	self.use_cache = dstats.use_cache
	# Note: This is None as it can be called different times with different
	# settings, and so we want fresh results each time. With the evaluate
	# integration, results are different depending on whether
	# list_duplicates is set.
	self.duplicates_results = None
	self.cache_dir = dstats.dataset_cache_dir
	self.save = save
	self.load_only = load_only
	# Filenames
	self.dups_dir = "text_duplicates"
	dups_json = "text_duplicates.json"
	dups_html = "text_duplicates.html"
	self.dups_result_json_fid = pjoin(self.cache_dir, self.dups_dir, dups_json)
	self.dups_result_html_fid = pjoin(self.cache_dir, self.dups_dir, dups_html)

	def run_DMT_processing(self, list_duplicates=True):
	"""Calls functions to do the main work.
	DMT uses the full duplicates list in a widget,
	so it is set to default True.
	"""
	# First look to see what we can load from cache.
	if self.use_cache:
	self.duplicates_results = self._load_duplicates_cache()
	if self.duplicates_results:
	logs.info("Loaded cached text duplicate results.")
	if not self.duplicates_results and not self.load_only:
	self.duplicates_results = self._prepare_duplicates(list_duplicates=list_duplicates)
	logs.info("Prepared duplicates.")
	if self.save:
	self._write_duplicates_cache()

	def _prepare_duplicates(self, list_duplicates=True):
	"""Wraps the evaluate library."""
	duplicates = evaluate.load("text_duplicates")
	results = duplicates.compute(data=self.dset, list_duplicates=list_duplicates)
	return results

	def _load_duplicates_cache(self):
	"""Loads previously computed results from cache."""
	results = {}
	if exists(self.dups_result_json_fid):
	results = ds_utils.read_json(self.dups_result_json_fid)
	return results

	def _write_duplicates_cache(self):
	"""Writes newly computed results to cache."""
	ds_utils.make_path(pjoin(self.cache_dir, self.dups_dir))
	if self.duplicates_results:
	ds_utils.write_json(self.duplicates_results, self.dups_result_json_fid)
	# TODO: Use df_to_html rather than write_json_as_html;
	# this will make it possible to order the results.
	# But they must first be turned into a dataframe.
	ds_utils.write_json_as_html(self.duplicates_results, self.dups_result_html_fid)

	def get_duplicates_filenames(self):
	dups_fid_dict = {"statistics": self.dups_result_json_fid, "html":self.dups_result_html_fid}
	return dups_fid_dict