Spaces:
Runtime error
Runtime error
| import evaluate | |
| import logging | |
| import os | |
| import pandas as pd | |
| import plotly.express as px | |
| import utils | |
| import utils.dataset_utils as ds_utils | |
| from collections import Counter | |
| from os.path import exists, isdir | |
| from os.path import join as pjoin | |
# Key used to pull the text column out of the prepared text dataset
# (see DMTHelper.__init__, which indexes dstats.text_dset with it).
TEXT = "text"
# These are string constants defined in the evaluate library.
# They may need to be updated if the evaluate library changes these strings.
DUPS_FRAC = "duplicate_fraction"
# Evaluate calls the dictionary a "list".
DUPS_DICT = "duplicates_dict"
# This isn't in the evaluate measurement, but TODO to add that...
# DUPS_SUM = "duplicate_sum"
# Module-level logger obtained from the project's logging helper.
logs = utils.prepare_logging(__file__)
class DMTHelper:
    """Helper class for the Data Measurements Tool.

    Keeps all variables and functions related to text duplicates in one
    place. Handles caching of results and delegates the actual computation
    to the ``text_duplicates`` measurement of the evaluate library.
    """

    def __init__(self, dstats, load_only, save):
        """Collect the dataset, the cache settings and the cache file paths.

        Args:
            dstats: Dataset statistics object. Its ``text_dset``,
                ``use_cache`` and ``dataset_cache_dir`` attributes are read,
                and ``load_or_prepare_text_dset()`` is called when
                ``text_dset`` is still ``None``.
            load_only: If true, results are only loaded from cache and
                never computed.
            save: If true, newly computed results are written to the cache.
        """
        # The None check has to happen BEFORE indexing: subscripting an
        # unloaded (None) dataset raises TypeError, which made the lazy-load
        # fallback unreachable.
        if dstats.text_dset is None:
            dstats.load_or_prepare_text_dset()
        # Input HuggingFace Dataset: always keep only the text column,
        # whether or not the dataset had to be loaded here.
        self.dset = dstats.text_dset[TEXT]
        self.use_cache = dstats.use_cache
        # Note: This is None as it can be called different times with different
        # settings, and so we want fresh results each time. With the evaluate
        # integration, results are different depending on whether
        # list_duplicates is set.
        self.duplicates_results = None
        self.cache_dir = dstats.dataset_cache_dir
        self.save = save
        self.load_only = load_only
        # Filenames
        self.dups_dir = "text_duplicates"
        dups_json = "text_duplicates.json"
        dups_html = "text_duplicates.html"
        self.dups_result_json_fid = pjoin(self.cache_dir, self.dups_dir, dups_json)
        self.dups_result_html_fid = pjoin(self.cache_dir, self.dups_dir, dups_html)

    def run_DMT_processing(self, list_duplicates=True):
        """Load the duplicate results from cache or compute them.

        DMT uses the full duplicates list in a widget, so
        ``list_duplicates`` defaults to True.

        Args:
            list_duplicates: Forwarded to the evaluate measurement when the
                results have to be computed.
        """
        # First look to see what we can load from cache.
        # NOTE: cached results are used as-is; the cache file does not record
        # which list_duplicates setting produced it.
        if self.use_cache:
            self.duplicates_results = self._load_duplicates_cache()
            if self.duplicates_results:
                logs.info("Loaded cached text duplicate results.")
        # Compute only when nothing usable was loaded and computing is allowed.
        if not self.duplicates_results and not self.load_only:
            self.duplicates_results = self._prepare_duplicates(
                list_duplicates=list_duplicates
            )
            logs.info("Prepared duplicates.")
            if self.save:
                self._write_duplicates_cache()

    def _prepare_duplicates(self, list_duplicates=True):
        """Wraps the evaluate library.

        Returns:
            Whatever the ``text_duplicates`` measurement computes for
            ``self.dset``.
        """
        duplicates = evaluate.load("text_duplicates")
        return duplicates.compute(data=self.dset, list_duplicates=list_duplicates)

    def _load_duplicates_cache(self):
        """Loads previously computed results from cache.

        Returns:
            The content of the cached JSON file, or an empty dict when the
            file does not exist.
        """
        if exists(self.dups_result_json_fid):
            return ds_utils.read_json(self.dups_result_json_fid)
        return {}

    def _write_duplicates_cache(self):
        """Writes newly computed results to cache (JSON and HTML)."""
        # The cache directory is created even when there is nothing to write.
        ds_utils.make_path(pjoin(self.cache_dir, self.dups_dir))
        if self.duplicates_results:
            ds_utils.write_json(self.duplicates_results, self.dups_result_json_fid)
            # TODO: Use df_to_html rather than write_json_as_html;
            # this will make it possible to order the results.
            # But they must first be turned into a dataframe.
            ds_utils.write_json_as_html(
                self.duplicates_results, self.dups_result_html_fid
            )

    def get_duplicates_filenames(self):
        """Return the cache file paths, keyed by "statistics" and "html"."""
        return {
            "statistics": self.dups_result_json_fid,
            "html": self.dups_result_html_fid,
        }