Commit 66693d5
meg-huggingface committed
Parent(s): e1f2cc3

Removing the need to keep around the base dset for the header widget; now just saving what is shown -- the first n lines of the base dataset -- as a json, and loading it if it's cached.
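The change follows a load-or-prepare pattern: reuse a cached JSON of the first n rows when it exists, otherwise slice the base dataset and optionally write the slice out for next time. A minimal standalone sketch of that pattern, with illustrative names (load_or_prepare_peek and peek_fid are not the Space's actual API):

    import json
    from os.path import exists

    def load_or_prepare_peek(dset, peek_fid, n=100, use_cache=True, save=True):
        # Return the first n rows of a Hugging Face dataset, reading a cached JSON when available.
        if use_cache and exists(peek_fid):
            with open(peek_fid, "r") as f:
                return json.load(f)["dset_peek"]
        peek = dset[:n]  # slicing an HF Dataset returns a dict mapping column names to lists
        if save:
            with open(peek_fid, "w") as f:
                json.dump({"dset_peek": peek}, f)
        return peek

This is what lets the header widget show a preview without keeping the full base dataset in memory. The sketch uses the key "dset_peek" for both reading and writing; in the diff below, the value is written under "dset_peek" but read back as "dset peek".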
data_measurements/dataset_statistics.py
CHANGED
@@ -185,6 +185,7 @@ class DatasetStatisticsCacheClass:
         self.dset = None  # original dataset
         # HF dataset with all of the self.text_field instances in self.dset
         self.text_dset = None
+        self.dset_peek = None
         # HF dataset with text embeddings in the same order as self.text_dset
         self.embeddings_dset = None
         # HF dataset with all of the self.label_field instances in self.dset
@@ -254,6 +255,7 @@ class DatasetStatisticsCacheClass:
             logs.warning("Creating cache directory %s." % self.cache_path)
             mkdir(self.cache_path)
         self.dset_fid = pjoin(self.cache_path, "base_dset")
+        self.dset_peek_fid = pjoin(self.cache_path, "dset_peek.json")
         self.text_dset_fid = pjoin(self.cache_path, "text_dset")
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
@@ -283,10 +285,6 @@ class DatasetStatisticsCacheClass:
             use_streaming=True,
         )
 
-    def get_dataset_peek(self):
-        self.get_base_dataset()
-        return self.dset[:100]
-
     def load_or_prepare_general_stats(self, use_cache=False, save=True):
         """
         Content for expander_general_stats widget.
@@ -462,7 +460,19 @@ class DatasetStatisticsCacheClass:
         self.load_or_prepare_text_dset(use_cache, save)
         logs.info("Doing tokenized dataframe")
         self.load_or_prepare_tokenized_df(use_cache, save)
+        logs.info("Doing dataset peek")
+        self.load_or_prepare_dset_peek(save, use_cache)
 
+    def load_or_prepare_dset_peek(self, save, use_cache):
+        if use_cache and exists(self.dset_peek_fid):
+            with open(self.dset_peek_fid, "r") as f:
+                self.dset_peek = json.load(f)["dset peek"]
+        else:
+            if self.dset is None:
+                self.get_base_dataset()
+            self.dset_peek = self.dset[:100]
+            if save:
+                write_json({"dset_peek": self.dset_peek}, self.dset_peek_fid)
 
     def load_or_prepare_tokenized_df(self, use_cache, save):
         if (use_cache and exists(self.tokenized_df_fid)):
@@ -483,20 +493,23 @@ class DatasetStatisticsCacheClass:
             logs.info(self.text_dset)
         # ...Or load it from the server and store it anew
         else:
-            self.get_base_dataset()
-            # extract all text instances
-            self.text_dset = self.dset.map(
-                lambda examples: extract_field(
-                    examples, self.text_field, OUR_TEXT_FIELD
-                ),
-                batched=True,
-                remove_columns=list(self.dset.features),
-            )
+            self.prepare_text_dset()
             if save:
                 # save extracted text instances
                 logs.warning("Saving dataset to disk")
                 self.text_dset.save_to_disk(self.text_dset_fid)
 
+    def prepare_text_dset(self):
+        self.get_base_dataset()
+        # extract all text instances
+        self.text_dset = self.dset.map(
+            lambda examples: extract_field(
+                examples, self.text_field, OUR_TEXT_FIELD
+            ),
+            batched=True,
+            remove_columns=list(self.dset.features),
+        )
+
     def do_tokenization(self):
         """
         Tokenizes the dataset
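write_json is called in load_or_prepare_dset_peek but is not defined in this diff; it is presumably a small helper elsewhere in the repo. A stand-in with the assumed behavior (serialize a dict to a path), included only to make the diff readable on its own:

    import json

    def write_json(json_dict, filename):
        # Assumed behavior of the repo's helper: dump a dict to a JSON file at the given path.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(json_dict, f)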
data_measurements/streamlit_utils.py
CHANGED
@@ -99,7 +99,7 @@ def expander_header(dstats, ds_name_to_dict, column_id):
         st.markdown(
             ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
         )
-        st.dataframe(dstats.get_dataset_peek())
+        st.dataframe(dstats.dset_peek)
 
 
 def expander_general_stats(dstats, column_id):
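Taken together, the header widget now renders the cached peek directly. A hedged sketch of the new flow, assuming dstats is an already-constructed DatasetStatisticsCacheClass instance:

    import streamlit as st

    # Fill dstats.dset_peek from the JSON cache, or compute it from the base dataset and save it.
    dstats.load_or_prepare_dset_peek(save=True, use_cache=True)
    # The header widget then just displays the first rows, with no need to keep the base dataset around.
    st.dataframe(dstats.dset_peek)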