Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import utils | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| logs = utils.prepare_logging(__file__) | |
| TEXT = "text" | |
| TOKENIZED_TEXT = "tokenized_text" | |
| class Tokenize: | |
| def __init__(self, text_dset, feature=TEXT, tok_feature=TOKENIZED_TEXT, | |
| lowercase=True): | |
| self.text_dset = text_dset | |
| self.feature = feature | |
| self.tok_feature = tok_feature | |
| self.lowercase = lowercase | |
| # Pattern for tokenization | |
| self.cvec = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", | |
| lowercase=lowercase) | |
| self.tokenized_dset = self.do_tokenization() | |
| def do_tokenization(self): | |
| """ | |
| Tokenizes a Hugging Face dataset in the self.feature field. | |
| :return: Hugging Face Dataset with tokenized text in self.tok_feature. | |
| """ | |
| sent_tokenizer = self.cvec.build_tokenizer() | |
| def tokenize_batch(examples): | |
| if self.lowercase: | |
| tok_sent = { | |
| self.tok_feature: [tuple(sent_tokenizer(text.lower())) for | |
| text in examples[self.feature]]} | |
| else: | |
| tok_sent = { | |
| self.tok_feature: [tuple(sent_tokenizer(text)) for text in | |
| examples[self.feature]]} | |
| return tok_sent | |
| tokenized_dset = self.text_dset.map( | |
| tokenize_batch, | |
| batched=True | |
| ) | |
| logs.info("Tokenized the dataset.") | |
| return tokenized_dset | |
| def get(self): | |
| return self.tokenized_dset | |
| def get_df(self): | |
| return pd.DataFrame(self.tokenized_dset) | |