Spaces:
Runtime error
| import string | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import WordPunctTokenizer | |
| import pymorphy2 | |
class DataPreprocessor:
    """Tokenize and lemmatize Russian texts for downstream NLP tasks.

    Pipeline: lowercase + WordPunct tokenization, then stop-token/punctuation
    removal and pymorphy2 lemmatization.  The stoplist is NLTK's Russian
    stopwords plus ASCII punctuation, minus the semantically significant
    words 'и', 'или', 'не' (and/or/not), which are kept.
    """

    def __init__(self):
        # quiet=True: suppress the repeated download log/banner that the
        # original emitted on every instantiation (download is a no-op once
        # the corpus is already present).
        nltk.download('stopwords', quiet=True)
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # Keep conjunctions/negation that carry meaning for many tasks.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)
        # Memoized token -> normal_form map: real corpora repeat tokens
        # heavily and MorphAnalyzer.parse is comparatively expensive.
        self._lemma_cache = {}

    def tokenize_data(self, texts):
        """Return a list of lowercase token lists, one per input text."""
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def _normal_form(self, token):
        """Return the lemma (normal form) of *token*, cached per instance."""
        lemma = self._lemma_cache.get(token)
        if lemma is None:
            lemma = self.morph.parse(token)[0].normal_form
            self._lemma_cache[token] = lemma
        return lemma

    def lemmatize_tokens_string(self, tokens_string):
        """Drop stop tokens/punctuation and lemmatize the remaining tokens.

        Returns a new list; the input sequence is not modified.
        """
        return [self._normal_form(t) for t in tokens_string if t not in self.stop_tokens]

    def lemmatize_tokens(self, tokens):
        """Lemmatize each token list of *tokens* in place.

        Returns None (stdlib convention for in-place mutators).
        """
        for i, token_list in enumerate(tokens):
            tokens[i] = self.lemmatize_tokens_string(token_list)

    def preprocess_texts(self, texts):
        """Tokenize then lemmatize *texts*; return a list of lemma lists."""
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens