Spaces:

SoooSlooow
/

AdsClassifier

Runtime error

App Files Files Community

SoooSlooow commited on Feb 16, 2023

Commit

d1ef404

1 Parent(s): a44a953

upload src

Browse files

Files changed (12) hide show

app.py +70 -0
models/nnet/nnet.pt +3 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/data/__init__.py +0 -0
src/data/__pycache__/__init__.cpython-310.pyc +0 -0
src/data/__pycache__/preprocessing_utils.cpython-310.pyc +0 -0
src/data/preprocessing_utils.py +36 -0
src/models/__init__.py +0 -0
src/models/__pycache__/__init__.cpython-310.pyc +0 -0
src/models/__pycache__/models_utils.cpython-310.pyc +0 -0
src/models/models_utils.py +591 -0

app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import gradio as gr
+import torch
+from src.data.preprocessing_utils import DataPreprocessor
+MODEL_FILEPATH = 'models/nnet/nnet.pt'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+with open(MODEL_FILEPATH, 'rb') as file:
+    clf = torch.load(file, map_location=device)
+preprocessor = DataPreprocessor()
+strings = {
+    'nationality': 'Есть предпочтения по национальности',
+    'families': 'Есть предпочтение семьям',
+    'sex': 'Есть предпочтения по полу'
+}
+examples = [
+    '''Просьба посредников не беспокоить. Ищем ОДНУ ДЕВУШКУ.
+Сдаётся в аренду на длительный срок светлая и уютная квартира - студия
+общей площадью 33м2, находящаяся на 4м этаже 5и этажного теплого кирпичного дома. Современный ремонт!
+Рядом в пешей доступности парк Красная Пресня (5 мин)/ Красногвардейские Пруды (2 мин)/
+Москва-Сити (10 мин)! Магазины/кофейни/рестораны! 10 мин на машине до любой точки в центре города!
+В квартире есть вся необходимая для проживания мебель и техника.
+Строго без животных, строго Славян. Просмотр в любое время - ключи на руках.
+    ''',
+    '''Сдам на длительный срок семейной паре, только с гражданством РФ.
+Квартира после косметического ремонта. Без мебели.
+Есть кухонная мебель и мебель в ванной комнате.
+Бытовая техника для проживания присутствует.
+Оплата = аренда + счётчики (свет, вода).
+    ''',
+    '''В современном доме. Собственник без комиссии.
+Закрытая территория. Доступ через охрану.
+М Прокшино 10 мин пешком.
+Без детей и животных.
+Возможно без залога.
+Счетчики и интернет включены в стоимость
+    '''
+]
+def make_output_string(labels):
+    output_list = []
+    for label in strings.keys():
+        if labels[label]:
+            output_list.append(strings[label])
+    if output_list:
+        output_str = ', '.join(output_list).capitalize()
+    else:
+        output_str = 'Нет особенностей'
+    return output_str
+def predict_label(text):
+    preprocessed_text = preprocessor.preprocess_texts([text])
+    print(preprocessed_text)
+    if preprocessed_text == [[]]:
+        return 'Введите текст объявления!'
+    labels = clf.predict_labels(preprocessed_text)
+    output_str = make_output_string(labels)
+    return output_str
+demo = gr.Interface(fn=predict_label, inputs=[gr.Text(label="Текст объявления", lines=5)],
+                    outputs=[gr.Textbox(label="Особенности объявления")],
+                    examples=examples)
+demo.launch()

models/nnet/nnet.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eca046bc6417544613037ccbd7c55537dbfa0d44d181480b0c5aedc32b775877
+size 3474673281

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (156 Bytes). View file

src/data/__init__.py ADDED Viewed

File without changes

src/data/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (161 Bytes). View file

src/data/__pycache__/preprocessing_utils.cpython-310.pyc ADDED Viewed

Binary file (1.92 kB). View file

src/data/preprocessing_utils.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import string
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import WordPunctTokenizer
+import pymorphy2
+class DataPreprocessor:
+    def __init__(self):
+        nltk.download('stopwords')
+        self.morph = pymorphy2.MorphAnalyzer()
+        self.tokenizer = WordPunctTokenizer()
+        self.punctuation = set(string.punctuation)
+        self.stopwords_russian = stopwords.words("russian")
+        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)
+    def tokenize_data(self, texts):
+        tokens = [self.tokenizer.tokenize(str(text).lower()) for text in texts]
+        return tokens
+    def lemmatize_tokens_string(self, tokens_string):
+        new_tokens = []
+        for token in tokens_string:
+            if token not in self.stop_tokens:
+                new_tokens.append(self.morph.parse(token)[0].normal_form)
+        return new_tokens
+    def lemmatize_tokens(self, tokens):
+        for i in range(len(tokens)):
+            tokens[i] = self.lemmatize_tokens_string(tokens[i])
+    def preprocess_texts(self, texts):
+        tokens = self.tokenize_data(texts)
+        self.lemmatize_tokens(tokens)
+        return tokens

src/models/__init__.py ADDED Viewed

File without changes

src/models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (163 Bytes). View file

src/models/__pycache__/models_utils.cpython-310.pyc ADDED Viewed

Binary file (18.4 kB). View file

src/models/models_utils.py ADDED Viewed

	@@ -0,0 +1,591 @@

+import os
+import pickle
+import numpy as np
+import pandas as pd
+from gensim.models import KeyedVectors
+from collections import Counter
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from sklearn.metrics import roc_auc_score, precision_recall_curve
+import tqdm
+from copy import deepcopy
+import matplotlib.pyplot as plt
+from transformers import DistilBertTokenizer, DistilBertModel
+def get_roc_aucs(y, probas):
+    y_onehot = pd.get_dummies(y)
+    roc_auc_scores = []
+    if y_onehot.shape[1] > 2:
+        for i in range(y_onehot.shape[1]):
+            roc_auc_scores.append(roc_auc_score(y_onehot[i], probas[:, i]))
+        roc_auc_scores.append(roc_auc_score(y, probas, multi_class='ovo', average='macro'))
+    else:
+        roc_auc_scores.append(roc_auc_score(y, probas[:, 1]))
+    return roc_auc_scores
+def get_max_f1_score(y, probas):
+    if probas.shape[1] != 2:
+        raise ValueError('Expected probabilities for 2 classes would be given')
+    y_onehot = pd.get_dummies(y)
+    f1_score = []
+    threshold = []
+    p, r, t = precision_recall_curve(y, probas[:, 1])
+    f1_scores = 2 * p * r / (p + r + 0.001)
+    threshold.append(t[np.argmax(f1_scores)])
+    f1_score.append(np.max(f1_scores))
+    return f1_score, threshold
+class RNN(nn.Module):
+    def __init__(self, vectors, n_of_words, n_of_classes, num_layers, bidirectional):
+        dim = vectors.shape[1]
+        d = 2 if bidirectional else 1
+        super().__init__()
+        self.emb = nn.Embedding(n_of_words, dim)
+        self.emb.load_state_dict({'weight': torch.tensor(vectors)})
+        self.emb.weight.requires_grad = False
+        self.gru = nn.GRU(input_size=dim, hidden_size=dim, batch_first=True,
+                          num_layers=num_layers, bidirectional=bidirectional)
+        self.linear = nn.Linear(dim * num_layers * d, n_of_classes)
+    def forward(self, batch):
+        emb = self.emb(batch)
+        _, last_state = self.gru(emb)
+        last_state = torch.permute(last_state, (1, 0, 2)).reshape(1, batch.shape[0], -1).squeeze()
+        out = self.linear(last_state.squeeze())
+        if len(out.size()) == 1:
+            out = out.unsqueeze(0)
+        return out
+class DistilBERTClass(torch.nn.Module):
+    def __init__(self, n_classes):
+        super().__init__()
+        self.l1 = DistilBertModel.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational')
+        self.linear = torch.nn.Linear(768, n_classes)
+    def forward(self, input_ids, attention_mask, token_type_ids):
+        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
+        hidden_state = output_1[0]
+        pooler = hidden_state[:, 0]
+        output = self.linear(pooler)
+        return output
+class BaseClassifier:
+    def __init__(self, batch_size=16, epochs=100):
+        self.batch_size = batch_size
+        self.epochs = epochs
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    def preprocess_with_random_initialization(self, train_tokens):
+        self.pad_idx = 0
+        self.unk_idx = 1
+        set_of_words = set()
+        for tokens_string in train_tokens:
+            set_of_words.update(tokens_string)
+        self.idx_to_word = ['PADDING', 'UNK'] + list(set_of_words)
+        self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
+        self.amount_of_words = len(self.idx_to_word)
+        self.vectors = np.zeros((len(self.idx_to_word), 300))
+        self.vectors[0, :] = np.zeros(300)
+        self.vectors[1:len(self.idx_to_word), :] = (np.random.rand(len(self.idx_to_word) - 1, 300) - 0.5) / 300
+    def preprocess(self, vectors_file_path):
+        self.emb = KeyedVectors.load_word2vec_format(vectors_file_path)
+        self.pad_idx = 0
+        self.unk_idx = 1
+        self.idx_to_word = ['PADDING', 'UNK'] + list(self.emb.index_to_key)
+        self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
+        self.amount_of_words = len(self.idx_to_word)
+        self.vectors = np.zeros((len(self.idx_to_word), 300))
+        self.vectors[0, :] = np.zeros(300)
+        self.vectors[1, :] = (np.random.rand(300) - 0.5) / 300
+        for i in range(2, len(self.idx_to_word)):
+            self.vectors[i, :] = self.emb.get_vector(self.idx_to_word[i])
+    def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
+            reinitialize=True, stop_epochs=None, show_logs=False):
+        if reinitialize:
+            self.n_of_classes = y_train.nunique()
+            self.initialize_nnet()
+        self.print_test = test_tokens and y_test
+        self.stop_epochs = stop_epochs
+        train_scores = []
+        self.train_scores_mean = []
+        self.test_scores = []
+        self.test_aucs = []
+        self.test_f1 = []
+        criterion = nn.CrossEntropyLoss()
+        for epoch in tqdm.tqdm(range(self.epochs)):
+            self.epoch = epoch
+            self.nnet.train()
+            train_batches = self.batch_generator(train_tokens, y_train)
+            test_batches = self.batch_generator(test_tokens, y_test)
+            for i, batch in tqdm.tqdm(
+                    enumerate(train_batches),
+                    total=len(train_tokens) // self.batch_size
+            ):
+                pred = self.nnet(batch['tokens'])
+                loss = criterion(pred, batch['labels'])
+                self.optimizer.zero_grad()
+                loss.backward()
+                self.optimizer.step()
+                if show_logs and i % 400 == 0:
+                    train_score = criterion(self.nnet(batch['tokens']), batch['labels'])
+                    print(train_score.item())
+                    train_scores.append(train_score.item())
+            if show_logs:
+                self.train_scores_mean.append(sum(train_scores) / len(train_scores))
+                train_scores = []
+                if self.print_test:
+                    test_pred_prob = torch.tensor([], device='cpu')
+                    with torch.no_grad():
+                        self.nnet.eval()
+                        for batch in test_batches:
+                            test_batch_pred_prob = self.nnet(batch['tokens'])
+                            test_batch_pred_prob_cpu = test_batch_pred_prob.to('cpu')
+                            test_pred_prob = torch.cat((test_pred_prob, test_batch_pred_prob_cpu), 0)
+                    test_score = criterion(test_pred_prob, torch.tensor(y_test.values, device='cpu'))
+                    self.test_scores.append(test_score.item())
+                    test_pred_probas = F.softmax(test_pred_prob).detach().cpu().numpy()
+                    self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
+                    self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])
+                self.print_metrics()
+            if self.early_stopping_check():
+                break
+    def count_tokens(self, tokens):
+        self.words_counter = Counter()
+        self.amount_of_tokens = 0
+        for s in tokens:
+            self.words_counter.update(s)
+            self.amount_of_tokens += len(s)
+    def index_tokens(self, tokens_string):
+        return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]
+    def fill_with_pads(self, tokens):
+        tokens = deepcopy(tokens)
+        max_len = 0
+        for tokens_string in tokens:
+            max_len = max(max_len, len(tokens_string))
+        for tokens_string in tokens:
+            for i in range(len(tokens_string), max_len):
+                tokens_string.append(self.pad_idx)
+        return tokens
+    def as_matrix(self, tokens):
+        tokens = deepcopy(tokens)
+        for j, s in enumerate(tokens):
+            tokens[j] = self.index_tokens(s)
+        tokens = self.fill_with_pads(tokens)
+        return tokens
+    def batch_generator(self, tokens, labels=None):
+        for i in range(0, len(tokens), self.batch_size):
+            batch_tokens = tokens[i: i + self.batch_size]
+            if labels:
+                batch_labels = torch.tensor(labels.values[i: i + self.batch_size],
+                                            dtype=torch.long,
+                                            device=self.device)
+            else:
+                batch_labels = None
+            batch_tokens_idx = torch.tensor(self.as_matrix(batch_tokens),
+                                            dtype=torch.int,
+                                            device=self.device)
+            if len(batch_tokens_idx.size()) == 1:
+                batch_tokens_idx = torch.unsqueeze(batch_tokens_idx, 0)
+            batch = {
+                'tokens': batch_tokens_idx,
+                'labels': batch_labels
+            }
+            yield batch
+    def print_metrics(self, print_test=True):
+        if self.print_test:
+            print(f'epoch {self.epoch}/{self.epochs}')
+            print('auc', self.test_aucs[-1])
+            print('score', self.test_scores[-1])
+            print('f1 score', self.test_f1[-1])
+            legend_labels = []
+            if self.n_of_classes > 2:
+                for i in range(self.n_of_classes):
+                    legend_labels.append(f'Class {i}')
+            legend_labels.append('General')
+            plt.figure(figsize=(5, 15))
+            plt.clf()
+            plt.subplot(3, 1, 1)
+            plt.plot(np.arange(1, self.epoch + 2), self.test_aucs)
+            plt.grid()
+            plt.title('Test ROC AUC')
+            plt.xlabel('Num. of epochs')
+            plt.ylabel('ROC AUC')
+            plt.legend(legend_labels)
+            plt.subplot(3, 1, 2)
+            plt.plot(np.arange(1, self.epoch + 2), self.test_f1)
+            plt.grid()
+            plt.title('Test F1-score')
+            plt.xlabel('Num. of epochs')
+            plt.ylabel('F1-score')
+            plt.legend(legend_labels)
+            plt.subplot(3, 1, 3)
+            plt.plot(np.arange(1, self.epoch + 2), self.train_scores_mean, label='Train loss')
+            plt.plot(np.arange(1, self.epoch + 2), self.test_scores, label='Test loss')
+            plt.title('Loss')
+            plt.xlabel('Num. of epochs')
+            plt.ylabel('Loss')
+            plt.legend()
+            plt.grid()
+            plt.draw()
+        else:
+            plt.figure(figsize=(5, 15))
+            plt.plot(np.arange(1, self.epoch + 2), self.train_scores_mean, label='Train loss')
+            plt.title('Loss')
+            plt.xlabel('Num. of epochs')
+            plt.ylabel('Loss')
+            plt.legend()
+            plt.grid()
+            plt.show()
+    def early_stopping_check(self):
+        if self.stop_epochs is None or self.stop_epochs >= len(self.test_scores):
+            return False
+        else:
+            print(self.test_scores)
+            first_score = np.array(self.test_scores)[-self.stop_epochs - 1]
+            last_scores = np.array(self.test_scores)[-self.stop_epochs:]
+            return np.all(last_scores >= first_score)
+    def predict_proba(self, tokens, labels):
+        batches = self.batch_generator(tokens, labels)
+        pred_probas = torch.tensor([], device=self.device)
+        with torch.no_grad():
+            self.nnet.eval()
+            for batch in batches:
+                batch_prob = self.nnet(batch['tokens'])
+                pred_probas = torch.cat((pred_probas, batch_prob))
+        return F.softmax(pred_probas).detach().cpu().numpy()
+class RNNClassifier(BaseClassifier):
+    def __init__(self, batch_size=16, epochs=100,
+                 num_layers=1, bidirectional=False):
+        self.batch_size = batch_size
+        self.epochs = epochs
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.num_layers = num_layers
+        self.bidirectional = bidirectional
+    def initialize_nnet(self):
+        self.nnet = RNN(self.vectors, self.amount_of_words,
+                        n_of_classes=self.n_of_classes,
+                        num_layers=self.num_layers,
+                        bidirectional=self.bidirectional).to(self.device)
+        self.optimizer = torch.optim.Adam(self.nnet.parameters())
+    def save_model(self, filepath):
+        with open(filepath, 'wb') as file:
+            torch.save(self.nnet.state_dict(), file)
+    def load_model(self, filepath, amount_of_words):
+        self.amount_of_words = amount_of_words
+        self.vectors = np.zeros((amount_of_words, 300))
+        self.n_of_classes = 2
+        self.nnet = RNN(self.vectors, self.amount_of_words,
+                        n_of_classes=self.n_of_classes,
+                        num_layers=self.num_layers,
+                        bidirectional=self.bidirectional).to(self.device)
+        self.nnet.load_state_dict(torch.load(filepath, map_location=self.device))
+class DBERTClassifier(BaseClassifier):
+    def __init__(self, batch_size=16, epochs=100):
+        self.batch_size = batch_size
+        self.epochs = epochs
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    def initialize_nnet(self):
+        self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
+        self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
+        # 'DeepPavlov/rubert-base-cased' 'DeepPavlov/distilrubert-small-cased-conversational',
+        self.tokenizer = DistilBertTokenizer.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational',
+                                                             do_lower_case=True)
+    def batch_generator(self, tokens, labels=None):
+        for i in range(0, len(tokens), self.batch_size):
+            batch_tokens = tokens[i: i + self.batch_size]
+            batch_tokens = [' '.join(s) for s in batch_tokens]
+            if labels:
+                batch_labels = torch.tensor(labels.values[i: i + self.batch_size],
+                                            dtype=torch.long,
+                                            device=self.device)
+            else:
+                batch_labels = None
+            if len(batch_tokens) == 1:
+                inputs = self.tokenizer.encode_plus(
+                    batch_tokens,
+                    None,
+                    add_special_tokens=True,
+                    max_length=512,
+                    truncation=True,
+                    pad_to_max_length=True,
+                    return_token_type_ids=True
+                )
+            else:
+                inputs = self.tokenizer.batch_encode_plus(
+                    batch_tokens,
+                    add_special_tokens=True,
+                    max_length=512,
+                    truncation=True,
+                    pad_to_max_length=True,
+                    return_token_type_ids=True
+                )
+            batch_token_ids = torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long)
+            batch_mask = torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long)
+            batch_token_type_ids = torch.tensor(inputs["token_type_ids"], device=self.device, dtype=torch.long)
+            if len(batch_tokens) == 1:
+                batch_token_ids = batch_token_ids.unsqueeze(0)
+                batch_mask = batch_mask.unsqueeze(0)
+                batch_token_type_ids = batch_token_type_ids.unsqueeze(0)
+            batch = {
+                'tokens': batch_token_ids,
+                'mask': batch_mask,
+                'token_type_ids': batch_token_type_ids,
+                'labels': batch_labels
+            }
+            yield batch
+    def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
+            reinitialize=True, stop_epochs=None, show_logs=False):
+        if reinitialize:
+            self.n_of_classes = y_train.nunique()
+            self.initialize_nnet()
+        self.stop_epochs = stop_epochs
+        self.print_test = test_tokens and y_test
+        train_scores = []
+        self.train_scores_mean = []
+        self.test_scores = []
+        self.test_aucs = []
+        self.test_f1 = []
+        criterion = nn.CrossEntropyLoss()
+        for epoch in tqdm.tqdm(range(self.epochs)):
+            self.epoch = epoch
+            self.nnet.train()
+            train_batches = self.batch_generator(train_tokens, y_train)
+            test_batches = self.batch_generator(test_tokens, y_test)
+            for i, batch in tqdm.tqdm(
+                    enumerate(train_batches),
+                    total=len(train_tokens) // self.batch_size
+            ):
+                pred = self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids'])
+                loss = criterion(pred, batch['labels'])
+                self.optimizer.zero_grad()
+                loss.backward()
+                self.optimizer.step()
+                if show_logs and i % 400 == 0:
+                    train_score = criterion(self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids']),
+                                            batch['labels'])
+                    print(train_score.item())
+                    train_scores.append(train_score.item())
+            if show_logs:
+                self.train_scores_mean.append(sum(train_scores) / len(train_scores))
+                train_scores = []
+                if self.print_test:
+                    test_pred_prob = torch.tensor([], device='cpu')
+                    with torch.no_grad():
+                        self.nnet.eval()
+                        for batch in test_batches:
+                            test_batch_pred_prob = self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids'])
+                            test_batch_pred_prob_cpu = test_batch_pred_prob.to('cpu')
+                            test_pred_prob = torch.cat((test_pred_prob, test_batch_pred_prob_cpu), 0)
+                        test_score = criterion(test_pred_prob, torch.tensor(y_test.values, device='cpu'))
+                        self.test_scores.append(test_score.item())
+                        test_pred_probas = F.softmax(test_pred_prob).detach().cpu().numpy()
+                        self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
+                        self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])
+                    self.print_metrics()
+                if self.early_stopping_check():
+                    break
+    def predict_proba(self, tokens, labels):
+        batches = self.batch_generator(tokens, labels)
+        pred_probas = torch.tensor([], device=self.device)
+        with torch.no_grad():
+            self.nnet.eval()
+            for batch in batches:
+                batch_prob = self.nnet(batch['tokens'], batch['mask'],
+                                       batch['token_type_ids'])
+                pred_probas = torch.cat((pred_probas, batch_prob))
+        return F.softmax(pred_probas).detach().cpu().numpy()
+    def predict(self, tokens, labels):
+        return np.argmax(self.predict_proba(tokens, labels), axis=1)
+    def save_model(self, filepath):
+        with open(filepath, 'wb') as file:
+            torch.save(self.nnet.state_dict(), file)
+    def load_model(self, filepath):
+        self.n_of_classes = 2
+        self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
+        self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
+        self.tokenizer = DistilBertTokenizer.from_pretrained(
+            'DeepPavlov/distilrubert-small-cased-conversational',
+            do_lower_case=True
+        )
+        self.nnet.load_state_dict(torch.load(filepath, map_location=self.device))
+class AdClassifier:
+    def __init__(self, weights_folder, dictionary_path):
+        self.batch_size = 16
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.pad_idx = 0
+        self.unk_idx = 1
+        with open(dictionary_path, 'rb') as file:
+            self.word_to_idx = pickle.load(file)
+        self.tokenizer = DistilBertTokenizer.from_pretrained(
+            'DeepPavlov/distilrubert-small-cased-conversational',
+            do_lower_case=True
+        )
+        nationality_nn_path = os.path.join(weights_folder, 'model_nationality.pt')
+        families_nn_path = os.path.join(weights_folder, 'model_families.pt')
+        sex_nn_path = os.path.join(weights_folder, 'model_sex.pt')
+        limit_nn_path = os.path.join(weights_folder, 'model_limit.pt')
+        self.nationality_clf = DBERTClassifier()
+        self.nationality_clf.load_model(nationality_nn_path)
+        self.families_clf = DBERTClassifier()
+        self.families_clf.load_model(families_nn_path)
+        self.sex_clf = DBERTClassifier()
+        self.sex_clf.load_model(sex_nn_path)
+        self.limit_clf = RNNClassifier(bidirectional=True)
+        self.limit_clf.load_model(limit_nn_path, amount_of_words=len(self.word_to_idx))
+    def index_tokens(self, tokens_string):
+        return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]
+    def fill_with_pads(self, tokens):
+        tokens = deepcopy(tokens)
+        max_len = 0
+        for tokens_string in tokens:
+            max_len = max(max_len, len(tokens_string))
+        for tokens_string in tokens:
+            for i in range(len(tokens_string), max_len):
+                tokens_string.append(self.pad_idx)
+        return tokens
+    def as_matrix(self, tokens):
+        tokens = deepcopy(tokens)
+        for j, s in enumerate(tokens):
+            tokens[j] = self.index_tokens(s)
+        tokens = self.fill_with_pads(tokens)
+        return tokens
+    def batch_generator(self, tokens):
+        for i in range(0, len(tokens), self.batch_size):
+            batch_tokens = tokens[i: i + self.batch_size]
+            batch_tokens = [' '.join(s) for s in batch_tokens]
+            inputs = self.tokenizer.batch_encode_plus(
+                batch_tokens,
+                add_special_tokens=True,
+                max_length=512,
+                truncation=True,
+                pad_to_max_length=True,
+                return_token_type_ids=True
+            )
+            batch_token_ids = torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long)
+            batch_mask = torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long)
+            batch_token_type_ids = torch.tensor(inputs['token_type_ids'], device=self.device, dtype=torch.long)
+            batch_tokens_rnn = tokens[i: i + self.batch_size]
+            batch_tokens_rnn_ids = torch.tensor(self.as_matrix(batch_tokens_rnn),
+                                                dtype=torch.int,
+                                                device=self.device)
+            batch = {
+                'tokens': batch_token_ids,
+                'mask': batch_mask,
+                'token_type_ids': batch_token_type_ids,
+                'tokens_rnn': batch_tokens_rnn_ids
+            }
+            yield batch
+    def predict_probas(self, tokens):
+        batches = self.batch_generator(tokens)
+        pred_probas = {'nationality': torch.tensor([], device=self.device),
+                       'families': torch.tensor([], device=self.device),
+                       'sex': torch.tensor([], device=self.device),
+                       'limit': torch.tensor([], device=self.device)}
+        batch_probas = dict()
+        with torch.no_grad():
+            self.nationality_clf.nnet.eval()
+            self.families_clf.nnet.eval()
+            self.sex_clf.nnet.eval()
+            self.limit_clf.nnet.eval()
+            for batch in batches:
+                batch_probas['nationality'] = self.nationality_clf.nnet(batch['tokens'], batch['mask'],
+                                                                        batch['token_type_ids'])
+                batch_probas['families'] = self.families_clf.nnet(batch['tokens'], batch['mask'],
+                                                                  batch['token_type_ids'])
+                batch_probas['sex'] = self.sex_clf.nnet(batch['tokens'], batch['mask'],
+                                                        batch['token_type_ids'])
+                batch_probas['limit'] = self.limit_clf.nnet(batch['tokens_rnn'])
+                for batch_prob_label in batch_probas:
+                    pred_probas[batch_prob_label] = torch.cat((pred_probas[batch_prob_label],
+                                                               batch_probas[batch_prob_label]))
+            for pred_prob_label in pred_probas:
+                pred_probas[pred_prob_label] = F.softmax(pred_probas[pred_prob_label]).\
+                    detach().cpu().numpy()
+        return pred_probas
+    def predict_labels(self, tokens):
+        predicted_probas = self.predict_probas(tokens)
+        predicted_labels = dict()
+        thresholds = {
+            'nationality': 0.75,
+            'families': 0.7,
+            'sex': 0.25,
+            'limit': 0.42
+        }
+        for label in predicted_probas:
+            predicted_labels[label] = predicted_probas[label][:, 1] >= thresholds[label]
+        return predicted_labels
+    def save_model(self, filepath):
+        with open(filepath, 'wb') as file:
+            torch.save(self, file)