Spaces:
Build error
Build error
| # Let's import a few requirements | |
| import torch | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| import numpy | |
class TransformerVectorizer:
    """Turn texts into fixed-size vectors using a pretrained transformer.

    Each text is tokenized, passed through the sequence-classification
    model, and represented by the mean of the last hidden layer over its
    (non-padding) tokens.
    """

    # Name of the pretrained checkpoint used for both tokenizer and model.
    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

    def __init__(self):
        """Load the tokenizer and the pretrained model, and pick a device."""
        # Load the tokenizer (converts text to tokens)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        # Load the pre-trained model
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            self.MODEL_NAME
        )
        # Use the first GPU when one is available, otherwise the CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def text_to_tensor(
        self,
        texts: list,
        batch_size: int = 50,
    ) -> numpy.ndarray:
        """Transform a list of texts into their learned representation.

        Args:
            texts (list): List of texts to be transformed.
            batch_size (int): Number of texts sent through the model at
                once. Lower it if the hardware runs out of memory.

        Returns:
            numpy.ndarray: One row per text, holding the mean of the last
            hidden layer over that text's tokens. The result is also stored
            in ``self.encodings``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        texts = list(texts)
        if not texts:
            # Nothing to encode: numpy.concatenate would fail on an empty
            # list, so return an empty matrix with the model's hidden width.
            hidden_size = self.transformer_model.config.hidden_size
            self.encodings = numpy.empty((0, hidden_size), dtype=numpy.float32)
            return self.encodings

        # Tokenize all the input text. Padding is required so that texts of
        # different lengths can be stacked into a single tensor.
        encoded = self.tokenizer(texts, padding=True, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Split the data into batches of at most `batch_size` examples.
        id_batches = torch.split(input_ids, batch_size)
        mask_batches = torch.split(attention_mask, batch_size)

        # Send the model to the device
        model = self.transformer_model.to(self.device)

        pooled_batches = []
        # Inference only: skip gradient tracking to save memory.
        with torch.no_grad():
            for batch_ids, batch_mask in zip(id_batches, mask_batches):
                batch_ids = batch_ids.to(self.device)
                batch_mask = batch_mask.to(self.device)
                outputs = model(
                    input_ids=batch_ids,
                    attention_mask=batch_mask,
                    output_hidden_states=True,
                    return_dict=True,
                )
                # Only keep the last hidden layer state.
                last_hidden = outputs.hidden_states[-1]
                # Average over the tokens axis to get a text-level
                # representation, ignoring padding positions so that short
                # texts are not diluted by pad tokens.
                weights = batch_mask.unsqueeze(-1).to(last_hidden.dtype)
                token_sum = (last_hidden * weights).sum(dim=1)
                token_count = weights.sum(dim=1).clamp(min=1)
                pooled = token_sum / token_count
                pooled_batches.append(pooled.cpu().numpy())

        self.encodings = numpy.concatenate(pooled_batches, axis=0)
        return self.encodings

    def transform(self, texts: list):
        """Alias for :meth:`text_to_tensor` with the default batch size."""
        return self.text_to_tensor(texts)