Spaces:
Runtime error
Runtime error
| from typing import List, Union | |
| from relik.inference.data.objects import Word | |
class BaseTokenizer:
    """
    Common interface for tokenizers: objects that split strings of text into
    single words, optionally adding POS tags and performing lemmatization.

    Only :meth:`tokenize_batch` has a working implementation here;
    :meth:`__call__` and :meth:`tokenize` raise :obj:`NotImplementedError`
    and are meant to be overridden.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Split the given input into single words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The text to process: one string, a batch of strings, or
                input that has already been split into words.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.
            **kwargs:
                Additional keyword arguments; the base class does not use them.

        Returns:
            :obj:`List[List[Word]]`: The words found in the input.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def tokenize(self, text: str) -> List[Word]:
        """
        Split a single text into words.

        Args:
            text (:obj:`str`):
                The text to split.

        Returns:
            :obj:`List[Word]`: The words found in ``text``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Split every text of a batch into words.

        Each element of ``texts`` is passed to :meth:`tokenize`, and the
        results are collected in the same order as the inputs.

        Args:
            texts (:obj:`List[str]`):
                The batch of texts to split.

        Returns:
            :obj:`List[List[Word]]`: One list of words per input text.
        """
        return list(map(self.tokenize, texts))
def check_is_batched(
    texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
):
    """
    Tell whether the input is a batch of samples or a single sample.

    Args:
        texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The input to inspect.
        is_split_into_words (:obj:`bool`):
            Whether the input is already split into words. When true, a
            list or tuple only counts as a batch if its first element is
            itself a list or tuple.

    Returns:
        :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
    """
    # Anything that is not a list or tuple (e.g. a plain string) is one sample.
    if not isinstance(texts, (list, tuple)):
        return False

    # Raw text: a list or tuple is always a batch of strings.
    if not is_split_into_words:
        return True

    # Pre-split input: a flat sequence is one sample made of words, so look at
    # the first element. An empty sequence is treated as a single sample.
    return bool(texts) and isinstance(texts[0], (list, tuple))