Spaces:
Runtime error
Runtime error
| import re | |
| from typing import List, Union | |
| from overrides import overrides | |
| from relik.inference.data.objects import Word | |
| from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer | |
class RegexTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that turns text into :obj:`Word` objects by scanning it
    with a single regular expression.
    """

    def __init__(self):
        super().__init__()
        # The alternatives are tried left to right at every position:
        #   \w+        a run of word characters
        #   \$[\d\.]+  a dollar sign followed by digits and/or dots (e.g. "$3.50")
        #   \S+        any other run of non-whitespace characters
        # Whitespace is never matched, so it only separates tokens.
        # NOTE: because of the final alternative, a token that starts with a
        # non-word character (other than a dollar amount) extends up to the
        # next whitespace, e.g. "(hello" is kept as one token.
        self._regex = re.compile(
            r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL
        )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
        """
        Tokenize a single sample or a batch of samples into words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The text to tokenize. It can be a single string, a batch of
                strings, or pre-tokenized input.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                Forwarded to :obj:`check_is_batched` together with ``texts`` to
                decide whether the input is a batch. It is not used for
                anything else in this method.
            **kwargs:
                Accepted for interface compatibility; not used by this method.

        Returns:
            :obj:`List[List[Word]]`: For batched input, whatever
            :obj:`tokenize_batch` returns. For a single sample, the flat
            :obj:`List[Word]` produced by :obj:`tokenize` (i.e. not wrapped in
            an outer list, despite the annotation).

        Example::

            >>> regex_tokenizer = RegexTokenizer()
            >>> regex_tokenizer("Mary sold the car to John.")
        """
        # `check_is_batched` and `tokenize_batch` are not defined in this
        # class; presumably they are inherited from `BaseTokenizer`.
        if self.check_is_batched(texts, is_split_into_words):
            return self.tokenize_batch(texts)
        return self.tokenize(texts)

    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        """
        Tokenize one sample into a list of :obj:`Word` objects.

        Args:
            text (:obj:`str`, :obj:`List[str]`):
                The sample to tokenize. A list of strings is first joined with
                single spaces and the resulting string is then tokenized, so
                the character offsets refer to that joined string.

        Returns:
            :obj:`List[Word]`: One :obj:`Word` per regex match, built from the
            matched text, its position in the sequence of matches, and its
            start/end character offsets.

        Raises:
            :obj:`ValueError`: If ``text`` is neither a :obj:`str` nor a
            :obj:`list`.
        """
        if isinstance(text, list):
            text = " ".join(text)
        elif not isinstance(text, str):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )

        return [
            Word(
                match.group(0),
                index,
                start_char=match.start(),
                end_char=match.end(),
            )
            for index, match in enumerate(self._regex.finditer(text))
        ]