Spaces:
Runtime error
Runtime error
| """ | |
| Split text to sentences. | |
| Modified from seg_text.seg_text.py, vtext sentence_splitter removed. | |
| Use sentence_splitter if supported, | |
| else use polyglot.text.Text | |
| !apt install libicu-dev | |
| !pip install pyicu pycld2 | |
| !pip install polyglot sentence_splitter | |
| Use vtext and fastlid to get rid of polyglot? | |
| from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer | |
| tok = UnicodeSentenceTokenizer() | |
| seg = tok.tokenize(''' Text ''') for langs not in LANG_S | |
| """ | |
| # pylint: disable=invalid-name | |
| import re | |
| from typing import List, Optional, Union | |
| import pysbd | |
| from fastlid import fastlid | |
| from loguru import logger | |
| from tqdm.auto import tqdm | |
| def _seg_text( | |
| text: str, | |
| lang: Optional[str] = None, | |
| ) -> List[str]: | |
| """ | |
| Split text to sentences. | |
| Switched to pysbd | |
| Args: | |
| ---- | |
| text: string to split | |
| lang: language, two-letter ISO (22 languages) | |
| Returns: | |
| ------- | |
| List of segmented sentences | |
| """ | |
| if lang is None: | |
| try: | |
| lang, _ = fastlid(text) | |
| except Exception as exc: | |
| logger.warning(" fastlid: %s, setting lang='en'", exc) | |
| lang = "en" | |
| if not text.strip(): | |
| return [] | |
| seg = pysbd.Segmenter(language=lang, clean=True) | |
| try: | |
| # _ = tok.tokenize(text) | |
| _ = seg.segment(text) | |
| except Exception as exc: | |
| logger.exception(f"pysbd.Segmenter, {exc=}") | |
| raise | |
| return _ | |
def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Split a text or a list of texts into sentences.

    Arguments:
        lst: text or text list
        lang: optional lang code, passed to _seg_text for every entry
        maxlines: (default 1000), a tqdm progress bar is shown when the
            list has more than maxlines entries; set to <1 or a large
            number to turn it off
        extra: optional regex; a newline is inserted after every match
            (``re.sub(rf"({extra})", r"\\1\\n", text)``) before segmenting

    Returns:
        flat list of the sentences of all entries, in input order.
    """
    if isinstance(lst, str):
        lst = [lst]
    texts = list(lst)

    if extra:
        # Compile once; the outer group lets the match be re-emitted
        # followed by a newline.
        marker = re.compile(rf"({extra})")
        texts = [marker.sub(r"\1\n", elm) for elm in texts]

    # Progress bar only for long inputs; maxlines < 1 disables it.
    show_progress = 1 <= maxlines < len(texts)
    entries = tqdm(texts) if show_progress else texts

    res: List[str] = []
    for elm in entries:
        res.extend(_seg_text(elm, lang=lang))
    return res