Spaces:
Running
on
Zero
Running
on
Zero
| import logging | |
| import re | |
| from http import HTTPStatus | |
| from typing import TypeVar | |
| import regex | |
| from datasets import Dataset, DatasetDict, Version, load_dataset | |
| from datasets.exceptions import DatasetNotFoundError | |
| from huggingface_hub.errors import HfHubHTTPError | |
| from tenacity import ( | |
| before_sleep_log, | |
| retry, | |
| retry_if_exception, | |
| stop_after_attempt, | |
| wait_fixed, | |
| ) | |
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Type variable bounded to Dataset or DatasetDict. Its users are not visible in
# this part of the file.
# pylint: disable-next=invalid-name
TDataset = TypeVar("TDataset", bound=Dataset | DatasetDict)
# HTTP statuses from the Hugging Face Hub that are treated as transient.
# NOTE(review): this set, and the attempt count / wait used in the decorator
# below, are conservative choices made during review -- they were not recovered
# from elsewhere in the project. Tune them to the project's needs.
_RETRYABLE_HUB_STATUSES = (
    HTTPStatus.TOO_MANY_REQUESTS,
    HTTPStatus.INTERNAL_SERVER_ERROR,
    HTTPStatus.BAD_GATEWAY,
    HTTPStatus.SERVICE_UNAVAILABLE,
    HTTPStatus.GATEWAY_TIMEOUT,
)


def _is_transient_hub_error(exc: BaseException) -> bool:
    """Return True when *exc* is a Hub HTTP error with a retryable status code."""
    if not isinstance(exc, HfHubHTTPError):
        return False
    # The error may carry no response object; getattr keeps that case a plain False.
    status = getattr(getattr(exc, "response", None), "status_code", None)
    return status in _RETRYABLE_HUB_STATUSES


@retry(
    retry=retry_if_exception(_is_transient_hub_error),
    stop=stop_after_attempt(3),
    wait=wait_fixed(5),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    # Re-raise the last underlying exception instead of tenacity's RetryError so
    # callers keep seeing the same exception types as a direct load_dataset call.
    reraise=True,
)
def load_dataset_retrying(
    path: str,
    revision: str | Version | None = None,
) -> DatasetDict:
    """Load a dataset via ``load_dataset``, retrying on transient Hub HTTP errors.

    Args:
        path: Dataset path or name, forwarded to ``load_dataset``.
        revision: Optional revision, forwarded to ``load_dataset``.

    Returns:
        Whatever ``load_dataset(path, revision=revision)`` returns.

    Raises:
        HfHubHTTPError: If a retryable Hub error persists after the final
            attempt, or immediately for a non-retryable status.
        Exception: Any other error from ``load_dataset`` propagates without retry.
    """
    return load_dataset(path, revision=revision)
# Negated character class: a match is any single character that is NOT in the
# allowed set listed inside the brackets (ASCII letters and digits, whitespace,
# punctuation, plus a number of non-ASCII symbols).
# SEE: https://www.compart.com/en/unicode/block/U+2070 for subscript letters
# NOTE(review): the non-ASCII part of this literal looks mis-encoded (mojibake),
# so the intended characters cannot be read from it. As written, `α-Ο` appears
# to be a reversed range, which re.compile would reject with "bad character
# range". Restore the literal from a correctly encoded (UTF-8) copy of this file
# rather than editing it by hand.
invalid_chars_regex = re.compile(
    r"[^A-Za-z0-9Ξ-Ωα-Οββββββββββββββ°ΒΉΒ²Β³β΄β΅βΆβ·βΈβΉββββββ ββββΓ\s!\"#$%&Β±β»'Β΄Κ»ββΚΌββ()*+βΊ,\-ββββββββ€./:;Β«<β€=β‘βββ₯>βΊΒ»β?@[\\\]^_`{|}~βββββββ‘ββ§ββ¨βββΊββΆβ¦]"  # noqa: RUF001
)
# Unicode property names (scripts / blocks) whose characters the pattern below
# should match.
# SEE: https://jrgraphix.net/r/Unicode/
_INVALID_LANGUAGE_PROPERTIES = {
    "Arabic",
    "Armenian",
    "Bengali",
    "Braille_Patterns",
    "Cyrillic",
    "Devanagari",
    "Ethiopic",
    "Georgian",
    "Gujarati",
    "Gurmukhi",
    "Han",
    "Hangul",
    "Hebrew",
    "Hiragana",
    "Kannada",
    "Katakana",
    "Khmer",
    "Latin_Extended_A",
    "Latin_Extended_Additional",
    "Latin_Extended_B",
    "Malayalam",
    "Myanmar",
    "Syriac",
    "Tamil",
    "Telugu",
    "Thaana",
    "Thai",
    "Tifinagh",
}

# One character class of the form [\p{Name1}\p{Name2}...]: a match is any single
# character that has one of the properties listed above.
invalid_languages_regex = regex.compile(
    "[" + "".join(rf"\p{{{name}}}" for name in _INVALID_LANGUAGE_PROPERTIES) + "]"
)
def contains_invalid(
    text: str, chars: bool = False, languages: bool = False, threshold: int = 1
) -> tuple[bool, list[str]]:
    """Report whether *text* reaches the match threshold for an enabled check.

    Args:
        text: String to scan.
        chars: When true, scan with ``invalid_chars_regex``.
        languages: When true, scan with ``invalid_languages_regex``.
        threshold: Minimum number of matches a single check must produce for
            the text to be flagged. Repeated matches each count, and counts
            are not combined across checks.

    Returns:
        ``(True, matches)`` with the sorted matches of the first enabled check
        that reaches the threshold (the character check runs before the
        language check); otherwise ``(False, [])``.
    """
    checks = (
        (chars, invalid_chars_regex),
        (languages, invalid_languages_regex),
    )
    for enabled, pattern in checks:
        if not enabled:
            continue
        found = pattern.findall(text)
        if len(found) >= threshold:
            return True, sorted(found)
    return False, []