Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,505 Bytes
4c346eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import logging
import re
from http import HTTPStatus
from typing import TypeVar
import regex
from datasets import Dataset, DatasetDict, Version, load_dataset
from datasets.exceptions import DatasetNotFoundError
from huggingface_hub.errors import HfHubHTTPError
from tenacity import (
before_sleep_log,
retry,
retry_if_exception,
stop_after_attempt,
wait_fixed,
)
# Module-level logger; it is also passed to tenacity's ``before_sleep_log``
# further down so that every retry wait is reported through it.
logger = logging.getLogger(__name__)

# Generic placeholder restricted (via ``bound``) to ``Dataset`` or
# ``DatasetDict``. It is not referenced in the visible part of this file.
# pylint: disable-next=invalid-name
TDataset = TypeVar("TDataset", bound=Dataset | DatasetDict)
def _is_transient_hub_error(exc: BaseException) -> bool:
    """Return True when ``exc`` is a Hub failure that is worth retrying."""
    # On 4/14/2025 James kept seeing on the g5 server cluster:
    # > datasets.exceptions.DatasetNotFoundError:
    # > Dataset 'org/repo' doesn't exist on the Hub or cannot be accessed.
    if isinstance(exc, DatasetNotFoundError):
        return True
    if not isinstance(exc, HfHubHTTPError):
        return False
    # On 2/11/2025 James kept seeing on the g3 server cluster:
    # > huggingface_hub.errors.HfHubHTTPError: 504 Server Error: Gateway Time-out for
    # > url: https://huggingface.co/api/datasets/org/repo/paths-info/abc123
    # And on 3/14 James saw this on the g3 server cluster:
    # > huggingface_hub.errors.HfHubHTTPError: 502 Server Error: Bad Gateway for
    # > url: https://huggingface.co/api/datasets/org/repo/paths-info/abc123
    #
    # The exception may carry no response object; look the status up
    # defensively so this predicate never raises an AttributeError that would
    # hide the original Hub error from the caller.
    response = getattr(exc, "response", None)
    status_code = getattr(response, "status_code", None)
    return status_code in {
        HTTPStatus.BAD_GATEWAY.value,
        HTTPStatus.GATEWAY_TIMEOUT.value,
    }


@retry(
    retry=retry_if_exception(_is_transient_hub_error),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    stop=stop_after_attempt(5),
    wait=wait_fixed(5),
)
def load_dataset_retrying(
    path: str,
    revision: str | Version | None = None,
) -> DatasetDict:
    """Load a dataset via ``load_dataset``, retrying transient Hub failures.

    The call is attempted up to 5 times with a fixed 5 second wait between
    attempts, and a WARNING is logged before each wait. Only errors accepted
    by ``_is_transient_hub_error`` (HTTP 502/504 from the Hub, or
    ``DatasetNotFoundError``) trigger a retry; any other exception propagates
    immediately.

    Args:
        path: Dataset path or repository id, forwarded to ``load_dataset``.
        revision: Optional revision, forwarded to ``load_dataset``.

    Returns:
        Whatever ``load_dataset(path, revision=revision)`` returns.

    Raises:
        tenacity.RetryError: When every attempt failed with a retryable error
            (``reraise`` is not set, so tenacity's default wrapping applies).
    """
    return load_dataset(path, revision=revision)
# Negated character class: matches any single character that is NOT one of the
# explicitly allowed ones (ASCII letters and digits, whitespace via ``\s``,
# ASCII punctuation, plus a list of non-ASCII symbols). Used by
# ``contains_invalid`` when ``chars=True``.
# SEE: https://www.compart.com/en/unicode/block/U+2070 for subscript letters
# NOTE(review): in this copy of the file the non-ASCII members of the class
# look mis-encoded (UTF-8 bytes rendered through a single-byte code page) and
# the raw string is broken across two lines. Presumably the original lists
# Greek letters, super/subscripts and typographic punctuation -- restore the
# literal from the UTF-8 original before relying on it.
invalid_chars_regex = re.compile(
r"[^A-Za-z0-9Ξ-Ωα-Οββββββββββββββ°ΒΉΒ²Β³β΄β΅βΆβ·βΈβΉββββββ
ββββΓ\s!\"#$%&Β±β»'Β΄Κ»ββΚΌββ()*+βΊ,\-ββββββββ€./:;Β«<β€=β‘βββ₯>βΊΒ»β?@[\\\]^_`{|}~βββββββ‘ββ§ββ¨βββΊββΆβ¦]" # noqa: RUF001
)
# Single character class made of one ``\p{...}`` Unicode property per name
# below; it matches any one character belonging to any of them. Used by
# ``contains_invalid`` when ``languages=True``.
# SEE: https://jrgraphix.net/r/Unicode/
invalid_languages_regex = regex.compile(
    "["
    + "".join(
        rf"\p{{{property_name}}}"
        for property_name in {
            "Arabic",
            "Armenian",
            "Bengali",
            "Braille_Patterns",
            "Cyrillic",
            "Devanagari",
            "Ethiopic",
            "Georgian",
            "Gujarati",
            "Gurmukhi",
            "Han",
            "Hangul",
            "Hebrew",
            "Hiragana",
            "Kannada",
            "Katakana",
            "Khmer",
            "Latin_Extended_A",
            "Latin_Extended_Additional",
            "Latin_Extended_B",
            "Malayalam",
            "Myanmar",
            "Syriac",
            "Tamil",
            "Telugu",
            "Thaana",
            "Thai",
            "Tifinagh",
        }
    )
    + "]"
)
def contains_invalid(
    text: str, chars: bool = False, languages: bool = False, threshold: int = 1
) -> tuple[bool, list[str]]:
    """Report whether ``text`` trips the character and/or language filters.

    Each enabled filter is evaluated on its own, characters first and then
    languages. The first filter whose number of matches reaches ``threshold``
    decides the result.

    Args:
        text: String to inspect.
        chars: Apply ``invalid_chars_regex`` when true.
        languages: Apply ``invalid_languages_regex`` when true.
        threshold: Minimum number of matches a single filter must produce for
            the text to be flagged.

    Returns:
        ``(True, matches)`` with the offending matches in sorted order
        (duplicates kept) for the first filter that reaches the threshold,
        otherwise ``(False, [])``.
    """
    filters = (
        (chars, invalid_chars_regex),
        (languages, invalid_languages_regex),
    )
    for enabled, pattern in filters:
        if not enabled:
            continue
        hits = pattern.findall(text)
        if len(hits) >= threshold:
            return True, sorted(hits)
    return False, []
|