import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return associated unicode ranges in a single byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
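
# Illustrative sketch: for a Cyrillic code page such as "cp1251", bytes
# 0x40-0x7E decode to ASCII while 0x80-0xFF decode mostly to Cyrillic letters,
# so both ranges are expected to clear the 15% threshold:
#   encoding_unicode_range("cp1251")  # -> ['Basic Latin', 'Cyrillic'] (expected)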


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
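
# Illustrative: the exact list depends on the FREQUENCIES table, but a call
# like unicode_range_languages("Cyrillic") is expected to return every
# language whose frequent characters live in the Cyrillic range,
# e.g. ['Russian', 'Ukrainian', 'Bulgarian', ...].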


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
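
# Illustrative: "cp1252" exposes no non-Latin primary range, so it falls back
# to the generic bucket, while "cp1251" resolves through its Cyrillic range:
#   encoding_languages("cp1252")  # -> ['Latin Based'] (expected)
#   encoding_languages("cp1251")  # -> same as unicode_range_languages("Cyrillic")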


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
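
# These follow directly from the prefix/name checks above:
#   mb_encoding_languages("shift_jis")  # -> ['Japanese']
#   mb_encoding_languages("gb18030")    # -> ['Chinese']
#   mb_encoding_languages("utf_8")      # -> []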


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine two main aspects of a supported language: whether it contains accents and whether it is pure Latin.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
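
# Illustrative (results depend on the FREQUENCIES table):
#   get_target_features("French")   # -> (True, True)   accented, pure Latin (expected)
#   get_target_features("Russian")  # -> (False, False) no accents, non-Latin (expected)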


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
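
# Illustrative: feeding the most frequent letters of a Latin-script text is
# expected to surface Latin-based languages, ordered by match ratio:
#   alphabet_languages(["e", "a", "t", "s", "r", "n", "i", "o"])
#   # -> e.g. ['English', 'French', ...] depending on the FREQUENCIES table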


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning a close match counts as 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
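
# Illustrative: an observed ranking that agrees with FREQUENCIES["English"]
# is expected to score near 1.0, while a shuffled or foreign ranking scores low:
#   characters_popularity_compare("English", ["e", "t", "a", "o", "i", "n"])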


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    E.g. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
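
# Illustrative: Latin and Hebrew letters are expected to land in separate
# layers (lower-cased, non-alphabetic characters dropped):
#   alpha_unicode_split("Hello שלום")  # -> ['hello', 'שלום'] (expected)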


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
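
# Ratios for the same language are averaged across chunks; this follows
# directly from the implementation above:
#   merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#   # -> [('English', 0.8), ('French', 0.5)]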


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and removes the em-dash from it.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
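
# The alternative "English—" entry collapses into "English", keeping the best
# ratio of the two; this follows directly from the implementation above:
#   filter_alt_coherence_matches([("English", 0.88), ("English—", 0.94)])
#   # -> [('English', 0.94)]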


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
    A layer = character extraction by alphabets/ranges.
    """
    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
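
# Illustrative: exact ratios depend on the FREQUENCIES table, but a French
# sentence longer than TOO_SMALL_SEQUENCE alphabetic characters is expected
# to rank "French" first:
#   coherence_ratio("Des mots accentués, voilà une phrase française entière.")
#   # -> [('French', ...), ('English', ...)] (expected ordering)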