Spaces:
Sleeping
Sleeping
| # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import re | |
| from typing import List | |
| from .char_convert import tranditional_to_simplified | |
| from .chronology import ( | |
| RE_DATE, | |
| RE_DATE2, | |
| RE_TIME, | |
| RE_TIME_RANGE, | |
| replace_date, | |
| replace_date2, | |
| replace_time, | |
| ) | |
| from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE | |
| from .num import ( | |
| RE_DECIMAL_NUM, | |
| RE_DEFAULT_NUM, | |
| RE_FRAC, | |
| RE_INTEGER, | |
| RE_NUMBER, | |
| RE_PERCENTAGE, | |
| RE_POSITIVE_QUANTIFIERS, | |
| RE_RANGE, | |
| replace_default_num, | |
| replace_frac, | |
| replace_negative_num, | |
| replace_number, | |
| replace_percentage, | |
| replace_positive_quantifier, | |
| replace_range, | |
| ) | |
| from .phonecode import ( | |
| RE_MOBILE_PHONE, | |
| RE_NATIONAL_UNIFORM_NUMBER, | |
| RE_TELEPHONE, | |
| replace_mobile, | |
| replace_phone, | |
| ) | |
| from .quantifier import RE_TEMPERATURE, replace_measure, replace_temperature | |
class TextNormalizer:
    """Split raw text into sentences and rewrite non-standard tokens.

    ``normalize`` is the entry point: it splits the text with ``_split`` and
    runs every resulting sentence through ``normalize_sentence``, which
    applies the pattern/replacer pairs imported at the top of this module in
    a fixed order and finishes with ``_post_replace``.
    """

    # Characters removed from Chinese text before sentence splitting.
    # Both halfwidth and fullwidth parentheses are listed so that text typed
    # with a CJK input method is cleaned the same way as ASCII text.
    _SPECIAL_CHARS = re.compile(r"[——《》【】<=>{}()()#&@“”^_|…\\]")

    # Single-pass translation table used by ``_post_replace``.  None of the
    # replacement strings contains a key character, so one ``str.translate``
    # pass yields the same result as replacing the keys one after another.
    # NOTE: "/" -> "每" and "~" -> "至" are intentionally not part of this
    # table; those replacements are disabled.
    _POST_REPLACE_TABLE = str.maketrans(
        {
            # circled digits
            "①": "一",
            "②": "二",
            "③": "三",
            "④": "四",
            "⑤": "五",
            "⑥": "六",
            "⑦": "七",
            "⑧": "八",
            "⑨": "九",
            "⑩": "十",
            # Greek letters
            "α": "阿尔法",
            "β": "贝塔",
            "γ": "伽玛",
            "Γ": "伽玛",
            "δ": "德尔塔",
            "Δ": "德尔塔",
            "ε": "艾普西龙",
            "ζ": "捷塔",
            "η": "依塔",
            "θ": "西塔",
            "Θ": "西塔",
            "ι": "艾欧塔",
            "κ": "喀帕",
            "λ": "拉姆达",
            "Λ": "拉姆达",
            "μ": "缪",
            "ν": "拗",
            "ξ": "克西",
            "Ξ": "克西",
            "ο": "欧米克伦",
            "π": "派",
            "Π": "派",
            "ρ": "肉",
            "ς": "西格玛",
            "Σ": "西格玛",
            "σ": "西格玛",
            "τ": "套",
            "υ": "宇普西龙",
            "φ": "服艾",
            "Φ": "服艾",
            "χ": "器",
            "ψ": "普赛",
            "Ψ": "普赛",
            "ω": "欧米伽",
            "Ω": "欧米伽",
        }
    )

    def __init__(self):
        # A sentence ends at one of these punctuation marks, optionally
        # followed by a closing quote.  Fullwidth and halfwidth variants are
        # both listed: splitting runs before any fullwidth-to-halfwidth
        # conversion, so fullwidth marks must be recognised here directly.
        self.SENTENCE_SPLITOR = re.compile(r"([::、,,;;。??!!][”’]?)")

    def _split(self, text: str, lang: str = "zh") -> List[str]:
        """Split long text into sentences at sentence-splitting punctuation.

        Args:
            text (str): The input text.
            lang (str): Language tag. Only ``"zh"`` enables the removal of
                spaces and special characters before splitting.

        Returns:
            List[str]: Sentences, each stripped of surrounding whitespace and
                keeping its terminating punctuation. Empty input yields
                ``[""]``.
        """
        # Only for pure Chinese here
        if lang == "zh":
            text = text.replace(" ", "")
            # Filter out special characters
            text = self._SPECIAL_CHARS.sub("", text)
        # Insert a newline after every sentence terminator, then cut there.
        text = self.SENTENCE_SPLITOR.sub(r"\1\n", text).strip()
        return [sentence.strip() for sentence in re.split(r"\n+", text)]

    def _post_replace(self, sentence: str) -> str:
        """Spell out circled digits and Greek letters in Chinese characters.

        Args:
            sentence (str): A sentence that has been through the other
                normalization steps.

        Returns:
            str: The sentence with every character listed in
                ``_POST_REPLACE_TABLE`` replaced by its Chinese reading.
        """
        return sentence.translate(self._POST_REPLACE_TABLE)

    def normalize_sentence(self, sentence: str) -> str:
        """Normalize a single sentence.

        The substitutions are applied strictly in the order written below;
        the more specific patterns (dates, times, measures, phone numbers)
        run before the generic number patterns, so do not reorder them.

        Args:
            sentence (str): One sentence, typically produced by ``_split``.

        Returns:
            str: The normalized sentence.
        """
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
        sentence = (
            sentence.translate(F2H_ASCII_LETTERS)
            .translate(F2H_DIGITS)
            .translate(F2H_SPACE)
        )

        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
        sentence = RE_DATE2.sub(replace_date2, sentence)

        # range first
        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
        sentence = RE_TIME.sub(replace_time, sentence)

        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
        sentence = replace_measure(sentence)
        sentence = RE_FRAC.sub(replace_frac, sentence)
        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)

        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

        sentence = RE_RANGE.sub(replace_range, sentence)
        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)

        return self._post_replace(sentence)

    def normalize(self, text: str, lang: str = "") -> List[str]:
        """Split ``text`` into sentences and normalize each one.

        Args:
            text (str): The input text.
            lang (str): Forwarded to ``_split``. With the default ``""`` the
                Chinese-only removal of spaces and special characters is
                skipped; pass ``"zh"`` to enable it.

        Returns:
            List[str]: The normalized sentences, in input order.
        """
        return [self.normalize_sentence(sent) for sent in self._split(text, lang)]