Spaces:
Running
on
Zero
Running
on
Zero
| # Copyright (c) 2024 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import re | |
| """ | |
| Text cleaning utilities for Korean input. | |
| """ | |
# Hangul readings for English tokens that commonly appear inside Korean text.
# Keys are upper-case ASCII words/letters; normalize_english() looks up each
# run of ASCII letters here by exact (case-sensitive) match.
#
# The values must be real Hangul. They had been stored as mis-decoded UTF-8
# (Thai-looking mojibake), which made distinct entries such as "UP", "M" and
# "U" collapse to the same garbage string.
english_dictionary = {
    # Whole words and common abbreviations.
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
    # Single letters, read by their Korean letter names.
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}
def normalize(text):
    """Clean one utterance before it is passed to the tokenizer.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. delete CJK radicals and Han ideographs;
      3. replace known upper-case English tokens with Hangul
         (``normalize_english``);
      4. lower-case the result.

    Lower-casing happens after step 3 on purpose: the English lookup is
    case-sensitive and its keys are upper-case.

    Args:
        text: Input string.

    Returns:
        The cleaned string. Interior whitespace is not collapsed, so removing
        ideographs can leave consecutive spaces.
    """
    text = text.strip()
    # The character class is spelled with \u escapes so it cannot be damaged
    # by a wrong file encoding (the literal-character form had decayed into
    # Thai letters plus a space, which stripped spaces instead of ideographs).
    # The last three ranges lie inside \u4e00-\u9fc3 and are kept only to
    # mirror the original pattern one-to-one.
    text = re.sub(
        r"[\u2e80-\u2e99\u2e9b-\u2ef3\u2f00-\u2fd5\u3005\u3007\u3021-\u3029"
        r"\u3038-\u303a\u303b\u3400-\u4db5\u4e00-\u9fc3"
        r"\u8c48-\u9db4\u4fae-\u983b\u4e26-\u9f8e]",
        "",
        text,
    )
    text = normalize_english(text)
    text = text.lower()
    return text
def normalize_english(text):
    """Replace English tokens found in ``english_dictionary`` with Hangul.

    Each maximal run of ASCII letters is looked up verbatim, so the match is
    case-sensitive. Runs without a dictionary entry are left as they are.

    Args:
        text: Input string.

    Returns:
        The string with every known token substituted.
    """

    def _lookup(match):
        token = match.group()
        # Fall back to the token itself when there is no entry for it.
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", _lookup, text)
def korean_to_ipa(text, text_tokenizer):
    """Normalize Korean text and run it through ``text_tokenizer``.

    Args:
        text: A single string, or an iterable of strings for batch use.
        text_tokenizer: Callable applied to the normalized string (single
            input) or to a list of normalized strings (batch input).

    Returns:
        Whatever ``text_tokenizer`` returns for the normalized input.
    """
    if isinstance(text, str):
        return text_tokenizer(normalize(text))

    # Build a fresh list instead of overwriting the caller's sequence in
    # place: the caller's data stays untouched, and tuples or other
    # iterables are accepted as well.
    normalized = [normalize(item) for item in text]
    return text_tokenizer(normalized)