# DiffRhythm2 / g2p / g2p / korean.py
# (File-viewer page residue: uploader ASLP-lab, commit "init" 010341e, verified, 1.71 kB.)
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
"""
Text clean time
"""
# Hangul readings for English words, acronyms and single letters.
# Keys are upper-case; ``normalize_english`` looks up each whole run of ASCII
# letters here with an exact (case-sensitive) match.
# The Hangul values were restored from mis-decoded (mojibake) literals.
english_dictionary = {
    # Words and acronyms.
    "KOREA": "코리아",
    "IDOL": "아이돌",
    "IT": "아이티",
    "IQ": "아이큐",
    "UP": "업",
    "DOWN": "다운",
    "PC": "피씨",
    "CCTV": "씨씨티비",
    "SNS": "에스엔에스",
    "AI": "에이아이",
    "CEO": "씨이오",
    # Single letters.
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "제트",
}
def normalize(text):
    """Clean a raw text string before it is passed to the tokenizer.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. delete CJK ideographs (Hanja) and related radicals/numerals;
      3. replace known English words/letters via ``normalize_english``;
      4. lower-case whatever remains.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()
    # The ranges are written as \u escapes so the pattern does not depend on
    # how the file is encoded or displayed; a mis-decoded literal here turns
    # into reversed ranges and makes ``re`` raise "bad character range".
    text = re.sub(
        "["
        "\u2e80-\u2e99\u2e9b-\u2ef3"  # CJK Radicals Supplement
        "\u2f00-\u2fd5"  # Kangxi Radicals
        "\u3005\u3007"  # ideographic iteration mark, ideographic zero
        "\u3021-\u3029\u3038-\u303a"  # Hangzhou numerals
        "\u303b"  # vertical ideographic iteration mark
        "\u3400-\u4db5"  # CJK Unified Ideographs Extension A
        "\u4e00-\u9fc3"  # CJK Unified Ideographs
        # CJK Compatibility Ideographs. The literal in the file showed these
        # endpoints NFC-folded into unified ideographs (already covered by
        # the range above); the compatibility code points are restored here.
        "\uf900-\ufa2d\ufa30-\ufa6a\ufa70-\ufad9"
        "]",
        "",
        text,
    )
    # Must run before lower(): the English lookup table uses upper-case keys.
    text = normalize_english(text)
    return text.lower()
def normalize_english(text):
    """Replace English words that have an entry in ``english_dictionary``.

    Each maximal run of ASCII letters is looked up as-is (case-sensitive).
    Runs with an entry are replaced by the mapped value; all other runs are
    left untouched.

    Args:
        text: The input string.

    Returns:
        The string with known English words substituted.
    """

    def _lookup(match):
        token = match.group()
        # Fall back to the matched text itself when there is no entry.
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", _lookup, text)
def korean_to_ipa(text, text_tokenizer):
    """Normalize Korean text and run it through ``text_tokenizer``.

    Args:
        text: Either a single string or an iterable of strings (a batch).
        text_tokenizer: Callable applied to the normalized input. It receives
            a string when ``text`` is a string, otherwise a list of strings.

    Returns:
        Whatever ``text_tokenizer`` returns for the normalized input.

    Note:
        A batch is normalized into a new list; the caller's sequence is not
        modified, so immutable sequences such as tuples are accepted too.
    """
    if isinstance(text, str):
        return text_tokenizer(normalize(text))
    # Build a fresh list rather than overwriting the caller's items in place.
    return text_tokenizer([normalize(item) for item in text])