| 
							 | 
						""" | 
					
					
						
						| 
							 | 
						日语、韩语 等 | 
					
					
						
						| 
							 | 
						https://www.cnblogs.com/luoganttcc/p/16605150.html | 
					
					
						
						| 
							 | 
						https://zhuanlan.zhihu.com/p/618684374 | 
					
					
						
						| 
							 | 
						- https://zhuanlan.zhihu.com/p/84625185 赞 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						## 相关包 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						import opencc | 
					
					
						
						| 
							 | 
						import langid | 
					
					
						
						| 
							 | 
						import langdetect | 
					
					
						
						| 
							 | 
						https://github.com/pemistahl/lingua-py | 
					
					
						
						| 
							 | 
						  - 原理: | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						""" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						from zhon.hanzi import punctuation as zh_punc | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def is_zh_char(uchar): | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48 | 
					
					
						
						| 
							 | 
						    re.compile("([\u4E00-\u9FD5]+)", re.U) | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    return u'\u4e00' <= uchar <= u'\u9fa5' | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def has_zh_punc(text): | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    是否包含中文标点 | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    return any(ch in zh_punc for ch in text) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def has_zh(text): | 
					
					
						
						| 
							 | 
						    """ contains Chinese characters """ | 
					
					
						
						| 
							 | 
						    return any(is_zh_char(ch) for ch in text) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def get_zh_count(text): | 
					
					
						
						| 
							 | 
						    return sum([is_zh_char(uchar) for uchar in text]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def is_all_zh(text): | 
					
					
						
						| 
							 | 
						    return all(is_zh_char(char) for char in text) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def is_all_en(text): | 
					
					
						
						| 
							 | 
						    return text.encode('utf-8').isalpha() | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						ranges = [ | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},   | 
					
					
						
						| 
							 | 
						    {'from': ord(u'\u3040'), 'to': ord(u'\u309f')},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},   | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")}, | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")}, | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")}, | 
					
					
						
						| 
							 | 
						    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")}   | 
					
					
						
						| 
							 | 
						] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def is_cjk(char): | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    CJK(Chinese、Japanese、Korean) | 
					
					
						
						| 
							 | 
						    日语中有很多汉字,日本汉字超过2万。 | 
					
					
						
						| 
							 | 
						    韩语有谚文,超过50个,有朝鲜汉字超过2万。 | 
					
					
						
						| 
							 | 
						    """ | 
					
					
						
						| 
							 | 
						    return any([range["from"] <= ord(char) <= range["to"] for range in ranges]) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def cjk_substrings(string): | 
					
					
						
						| 
							 | 
						    i = 0 | 
					
					
						
						| 
							 | 
						    while i < len(string): | 
					
					
						
						| 
							 | 
						        if is_cjk(string[i]): | 
					
					
						
						| 
							 | 
						            start = i | 
					
					
						
						| 
							 | 
						            while is_cjk(string[i]): i += 1 | 
					
					
						
						| 
							 | 
						            yield string[start:i] | 
					
					
						
						| 
							 | 
						        i += 1 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def aa(): | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    for idx, item in enumerate(ranges): | 
					
					
						
						| 
							 | 
						        print(idx, end=": ") | 
					
					
						
						| 
							 | 
						        for j in range(10): | 
					
					
						
						| 
							 | 
						            print(chr(item["from"] + j), end=", ") | 
					
					
						
						| 
							 | 
						        print("") | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def is_traditional_chinese(text): | 
					
					
						
						| 
							 | 
						    cc = opencc.OpenCC('t2s') | 
					
					
						
						| 
							 | 
						    converted_text = cc.convert(text) | 
					
					
						
						| 
							 | 
						    if converted_text != text: | 
					
					
						
						| 
							 | 
						        return True | 
					
					
						
						| 
							 | 
						    return False | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						
 |