from collections import Counter

from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
# from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st

# Global variables
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
DESCRIPTION = "AI模型輔助語言學習:華語"
TOK_SEP = " | "
PUNCT_SYM = ["PUNCT", "SYM"]
MODEL_NAME = "zh_core_web_sm"

# External API callers
def moedict_caller(word):
    st.write(f"### {word}")
    req = requests.get(f"https://www.moedict.tw/uni/{word}.json")
    try:
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        # Some entries lack these fields; add placeholder columns so the
        # selection below never raises a KeyError
        for col in ['example', 'synonyms', 'antonyms']:
            if col not in df.columns:
                df[col] = '---'
        cols = ['def', 'example', 'synonyms', 'antonyms']
        df = df[cols]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except Exception:
        st.write("查無結果")

# Custom tokenizer class
class JiebaTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = jieba.cut(text)  # returns a generator
        tokens = list(words)  # convert the generator to a list
        spaces = [False] * len(tokens)  # Chinese text has no inter-token spaces
        doc = Doc(self.vocab, words=tokens, spaces=spaces)
        return doc
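
# A minimal sketch of what the tokenizer produces, assuming jieba's default
# dictionary (segmentation may differ with a custom or traditional-Chinese
# dictionary):
#     list(jieba.cut("我準備獨自去嘗試"))  # e.g. ['我', '準備', '獨自', '去', '嘗試']
# Wrapping the tokens in a spacy.tokens.Doc lets the rest of the pipeline
# (tagger, parser, NER) run on jieba's segmentation.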

# Utility functions
def filter_tokens(doc):
    clean_tokens = [tok for tok in doc if tok.pos_ not in PUNCT_SYM]
    clean_tokens = [
        tok for tok in clean_tokens
        if not tok.like_email
        and not tok.like_num
        and not tok.like_url
        and not tok.is_space
    ]
    return clean_tokens
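
# For the default text above, this should drop tokens such as the full-width
# punctuation 「。」 (POS tag PUNCT) and any number words that spaCy's Chinese
# like_num flags (e.g. 「八」), keeping only content tokens.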

def get_vocab(doc):
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_tokens_text = [
        tok.text for tok in clean_tokens
        if not alphanum_pattern.search(tok.text)
    ]
    vocab = list(set(clean_tokens_text))
    return vocab

def get_counter(doc):
    clean_tokens = filter_tokens(doc)
    tokens = [token.text for token in clean_tokens]
    counter = Counter(tokens)
    return counter
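
# Counter exposes frequencies directly; a quick check in a REPL:
#     Counter(['我', '我', '綿羊']).most_common(1)  # [('我', 2)]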

def get_freq_fig(doc):
    counter = get_counter(doc)
    counter_df = (
        pd.DataFrame.from_dict(counter, orient='index')
        .reset_index()
        .rename(columns={
            0: 'count',
            'index': 'word',
        })
        .sort_values(by='count', ascending=False)
    )
    fig = px.bar(counter_df, x='word', y='count')
    return fig

def get_level_pie(tocfl_result):
    level = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(tocfl_result,
                 values=level.values,
                 names=level.index,
                 title='詞彙分級圓餅圖')
    return fig

def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    table = pd.read_csv(filename)
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    table = table[cols]
    return table
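
# Assumed layout of tocfl_wordlist.csv (the file is not part of this listing):
# a CSV whose header contains at least the five columns selected above, e.g.
#     詞彙,漢語拼音,注音,任務領域,詞條分級
# Any extra columns are dropped by the selection.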

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
# nlp.add_pipe("yake")  # keyword extraction
# nlp.add_pipe("merge_entities")  # Merge entity spans to tokens

# Select a tokenizer if the Chinese model is chosen
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)
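
# Note: assigning to nlp.tokenizer swaps only the segmentation step; the
# statistical components of zh_core_web_sm (tagger, parser, NER) still run
# downstream, although they were trained on a different segmentation, so
# their accuracy on jieba tokens is not guaranteed.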

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False)  # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        sounds = pinyins
        if pronunciation == "注音符號":
            zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
            sounds = zhuyins
        elif pronunciation == "國際音標":
            ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
            sounds = ipas
        display = []
        for tok_text, sound in zip(tokens_text, sounds):
            display.append(f"{tok_text} [{sound}]")
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)
        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            moedict_caller(w)

if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    # Cap the default so very short texts don't exceed the slider range
    topK = st.slider('請選擇前K個高頻詞', 1, len(counter), min(5, len(counter)))
    most_common = counter.most_common(topK)
    st.write(most_common)
    st.markdown("---")
    fig = get_freq_fig(doc)
    st.plotly_chart(fig, use_container_width=True)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")