from collections import Counter

from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
# from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st

# Global variables
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
DESCRIPTION = "AI模型輔助語言學習:華語"
TOK_SEP = " | "
PUNCT_SYM = ["PUNCT", "SYM"]
MODEL_NAME = "zh_core_web_sm"

# External API callers
def moedict_caller(word):
    st.write(f"### {word}")
    req = requests.get(f"https://www.moedict.tw/uni/{word}.json")
    try:
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        # Some entries lack these fields; add placeholder columns so the
        # selection below never raises a KeyError
        for col in ['example', 'synonyms', 'antonyms']:
            if col not in df.columns:
                df[col] = '---'
        cols = ['def', 'example', 'synonyms', 'antonyms']
        df = df[cols]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except Exception:
        st.write("查無結果")

# Custom tokenizer class
class JiebaTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = jieba.cut(text)  # returns a generator
        tokens = list(words)  # convert the generator to a list
        spaces = [False] * len(tokens)  # Chinese text has no inter-token spaces
        doc = Doc(self.vocab, words=tokens, spaces=spaces)
        return doc
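
# A minimal sketch of what the tokenizer produces, assuming jieba's default
# dictionary (segmentation may differ with a custom or traditional-Chinese
# dictionary):
#     list(jieba.cut("我準備獨自去嘗試"))  # e.g. ['我', '準備', '獨自', '去', '嘗試']
# Wrapping the tokens in a spacy.tokens.Doc lets the rest of the pipeline
# (tagger, parser, NER) run on jieba's segmentation.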

# Utility functions
def filter_tokens(doc):
    clean_tokens = [tok for tok in doc if tok.pos_ not in PUNCT_SYM]
    clean_tokens = [
        tok for tok in clean_tokens
        if not tok.like_email
        and not tok.like_num
        and not tok.like_url
        and not tok.is_space
    ]
    return clean_tokens
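
# For the default text above, this should drop tokens such as the full-width
# punctuation 「。」 (POS tag PUNCT) and any number words that spaCy's Chinese
# like_num flags (e.g. 「八」), keeping only content tokens.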

def get_vocab(doc):
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_tokens_text = [
        tok.text for tok in clean_tokens
        if not alphanum_pattern.search(tok.text)
    ]
    vocab = list(set(clean_tokens_text))
    return vocab

def get_counter(doc):
    clean_tokens = filter_tokens(doc)
    tokens = [token.text for token in clean_tokens]
    counter = Counter(tokens)
    return counter
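
# Counter exposes frequencies directly; a quick check in a REPL:
#     Counter(['我', '我', '綿羊']).most_common(1)  # [('我', 2)]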

def get_freq_fig(doc):
    counter = get_counter(doc)
    counter_df = (
        pd.DataFrame.from_dict(counter, orient='index')
        .reset_index()
        .rename(columns={
            0: 'count',
            'index': 'word',
        })
        .sort_values(by='count', ascending=False)
    )
    fig = px.bar(counter_df, x='word', y='count')
    return fig

def get_level_pie(tocfl_result):
    level = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(tocfl_result,
                 values=level.values,
                 names=level.index,
                 title='詞彙分級圓餅圖')
    return fig

def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    table = pd.read_csv(filename)
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    table = table[cols]
    return table
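
# Assumed layout of tocfl_wordlist.csv (the file is not part of this listing):
# a CSV whose header contains at least the five columns selected above, e.g.
#     詞彙,漢語拼音,注音,任務領域,詞條分級
# Any extra columns are dropped by the selection.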

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
# nlp.add_pipe("yake")  # keyword extraction
# nlp.add_pipe("merge_entities")  # Merge entity spans to tokens

# Select a tokenizer if the Chinese model is chosen
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)
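
# Note: assigning to nlp.tokenizer swaps only the segmentation step; the
# statistical components of zh_core_web_sm (tagger, parser, NER) still run
# downstream, although they were trained on a different segmentation, so
# their accuracy on jieba tokens is not guaranteed.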

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False)  # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        sounds = pinyins
        if pronunciation == "注音符號":
            zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
            sounds = zhuyins
        elif pronunciation == "國際音標":
            ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
            sounds = ipas
        display = []
        for tok_text, sound in zip(tokens_text, sounds):
            display.append(f"{tok_text} [{sound}]")
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)
        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            moedict_caller(w)

if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    # Cap the default so very short texts don't exceed the slider range
    topK = st.slider('請選擇前K個高頻詞', 1, len(counter), min(5, len(counter)))
    most_common = counter.most_common(topK)
    st.write(most_common)
    st.markdown("---")
    fig = get_freq_fig(doc)
    st.plotly_chart(fig, use_container_width=True)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")