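"""Streamlit app for AI-assisted Japanese language learning.

Analyzes Japanese text with the ja_ginza spaCy model (keywords, readings,
inflections, named entities) and looks up word senses and example
sentences through the Jisho API.
"""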
from jisho_api.sentence import Sentence
from jisho_api.word import Word
import pandas as pd
import re
import spacy
import spacy_ke  # noqa: F401  (registers the "yake" keyword-extraction factory)
import streamlit as st
from spacy_streamlit import visualize_ner, visualize_tokens

# Global variables
DEFAULT_TEXT = """それまで、ぼくはずっとひとりぼっちだった。だれともうちとけられないまま、6年まえ、ちょっとおかしくなって、サハラさばくに下りた。ぼくのエンジンのなかで、なにかがこわれていた。ぼくには、みてくれるひとも、おきゃくさんもいなかったから、なおすのはむずかしいけど、ぜんぶひとりでなんとかやってみることにした。それでぼくのいのちがきまってしまう。のみ水は、たった7日ぶんしかなかった。
1日めの夜、ぼくはすなの上でねむった。ひとのすむところは、はるかかなただった。海のどまんなか、いかだでさまよっているひとよりも、もっとひとりぼっち。だから、ぼくがびっくりしたのも、みんなわかってくれるとおもう。じつは、あさ日がのぼるころ、ぼくは、ふしぎなかわいいこえでおこされたんだ。
「ごめんください……ヒツジの絵をかいて!」
「えっ?」
「ぼくにヒツジの絵をかいて……」
『星の王子さま』"""
DESCRIPTION = "AI模型輔助語言學習:日語"
TOK_SEP = " | "
MODEL_NAME = "ja_ginza"

# External API callers
def parse_jisho_senses(word):
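    """Query the Jisho dictionary API for `word` and display up to three
    senses of its first common entry."""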
    res = Word.request(word)
    if res is None:  # jisho_api returns None when the lookup fails outright
        st.error("Can't get response from Jisho!")
        return
    response = res.dict()
    if response["meta"]["status"] == 200:
        data = response["data"]
        commons = [d for d in data if d["is_common"]]
        if commons:
            common = commons[0]  # Only keep the first entry that is common
            senses = common["senses"][:3]  # Show at most three senses
            with st.container():
                for idx, sense in enumerate(senses):
                    eng_def = "; ".join(sense["english_definitions"])
                    pos = "/".join(sense["parts_of_speech"])
                    st.write(f"Sense {idx+1}: {eng_def} ({pos})")
        else:
            st.info("Found no common words on Jisho!")
    else:
        st.error("Can't get response from Jisho!")


def parse_jisho_sentences(word):
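    """Query the Jisho API for example sentences containing `word` and
    display up to three of them with English translations."""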
    res = Sentence.request(word)
    try:
        response = res.dict()
        data = response["data"]
        sents = data[:3]  # Show at most three example sentences
        with st.container():
            for idx, sent in enumerate(sents):
                eng = sent["en_translation"]
                jap = sent["japanese"]
                st.write(f"Sentence {idx+1}: {jap}")
                st.write(f"({eng})")
    except (AttributeError, KeyError):  # res is None or the payload is malformed
        st.info("Found no results on Jisho!")


# Utility functions
def create_jap_df(tokens):
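    """Build a table of deduplicated tokens with their readings,
    inflections, and lemmas, and offer it as a CSV download."""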
    seen_texts = set()
    filtered_tokens = []
    for tok in tokens:
        if tok.text not in seen_texts:
            seen_texts.add(tok.text)  # Record the surface form so duplicates are skipped
            filtered_tokens.append(tok)
    df = pd.DataFrame(
        {
            "單詞": [tok.text for tok in filtered_tokens],
            "發音": ["/".join(tok.morph.get("Reading")) for tok in filtered_tokens],
            "詞形變化": ["/".join(tok.morph.get("Inflection")) for tok in filtered_tokens],
            "原形": [tok.lemma_ for tok in filtered_tokens],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode("utf-8")
    st.download_button(
        label="下載表格",
        data=csv,
        file_name="jap_forms.csv",
    )


def filter_tokens(doc):
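    """Drop punctuation, symbols, whitespace, and email-, URL-, and
    number-like tokens from `doc`."""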
    return [
        tok
        for tok in doc
        if tok.pos_ not in ["PUNCT", "SYM"]
        and not tok.is_punct
        and not tok.is_space
        and not tok.like_email
        and not tok.like_url
        and not tok.like_num
    ]


def create_kw_section(doc):
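    """Render the keyword-analysis section: a slider for the number of
    keywords and the extracted YAKE keywords with their scores."""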
| st.markdown("## 關鍵詞分析") | |
| kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3) | |
| kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)} | |
| kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True) | |
| count = 1 | |
| for keyword, score in kws2scores: | |
| rounded_score = round(score, 3) | |
| st.write(f"{count} >>> {keyword} ({rounded_score})") | |
| count += 1 | |


# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
| st.markdown(f"# {DESCRIPTION}") | |
| # Load the model | |
| nlp = spacy.load(MODEL_NAME) | |
| # Add pipelines to spaCy | |
| nlp.add_pipe("yake") # keyword extraction | |
| # nlp.add_pipe("merge_entities") # Merge entity spans to tokens | |

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")
st.info("請勾選以下至少一項功能")
keywords_extraction = st.checkbox("關鍵詞分析", False)
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
morphology = st.checkbox("詞形變化", False)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if keywords_extraction:
    create_kw_section(doc)

if analyzed_text:
    st.markdown("## 分析後文本")
    for idx, sent in enumerate(doc.sents):
        clean_tokens = [tok for tok in sent if tok.pos_ not in ["PUNCT", "SYM"]]
        tokens_text = [tok.text for tok in clean_tokens]
        readings = ["/".join(tok.morph.get("Reading")) for tok in clean_tokens]
        # Pair each surface form with its reading (renamed to avoid shadowing `text` above)
        display = [f"{tok_text} [{reading}]" for tok_text, reading in zip(tokens_text, readings)]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解釋與例句")
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_lemmas = [tok.lemma_ for tok in clean_tokens if not alphanum_pattern.search(tok.lemma_)]
    vocab = list(set(clean_lemmas))
    if vocab:
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
        for w in selected_words:
            st.write(f"### {w}")
            with st.expander("點擊 + 檢視結果"):
                parse_jisho_senses(w)
                parse_jisho_sentences(w)

if morphology:
    st.markdown("## 詞形變化")
    # Collect inflected forms: verbs (動詞) and adjectives (形容詞/形状詞)
    inflected_forms = [tok for tok in doc if tok.tag_.startswith("動詞") or tok.tag_.startswith("形")]
    if inflected_forms:
        create_jap_df(inflected_forms)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")