import torch
import pandas as pd
import gradio as gr
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoModel, BertTokenizerFast
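# torch and transformers are assumed to come from requirements.txt rather than a
# runtime pip.main() install. An assumed sketch of that file for this Space:
#
#   torch
#   transformers
#   konlpy
#   scikit-learn
#   gradio
#   pandas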
def make_candidate(prompt):
    """Extract 2-3 word noun n-grams from the prompt as keyword candidates."""
    okt = Okt()
    tokenized_doc = okt.pos(prompt)
    # Keep only the nouns, joined into one space-separated string.
    tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])
    n_gram_range = (2, 3)
    count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
    candidates = count.get_feature_names_out()
    return candidates
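# Illustrative note (assumed example, not from the original code): for a prompt
# such as "피아니스트 임윤찬이 콩쿠르 결선에서 연주했다", the noun string becomes
# roughly "피아니스트 임윤찬 콩쿠르 결선", and CountVectorizer(ngram_range=(2, 3))
# then yields candidates like "피아니스트 임윤찬", "임윤찬 콩쿠르",
# "피아니스트 임윤찬 콩쿠르".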
# saved model
def load_model():
    """Load the tokenizer and the fine-tuned BERT checkpoint from ./bertmodel/."""
    pretrained_model_name = "kykim/bert-kor-base"
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
    model = AutoModel.from_pretrained("./bertmodel/")
    return model, tokenizer
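# "./bertmodel/" is expected to be a fine-tuned BERT checkpoint stored in this
# repository. A fallback sketch (assumption, not part of the original): if no
# local checkpoint is available, the base Korean BERT could be loaded instead:
#
#   model = AutoModel.from_pretrained("kykim/bert-kor-base")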
# main entry point for the Gradio app
def inference(prompt):
    candidates = make_candidate(prompt)
    model, tokenizer = load_model()

    # Embed the whole prompt; the pooler output serves as the document embedding.
    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    with torch.no_grad():
        doc_embedding = model(input_ids)["pooler_output"]

    # Score each candidate phrase by cosine similarity to the document embedding.
    top_n = 5
    words = []
    distances = []
    for word in candidates:
        input_ids = torch.tensor(tokenizer.encode(word)).unsqueeze(0)
        with torch.no_grad():
            word_embedding = model(input_ids)["pooler_output"]
        distance = torch.cosine_similarity(doc_embedding, word_embedding, dim=1).item()
        words.append(word)
        distances.append(distance)

    cos_df = pd.DataFrame({'word': words, 'distance': distances})
    # Sort by similarity (higher means more similar) and keep the top n phrases.
    cos_df = cos_df.sort_values(by='distance', ascending=False)
    cos_df = cos_df.head(top_n)

    # Extract only the nouns from each selected phrase and format them as hashtags.
    okt = Okt()
    outputs = []
    for word in cos_df["word"].values:
        tokenized_doc = okt.pos(word)
        tokenized_nouns = ' '.join([token[0] for token in tokenized_doc if token[1] == 'Noun'])
        outputs.append("#" + tokenized_nouns)
    return " ".join(outputs)
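# Note: load_model() runs on every request; loading the model and tokenizer once
# at module scope would avoid the repeated disk reads. A quick local sanity check
# (illustrative sketch; the actual tags depend on the checkpoint):
#
#   print(inference("지난해 피아니스트 임윤찬이 콩쿠르에서 우승했다"))
#   # -> e.g. "#피아니스트 임윤찬 #임윤찬 콩쿠르"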
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # the hashtag string returned by inference()
    examples=[
        # English gloss of the Korean example: "Last year, pianist Lim Yunchan,
        # who rose to become the biggest star of the domestic classical scene,
        # passed 10 million YouTube views with the video of his performance of
        # Rachmaninoff's Piano Concerto No. 3 in the final of the Van Cliburn
        # International Competition in the US. It is by far the most-viewed
        # video of a Rachmaninoff No. 3 performance."
        "지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제 콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다. 라흐마니노프 3번 연주 영상 중 단연 최고 조회수다."
    ],
)

# Passing share=True to launch() creates a link accessible from outside.
demo.launch()