Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForMaskedLM | |
| device = "cuda:0" if torch.cuda.is_available() else "cpu" | |
| model_checkpoint = "facebook/xlm-v-base" | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| model = AutoModelForMaskedLM.from_pretrained(model_checkpoint) | |
| model = model.to(device) | |
| mask_token = tokenizer.mask_token | |
| def add_mask(target_word, text): | |
| text_masked = text.replace(target_word, mask_token) | |
| return text_masked | |
| def eval_prob(target_word, text): | |
| # Replace target_word with mask | |
| text_masked = add_mask(target_word, text) | |
| # Get token ID of target_word | |
| target_idx = tokenizer.encode(target_word)[-2] | |
| # Convert masked text to token IDs | |
| inputs = tokenizer(text_masked, return_tensors="pt").to(device) | |
| # Calculate logits score (for each token, for each position) | |
| token_logits = model(**inputs).logits | |
| # Find the position of the mask and extract logits for that position | |
| mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] | |
| mask_token_logits = token_logits[0, mask_token_index, :] | |
| # Convert logits to softmax probability | |
| logits = mask_token_logits[0].tolist() | |
| probs = torch.nn.functional.softmax(torch.tensor([logits]), dim=1)[0] | |
| return probs, target_idx | |
| def process_prob(target_word, text): | |
| probs, target_idx = eval_prob(target_word, text) | |
| # Sort tokens based on probability scores | |
| words = [tokenizer.decode(idx) for idx in torch.sort(probs, descending=True).indices] | |
| scores = torch.sort(probs, descending=True).values | |
| # Consolidate results in dataframe | |
| d = {'word': words, 'score': scores} | |
| df = pd.DataFrame(data=d) | |
| # Get score rank and probability of target word | |
| result_rank = words.index(target_word) | |
| result_prob = scores[result_rank] | |
| # Create color code | |
| target_col = [0] * len(scores) | |
| target_col[result_rank] = 1 | |
| df["target"] = target_col | |
| return result_rank, result_prob, df | |
| def plot_results(target_word, text): | |
| _, _, df = process_prob(target_word, text) | |
| # Plot | |
| fig = px.bar( | |
| df[:150], | |
| x='word', | |
| y='score', | |
| color='target', | |
| color_continuous_scale=px.colors.sequential.Bluered, | |
| ) | |
| # fig.update(layout_coloraxis_showscale=False) | |
| fig.show() | |
| return fig | |
| gr.Interface( | |
| fn=plot_results, | |
| inputs=[ | |
| gr.Textbox(label="词语", placeholder="Key in a 词语 or click an example"), | |
| gr.Textbox(label="造句", placeholder="造句 with the 词语 or click an example"), | |
| ], | |
| examples=[ | |
| ["与众不同", "他的产品很特别,与众不同,跟别人的不一样。"], | |
| ["尴尬", "小明去朋友的生日庆祝会,忘了带礼物,感到很尴尬。"], | |
| ["标准", "小明朗读课文时发音标准,被老师评为优秀。"], | |
| ], | |
| outputs=["plot"], | |
| title="Chinese Sentence Grading", | |
| ).launch() |