Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| from widgets.widget_base import Widget | |
| from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls | |
| import utils | |
| logs = utils.prepare_logging(__file__) | |
class Npmi(Widget):
    """Gradio widget for the "Word Association: nPMI" tab.

    The widget lets the user pick two identity terms and shows, as a styled
    HTML table, the nPMI scores of the words returned by the dataset's nPMI
    object for that pair. All components are created unrendered in
    ``__init__`` and placed inside the tab by ``render``.
    """

    # Name of the paired-bias score column. It is the default sort key and
    # the column used to trim oversized tables.
    _BIAS_COLUMN = "bias"
    # Tables with more rows than this are trimmed before being styled.
    _MAX_ROWS = 10000

    def __init__(self):
        """Create all components of the tab without rendering them."""
        self.npmi_first_word = gr.Dropdown(
            render=False, label="What is the first word you want to select?"
        )
        self.npmi_second_word = gr.Dropdown(
            render=False, label="What is the second word you want to select?"
        )
        self.npmi_error_text = gr.Markdown(render=False)
        self.npmi_df = gr.HTML(render=False)
        self.sort = gr.Dropdown(label="Sort By Column", render=False)
        self.npmi_empty_text = gr.Markdown(render=False)
        self.npmi_description = gr.Markdown(render=False)

    def output_components(self):
        """Return the list of every component this widget may update."""
        return [
            self.npmi_first_word,
            self.npmi_second_word,
            self.sort,
            self.npmi_error_text,
            self.npmi_df,
            self.npmi_description,
            self.npmi_empty_text,
        ]

    def render(self):
        """Place the components inside the "Word Association: nPMI" tab."""
        with gr.TabItem("Word Association: nPMI"):
            self.npmi_description.render()
            self.npmi_first_word.render()
            self.npmi_second_word.render()
            self.sort.render()
            self.npmi_df.render()
            self.npmi_empty_text.render()
            self.npmi_error_text.render()

    def _resolve_output_components(self):
        """Return the output components as a list.

        ``output_components`` is a plain method in this class, so iterating
        the attribute directly fails. Calling it only when it is callable
        also keeps this working if it is ever exposed as a property.
        """
        components = self.output_components
        return components() if callable(components) else components

    def update(self, dstats: dmt_cls):
        """Build the component updates for a newly loaded dataset.

        Args:
            dstats: Dataset statistics object; this method reads its
                ``min_vocab_count`` and ``npmi_obj`` attributes.

        Returns:
            A dict mapping each output component to its update. Every
            component starts hidden; the ones relevant to the outcome are
            then made visible.
        """
        min_vocab = dstats.min_vocab_count
        npmi_stats = dstats.npmi_obj
        # Only dereference the nPMI object once we know it exists.
        available_terms = (
            npmi_stats.avail_identity_terms if npmi_stats is not None else []
        )
        output = {
            comp: gr.update(visible=False)
            for comp in self._resolve_output_components()
        }
        if npmi_stats and len(available_terms) > 0:
            first_term = available_terms[0]
            last_term = available_terms[-1]
            output[self.npmi_description] = gr.Markdown.update(
                value=self.expander_npmi_description(min_vocab), visible=True
            )
            output[self.npmi_first_word] = gr.Dropdown.update(
                choices=available_terms, value=first_term, visible=True
            )
            output[self.npmi_second_word] = gr.Dropdown.update(
                choices=available_terms[::-1], value=last_term, visible=True
            )
            # visible=True is required: a previous update may have hidden
            # the dropdown, and an update without it leaves it hidden.
            output[self.sort] = gr.Dropdown.update(
                choices=[self._BIAS_COLUMN, first_term, last_term],
                value=self._BIAS_COLUMN,
                visible=True,
            )
            output.update(
                self.npmi_show(first_term, last_term, self._BIAS_COLUMN, dstats)
            )
        else:
            output[self.npmi_error_text] = gr.Markdown.update(
                visible=True,
                value="No words found co-occurring with both of the selected identity terms.",
            )
        return output

    def npmi_show(self, term1, term2, sort_col, dstats):
        """Build the updates for the results table of one term pair.

        Args:
            term1: First selected identity term.
            term2: Second selected identity term.
            sort_col: Column of the results to sort by (descending).
            dstats: Dataset statistics object whose ``npmi_obj`` provides
                ``get_display(term1, term2)``.

        Returns:
            A dict with updates for ``npmi_empty_text`` and ``npmi_df``:
            either the "no results" message with the table hidden, or the
            message hidden with the styled HTML table shown.
        """
        npmi_stats = dstats.npmi_obj
        paired_results = npmi_stats.get_display(term1, term2)
        if paired_results.empty:
            return {
                self.npmi_empty_text: gr.Markdown.update(
                    # \U0001F41B is the bug emoji, \U0001F937 the shrug emoji;
                    # escapes keep them safe from encoding damage.
                    value=(
                        "No words that co-occur enough times for results! "
                        "Or there's a \U0001F41B. "
                        "Or we're still computing this one. \U0001F937"
                    ),
                    visible=True,
                ),
                # npmi_df is a gr.HTML component, so use its own updater.
                self.npmi_df: gr.HTML.update(visible=False),
            }

        logs.debug("Results to be shown are")
        logs.debug(paired_results)
        table = pd.DataFrame(paired_results.sort_values(sort_col, ascending=False))
        table.index.name = "word"
        table = table.reset_index().round(4)
        score_cols = [col for col in table.columns if col != "word"]
        # Keep the dataframe from being crazy big.
        table = self._keep_extreme_bias_rows(table)
        styled = (
            table.style.background_gradient(subset=score_cols)
            .format(formatter="{:,.3f}", subset=score_cols)
            .set_properties(**{"text-align": "center", "width": "100em"})
            .set_caption(
                "nPMI scores between the selected identity terms and the words they both co-occur with"
            )
        )
        return {
            self.npmi_empty_text: gr.Markdown.update(visible=False),
            # Re-show the table explicitly: an earlier empty result hides it.
            self.npmi_df: gr.HTML.update(value=styled.to_html(), visible=True),
        }

    def _keep_extreme_bias_rows(self, table):
        """Trim an oversized table to its rows with the largest |bias|.

        Tables with at most ``_MAX_ROWS`` rows are returned unchanged.
        Otherwise the threshold is the larger absolute bias found
        ``_MAX_ROWS // 2`` positions in from either end of the bias values
        ranked in descending order, and only rows strictly above it are
        kept. Ranking the bias values here makes the threshold independent
        of the column the table is currently sorted by.
        """
        if table.shape[0] <= self._MAX_ROWS:
            return table
        half = self._MAX_ROWS // 2
        bias = table[self._BIAS_COLUMN]
        ranked = bias.sort_values(ascending=False)
        # Positional access: label lookup with a negative key would fail on
        # the integer index produced by reset_index().
        bias_thres = max(abs(ranked.iloc[half]), abs(ranked.iloc[-half]))
        logs.info(f"filtering with bias threshold: {bias_thres}")
        return table[bias.abs() > bias_thres]

    @staticmethod
    def expander_npmi_description(min_vocab):
        """Return the Markdown description shown at the top of the tab.

        Args:
            min_vocab: Minimum number of occurrences quoted in the text.
        """
        # The blank line before the trailing "-----" keeps it a horizontal
        # rule; directly under a text line Markdown reads it as a heading
        # underline.
        return f"""
        Use this widget to identify problematic biases and stereotypes in
        your data.
        nPMI scores for a word help to identify potentially
        problematic associations, ranked by how close the association is.
        nPMI bias scores for paired words help to identify how word
        associations are skewed between the selected words
        ([Aka et al., 2021](https://arxiv.org/abs/2103.03417)).
        You can select from gender and sexual orientation
        identity terms that appear in the dataset at least {min_vocab} times.
        The resulting ranked words are those that co-occur with both identity terms.
        The more *positive* the score, the more associated the word is with
        the first identity term.
        The more *negative* the score, the more associated the word is with
        the second identity term.

        -----
        """

    def update_sort_and_npmi(self, first_word, second_word, sort_col, dstats):
        """Refresh the sort dropdown and the table after a term changes.

        Args:
            first_word: Currently selected first identity term.
            second_word: Currently selected second identity term.
            sort_col: Previous sort column. Accepted to match the event
                inputs but not used: it may name a term that is no longer
                a column, and the dropdown is reset to the bias column.
            dstats: Dataset statistics object passed on to ``npmi_show``.

        Returns:
            A dict with updates for ``sort``, ``npmi_empty_text`` and
            ``npmi_df``.
        """
        output = {
            self.sort: gr.Dropdown.update(
                choices=[self._BIAS_COLUMN, first_word, second_word],
                value=self._BIAS_COLUMN,
            )
        }
        # Sort by the value the dropdown is being reset to, so the table
        # and the dropdown agree.
        output.update(
            self.npmi_show(first_word, second_word, self._BIAS_COLUMN, dstats)
        )
        return output

    def add_events(self, state: gr.State):
        """Wire the dropdown change events to their handlers.

        Args:
            state: Gradio state component passed as the last handler input
                (received by the handlers as ``dstats``).
        """
        handler_inputs = [
            self.npmi_first_word,
            self.npmi_second_word,
            self.sort,
            state,
        ]
        # Changing either term refreshes the table and resets the sort choices.
        for term_dropdown in (self.npmi_first_word, self.npmi_second_word):
            term_dropdown.change(
                self.update_sort_and_npmi,
                inputs=handler_inputs,
                outputs=[self.npmi_df, self.npmi_empty_text, self.sort],
            )
        # Changing the sort column only re-renders the table.
        self.sort.change(
            self.npmi_show,
            inputs=handler_inputs,
            outputs=[self.npmi_df, self.npmi_empty_text],
        )