update app
app.py CHANGED
@@ -43,25 +43,26 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with >{stop_cutoff:.
+        st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
 
- [16 removed lines not shown]
+    @st.cache(suppress_st_warning=True)
+    def recalculate_bad_words(file):
+
+        def bad_word_ratio(text: str, bad_word_list):
+            return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())
+
+        bad_word_list = [word.decode().strip() for word in file.readlines()]
+
+        bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
+        data["bad_%"] = bad_word_ratios
+
+    bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
+
+    st.session_state.old_bad_word_file = None
+    if bad_word_file != st.write(st.session_state.old_bad_word_file):
+        recalculate_bad_words(bad_word_file)
+        st.session_state.old_bad_word_file = bad_word_file
 
     if "bad_%" in columns:
         bad_ratio = st.sidebar.slider(
@@ -69,7 +70,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
         )
         bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
         bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
-        st.sidebar.text(f"Kept text with <{bad_cutoff:.
+        st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
         keys.append(("bad_%", bad_cutoff, True))
 
     if "perplexity" in columns:
@@ -133,7 +134,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     )
 
 
-path_data = "./
+path_data = "./en_examples_with_stats_ldnoob.json"
 lang = "English"
 num_docs = 5000
 num_docs_for_words = 500
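For readers following the change: the new recalculate_bad_words helper computes, for each document, the percentage of whitespace-separated tokens that appear in the uploaded word list and stores it in data["bad_%"], the column the existing bad-words filter already thresholds. Below is a minimal standalone sketch of that calculation; flagged_words and the toy DataFrame are illustrative stand-ins, not names from app.py.

import pandas as pd

# Illustrative stand-ins: the real app reads the word list from
# st.sidebar.file_uploader (bytes lines, hence the .decode() in the diff)
# and the documents from path_data.
flagged_words = {"foo", "bar"}
data = pd.DataFrame({"text": ["foo went home", "nothing to see here", "bar bar black sheep"]})

def bad_word_ratio(text: str, bad_word_list) -> float:
    # Share of whitespace-separated tokens found in the list, mirroring the
    # lowercase/strip comparison in the diff; empty texts would need a guard.
    words = text.split()
    return len([w for w in words if w.lower().strip() in bad_word_list]) / len(words)

data["bad_%"] = [bad_word_ratio(text, flagged_words) * 100 for text in data["text"]]
print(data["bad_%"].tolist())  # approximately [33.3, 0.0, 50.0]

The @st.cache(suppress_st_warning=True) decorator and the old_bad_word_file check in st.session_state are presumably there so the ratios are not recomputed on every Streamlit rerun when the uploaded list has not changed.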