update app
app.py CHANGED
@@ -43,25 +43,26 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with >{stop_cutoff:.
+        st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
 
- [16 removed lines not shown]
+    @st.cache(suppress_st_warning=True)
+    def recalculate_bad_words(file):
+
+        def bad_word_ratio(text: str, bad_word_list):
+            return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())
+
+        bad_word_list = [word.decode().strip() for word in file.readlines()]
+
+        bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
+        data["bad_%"] = bad_word_ratios
+
+    bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
+
+    st.session_state.old_bad_word_file = None
+    if bad_word_file != st.write(st.session_state.old_bad_word_file):
+        recalculate_bad_words(bad_word_file)
+        st.session_state.old_bad_word_file = bad_word_file
 
     if "bad_%" in columns:
         bad_ratio = st.sidebar.slider(
@@ -69,7 +70,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
         )
         bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
         bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
-        st.sidebar.text(f"Kept text with <{bad_cutoff:.
+        st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
         keys.append(("bad_%", bad_cutoff, True))
 
     if "perplexity" in columns:
@@ -133,7 +134,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     )
 
 
-path_data = "./
+path_data = "./en_examples_with_stats_ldnoob.json"
 lang = "English"
 num_docs = 5000
 num_docs_for_words = 500
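For readers following the change: the new recalculate_bad_words helper computes, for each document, the percentage of whitespace-separated tokens that appear in the uploaded word list and stores it in data["bad_%"], the column the existing bad-words filter already thresholds. Below is a minimal standalone sketch of that calculation; flagged_words and the toy DataFrame are illustrative stand-ins, not names from app.py.

import pandas as pd

# Illustrative stand-ins: the real app reads the word list from
# st.sidebar.file_uploader (bytes lines, hence the .decode() in the diff)
# and the documents from path_data.
flagged_words = {"foo", "bar"}
data = pd.DataFrame({"text": ["foo went home", "nothing to see here", "bar bar black sheep"]})

def bad_word_ratio(text: str, bad_word_list) -> float:
    # Share of whitespace-separated tokens found in the list, mirroring the
    # lowercase/strip comparison in the diff; empty texts would need a guard.
    words = text.split()
    return len([w for w in words if w.lower().strip() in bad_word_list]) / len(words)

data["bad_%"] = [bad_word_ratio(text, flagged_words) * 100 for text in data["text"]]
print(data["bad_%"].tolist())  # approximately [33.3, 0.0, 50.0]

The @st.cache(suppress_st_warning=True) decorator and the old_bad_word_file check in st.session_state are presumably there so the ratios are not recomputed on every Streamlit rerun when the uploaded list has not changed.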