Paula Leonova
committed on
Commit
·
7055ca6
1
Parent(s):
d855e09
Add a table for keywords for all uploaded text
Browse files
app.py
CHANGED
|
@@ -42,11 +42,12 @@ with st.form(key='my_form'):
|
|
| 42 |
|
| 43 |
text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
|
| 44 |
with text_csv_expander:
|
| 45 |
-
st.
|
|
|
|
| 46 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
| 47 |
accept_multiple_files=True, key = 'text_uploader',
|
| 48 |
type = 'txt')
|
| 49 |
-
st.write("
|
| 50 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with columns: "title" and "text"',
|
| 51 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
| 52 |
type = 'csv')
|
|
@@ -57,12 +58,12 @@ with st.form(key='my_form'):
|
|
| 57 |
|
| 58 |
st.text("\n\n\n")
|
| 59 |
st.markdown("##### Step 2: Enter Labels")
|
| 60 |
-
labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=
|
| 61 |
labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
|
| 62 |
|
| 63 |
labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
|
| 64 |
with labels_csv_expander:
|
| 65 |
-
uploaded_labels_file = st.file_uploader("
|
| 66 |
key='labels_uploader')
|
| 67 |
|
| 68 |
gen_keywords = st.radio(
|
|
@@ -72,16 +73,17 @@ with st.form(key='my_form'):
|
|
| 72 |
|
| 73 |
st.text("\n\n\n")
|
| 74 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
| 75 |
-
glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=
|
| 76 |
glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
|
| 77 |
|
| 78 |
|
| 79 |
glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
|
| 80 |
with glabels_csv_expander:
|
| 81 |
-
st.
|
|
|
|
| 82 |
uploaded_onetext_glabels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
| 83 |
key = 'onetext_glabels_uploader')
|
| 84 |
-
st.write("
|
| 85 |
uploaded_multitext_glabels_file = st.file_uploader('Or Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
|
| 86 |
key = 'multitext_glabels_uploader')
|
| 87 |
|
|
@@ -116,8 +118,10 @@ if submit_button or example_button:
|
|
| 116 |
st.error("Enter some text to generate a summary")
|
| 117 |
else:
|
| 118 |
|
|
|
|
| 119 |
if uploaded_text_files is not None:
|
| 120 |
st.markdown("### Text Inputs")
|
|
|
|
| 121 |
file_names = []
|
| 122 |
raw_texts = []
|
| 123 |
for uploaded_file in uploaded_text_files:
|
|
@@ -125,63 +129,79 @@ if submit_button or example_button:
|
|
| 125 |
raw_texts.append(text)
|
| 126 |
title_file_name = uploaded_file.name.replace('.txt','')
|
| 127 |
file_names.append(title_file_name)
|
| 128 |
-
|
| 129 |
'text': raw_texts})
|
| 130 |
-
st.dataframe(
|
| 131 |
st.download_button(
|
| 132 |
label="Download data as CSV",
|
| 133 |
-
data=
|
| 134 |
-
file_name='
|
| 135 |
mime='title_text/csv',
|
| 136 |
)
|
|
|
|
| 137 |
|
| 138 |
|
| 139 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
| 140 |
# For each body of text, create text chunks of a certain token size required for the transformer
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
keywords_list = md.keyword_gen(kw_model, text_chunk)
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
-
top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
|
| 160 |
-
st.dataframe(top_kw_df.head(10))
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
if len(text_input) == 0 or len(labels) == 0:
|
| 187 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
|
|
|
| 42 |
|
| 43 |
text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
|
| 44 |
with text_csv_expander:
|
| 45 |
+
st.markdown('##### Choose one of the options below:')
|
| 46 |
+
st.write("__Option A:__")
|
| 47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
| 48 |
accept_multiple_files=True, key = 'text_uploader',
|
| 49 |
type = 'txt')
|
| 50 |
+
st.write("__Option B:__")
|
| 51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with columns: "title" and "text"',
|
| 52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
| 53 |
type = 'csv')
|
|
|
|
| 58 |
|
| 59 |
st.text("\n\n\n")
|
| 60 |
st.markdown("##### Step 2: Enter Labels")
|
| 61 |
+
labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=2000)
|
| 62 |
labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
|
| 63 |
|
| 64 |
labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
|
| 65 |
with labels_csv_expander:
|
| 66 |
+
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
| 67 |
key='labels_uploader')
|
| 68 |
|
| 69 |
gen_keywords = st.radio(
|
|
|
|
| 73 |
|
| 74 |
st.text("\n\n\n")
|
| 75 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
| 76 |
+
glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=2000)
|
| 77 |
glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
|
| 78 |
|
| 79 |
|
| 80 |
glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
|
| 81 |
with glabels_csv_expander:
|
| 82 |
+
st.markdown('##### Choose one of the options below:')
|
| 83 |
+
st.write("__Option A:__")
|
| 84 |
uploaded_onetext_glabels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
| 85 |
key = 'onetext_glabels_uploader')
|
| 86 |
+
st.write("__Option B:__")
|
| 87 |
uploaded_multitext_glabels_file = st.file_uploader('Or Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
|
| 88 |
key = 'multitext_glabels_uploader')
|
| 89 |
|
|
|
|
| 118 |
st.error("Enter some text to generate a summary")
|
| 119 |
else:
|
| 120 |
|
| 121 |
+
# OPTION A:
|
| 122 |
if uploaded_text_files is not None:
|
| 123 |
st.markdown("### Text Inputs")
|
| 124 |
+
st.write('Files concatenated into a dataframe:')
|
| 125 |
file_names = []
|
| 126 |
raw_texts = []
|
| 127 |
for uploaded_file in uploaded_text_files:
|
|
|
|
| 129 |
raw_texts.append(text)
|
| 130 |
title_file_name = uploaded_file.name.replace('.txt','')
|
| 131 |
file_names.append(title_file_name)
|
| 132 |
+
text_df = pd.DataFrame({'title': file_names,
|
| 133 |
'text': raw_texts})
|
| 134 |
+
st.dataframe(text_df.head())
|
| 135 |
st.download_button(
|
| 136 |
label="Download data as CSV",
|
| 137 |
+
data=text_df.to_csv().encode('utf-8'),
|
| 138 |
+
file_name='title_text.csv',
|
| 139 |
mime='title_text/csv',
|
| 140 |
)
|
| 141 |
+
# OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
|
| 142 |
|
| 143 |
|
| 144 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
| 145 |
# For each body of text, create text chunks of a certain token size required for the transformer
|
| 146 |
+
|
| 147 |
+
text_chunks_lib = dict()
|
| 148 |
+
for i in range(0, len(text_df)):
|
| 149 |
+
nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
|
| 150 |
+
|
| 151 |
+
# For each chunk of sentences (within the token max)
|
| 152 |
+
text_chunks = []
|
| 153 |
+
for n in range(0, len(nested_sentences)):
|
| 154 |
+
tc = " ".join(map(str, nested_sentences[n]))
|
| 155 |
+
text_chunks.append(tc)
|
| 156 |
+
title_entry = text_df['title'][i]
|
| 157 |
+
text_chunks_lib[title_entry] = text_chunks
|
| 158 |
+
|
| 159 |
+
if gen_keywords == 'Yes':
|
| 160 |
+
st.markdown("### Top Keywords")
|
| 161 |
+
with st.spinner("Generating keywords from text..."):
|
| 162 |
+
|
| 163 |
+
kw_dict = dict()
|
| 164 |
+
for key in text_chunks_lib:
|
| 165 |
+
for text_chunk in text_chunks_lib[key]:
|
| 166 |
keywords_list = md.keyword_gen(kw_model, text_chunk)
|
| 167 |
+
kw_dict[key] = dict(keywords_list)
|
| 168 |
+
|
| 169 |
+
kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
|
| 170 |
+
kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
|
| 171 |
+
kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
|
| 172 |
+
kw_df = kw_df[kw_df['score'] > 0.1][['title', 'keyword', 'score']].reset_index().drop(columns='index').sort_values(['title', 'score'], ascending=False)
|
| 173 |
+
st.dataframe(kw_df)
|
| 174 |
+
st.download_button(
|
| 175 |
+
label="Download data as CSV",
|
| 176 |
+
data=kw_df.to_csv().encode('utf-8'),
|
| 177 |
+
file_name='title_kewyords.csv',
|
| 178 |
+
mime='title_kewyords/csv',
|
| 179 |
+
)
|
| 180 |
|
|
|
|
|
|
|
| 181 |
|
| 182 |
+
st.markdown("### Summary")
|
| 183 |
+
with st.spinner(f'Generating summaries for {len(text_chunks)} text chunks (this may take a minute)...'):
|
| 184 |
+
|
| 185 |
+
my_expander = st.expander(label=f'Expand to see intermediate summary generation details for {len(text_chunks)} text chunks')
|
| 186 |
+
with my_expander:
|
| 187 |
+
summary = []
|
| 188 |
+
|
| 189 |
+
st.markdown("_Once the original text is broken into smaller chunks (totaling no more than 1024 tokens, \
|
| 190 |
+
with complete sentences), each block of text is then summarized separately using BART NLI \
|
| 191 |
+
and then combined at the very end to generate the final summary._")
|
| 192 |
+
|
| 193 |
+
for num_chunk, text_chunk in enumerate(text_chunks):
|
| 194 |
+
st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
|
| 195 |
+
st.markdown(text_chunk)
|
| 196 |
+
|
| 197 |
+
chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
|
| 198 |
+
summary.append(chunk_summary)
|
| 199 |
+
st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
|
| 200 |
+
st.markdown(chunk_summary)
|
| 201 |
+
# Combine all the summaries into a list and compress into one document, again
|
| 202 |
+
final_summary = " \n\n".join(list(summary))
|
| 203 |
+
|
| 204 |
+
st.markdown(final_summary)
|
| 205 |
|
| 206 |
if len(text_input) == 0 or len(labels) == 0:
|
| 207 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|