Paula Leonova
committed on
Commit
·
b1bf232
1
Parent(s):
009207e
Add option B of loading text csv
Browse files
app.py
CHANGED
|
@@ -46,15 +46,24 @@ with st.form(key='my_form'):
|
|
| 46 |
st.write("__Option A:__")
|
| 47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
| 48 |
accept_multiple_files=True, key = 'text_uploader',
|
| 49 |
-
type
|
| 50 |
st.write("__Option B:__")
|
| 51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
|
| 52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
| 53 |
-
type
|
| 54 |
|
| 55 |
if text_input == display_text and display_text != '':
|
| 56 |
text_input = example_text
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
st.text("\n\n\n")
|
| 60 |
st.markdown("##### Step 2: Enter Labels")
|
|
@@ -66,10 +75,11 @@ with st.form(key='my_form'):
|
|
| 66 |
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
| 67 |
key='labels_uploader')
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
| 73 |
|
| 74 |
st.text("\n\n\n")
|
| 75 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
|
@@ -119,9 +129,9 @@ if submit_button or example_button:
|
|
| 119 |
else:
|
| 120 |
|
| 121 |
if len(text_input) != 0:
|
| 122 |
-
text_df = pd.DataFrame.from_dict({'title': ['
|
| 123 |
|
| 124 |
-
# OPTION A
|
| 125 |
elif uploaded_text_files is not None:
|
| 126 |
st.markdown("### Text Inputs")
|
| 127 |
st.write('Files concatenated into a dataframe:')
|
|
@@ -141,11 +151,9 @@ if submit_button or example_button:
|
|
| 141 |
file_name='title_text.csv',
|
| 142 |
mime='title_text/csv',
|
| 143 |
)
|
| 144 |
-
# OPTION B
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
if len(text_input) != 0:
|
| 148 |
-
text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
|
| 149 |
|
| 150 |
|
| 151 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
|
@@ -196,71 +204,85 @@ if submit_button or example_button:
|
|
| 196 |
|
| 197 |
|
| 198 |
st.markdown("### Summary")
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
| 228 |
)
|
| 229 |
|
| 230 |
if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
|
| 231 |
or (len(labels) == 0 and uploaded_labels_file is None)):
|
| 232 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
| 233 |
else:
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
if uploaded_labels_file is not None:
|
| 237 |
-
labels_df = pd.read_csv(uploaded_labels_file)
|
| 238 |
label_list = labels_df.iloc[:, 0]
|
| 239 |
else:
|
| 240 |
label_list = labels
|
| 241 |
-
st.write(label_list)
|
| 242 |
|
| 243 |
-
with st.spinner('Matching labels...'):
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
|
| 248 |
labels_full_col_list = ['title', 'label', 'scores_from_full_text']
|
| 249 |
labels_full_df = pd.DataFrame(columns=labels_full_col_list)
|
| 250 |
|
| 251 |
for i in range(0, len(text_df)):
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
|
| 258 |
f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
|
| 259 |
lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
|
| 260 |
lf_df['title'] = text_df['title'][i]
|
| 261 |
labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
|
| 262 |
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
st.dataframe(label_match_df)
|
| 265 |
st.download_button(
|
| 266 |
label="Download data as CSV",
|
|
|
|
| 46 |
st.write("__Option A:__")
|
| 47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
| 48 |
accept_multiple_files=True, key = 'text_uploader',
|
| 49 |
+
type='txt')
|
| 50 |
st.write("__Option B:__")
|
| 51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
|
| 52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
| 53 |
+
type='csv')
|
| 54 |
|
| 55 |
if text_input == display_text and display_text != '':
|
| 56 |
text_input = example_text
|
| 57 |
|
| 58 |
+
gen_keywords = st.radio(
|
| 59 |
+
"Generate keywords from text? (independent from the input labels below)",
|
| 60 |
+
('Yes', 'No')
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
gen_summary = st.radio(
|
| 64 |
+
"Generate summary from text? (recommended for label matching below, but will take longer)",
|
| 65 |
+
('Yes', 'No')
|
| 66 |
+
)
|
| 67 |
|
| 68 |
st.text("\n\n\n")
|
| 69 |
st.markdown("##### Step 2: Enter Labels")
|
|
|
|
| 75 |
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
| 76 |
key='labels_uploader')
|
| 77 |
|
| 78 |
+
# summary_option = st.multiselect(
|
| 79 |
+
# "Match labels to text using?",
|
| 80 |
+
# ['Summary', 'Full Text'],
|
| 81 |
+
# ['Summary', 'Full Text']
|
| 82 |
+
# )
|
| 83 |
|
| 84 |
st.text("\n\n\n")
|
| 85 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
|
|
|
| 129 |
else:
|
| 130 |
|
| 131 |
if len(text_input) != 0:
|
| 132 |
+
text_df = pd.DataFrame.from_dict({'title': ['Submitted Text'], 'text': [text_input]})
|
| 133 |
|
| 134 |
+
# OPTION A
|
| 135 |
elif uploaded_text_files is not None:
|
| 136 |
st.markdown("### Text Inputs")
|
| 137 |
st.write('Files concatenated into a dataframe:')
|
|
|
|
| 151 |
file_name='title_text.csv',
|
| 152 |
mime='title_text/csv',
|
| 153 |
)
|
| 154 |
+
# OPTION B
|
| 155 |
+
elif uploaded_csv_text_files is not None:
|
| 156 |
+
text_df = pd.read_csv(uploaded_csv_text_files)
|
|
|
|
|
|
|
| 157 |
|
| 158 |
|
| 159 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
|
|
|
| 204 |
|
| 205 |
|
| 206 |
st.markdown("### Summary")
|
| 207 |
+
if gen_summary == 'Yes':
|
| 208 |
+
with st.spinner(f'Generating summaries for {len(text_df)} texts consisting of a total of {text_chunk_counter} chunks (this may take a minute)...'):
|
| 209 |
+
sum_dict = dict()
|
| 210 |
+
for i, key in enumerate(text_chunks_lib):
|
| 211 |
+
with st.expander(label=f'({i+1}/{len(text_df)}) Expand to see intermediate summary generation details for: {key}', expanded=False):
|
| 212 |
+
# for key in text_chunks_lib:
|
| 213 |
+
summary = []
|
| 214 |
+
for num_chunk, text_chunk in enumerate(text_chunks_lib[key]):
|
| 215 |
+
chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens=300, minimum_tokens=20)
|
| 216 |
+
summary.append(chunk_summary)
|
| 217 |
+
|
| 218 |
+
st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
|
| 219 |
+
st.markdown(text_chunk)
|
| 220 |
+
st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
|
| 221 |
+
st.markdown(chunk_summary)
|
| 222 |
+
|
| 223 |
+
# Combine all the summaries into a list and compress into one document, again
|
| 224 |
+
final_summary = "\n\n".join(list(summary))
|
| 225 |
+
sum_dict[key] = [final_summary]
|
| 226 |
+
|
| 227 |
+
sum_df = pd.DataFrame.from_dict(sum_dict).T.reset_index()
|
| 228 |
+
sum_df.columns = ['title', 'summary_text']
|
| 229 |
+
# TO DO: Make sure summary_text does not exceed the token length
|
| 230 |
+
|
| 231 |
+
st.dataframe(sum_df)
|
| 232 |
+
st.download_button(
|
| 233 |
+
label="Download data as CSV",
|
| 234 |
+
data=sum_df.to_csv().encode('utf-8'),
|
| 235 |
+
file_name='title_summary.csv',
|
| 236 |
+
mime='title_summary/csv',
|
| 237 |
)
|
| 238 |
|
| 239 |
if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
|
| 240 |
or (len(labels) == 0 and uploaded_labels_file is None)):
|
| 241 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
| 242 |
else:
|
| 243 |
+
if gen_summary == 'Yes':
|
| 244 |
+
st.markdown("### Top Label Predictions on Summary vs Full Text")
|
| 245 |
+
else:
|
| 246 |
+
st.markdown("### Top Label Predictions on Full Text")
|
| 247 |
|
| 248 |
if uploaded_labels_file is not None:
|
| 249 |
+
labels_df = pd.read_csv(uploaded_labels_file, header=None)
|
| 250 |
label_list = labels_df.iloc[:, 0]
|
| 251 |
else:
|
| 252 |
label_list = labels
|
|
|
|
| 253 |
|
| 254 |
+
with st.spinner('Matching labels...(may take some time)'):
|
| 255 |
+
if gen_summary == 'Yes':
|
| 256 |
+
labels_sum_col_list = ['title', 'label', 'scores_from_summary']
|
| 257 |
+
labels_sum_df = pd.DataFrame(columns=labels_sum_col_list)
|
| 258 |
|
| 259 |
labels_full_col_list = ['title', 'label', 'scores_from_full_text']
|
| 260 |
labels_full_df = pd.DataFrame(columns=labels_full_col_list)
|
| 261 |
|
| 262 |
for i in range(0, len(text_df)):
|
| 263 |
+
if gen_summary == 'Yes':
|
| 264 |
+
s_topics, s_scores = md.classifier_zero(classifier, sequence=sum_df['summary_text'][i], labels=label_list, multi_class=True)
|
| 265 |
+
ls_df = pd.DataFrame({'label': s_topics, 'scores_from_summary': s_scores})
|
| 266 |
+
ls_df['title'] = text_df['title'][i]
|
| 267 |
+
labels_sum_df = pd.concat([labels_sum_df, ls_df[labels_sum_col_list]])
|
| 268 |
|
| 269 |
f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
|
| 270 |
lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
|
| 271 |
lf_df['title'] = text_df['title'][i]
|
| 272 |
labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])
|
| 273 |
|
| 274 |
+
with st.expander(f'({i+1}/{len(text_df)}) See intermediate label matching results'):
|
| 275 |
+
st.write(f"Results for {text_df['title'][i]}")
|
| 276 |
+
if gen_summary == 'Yes':
|
| 277 |
+
st.dataframe(pd.merge(labels_sum_df, labels_full_df, on=['title','label']))
|
| 278 |
+
else:
|
| 279 |
+
st.dataframe(labels_full_df)
|
| 280 |
+
|
| 281 |
+
if gen_summary == 'Yes':
|
| 282 |
+
label_match_df = pd.merge(labels_sum_df, labels_full_df, on=['title','label'])
|
| 283 |
+
else:
|
| 284 |
+
label_match_df = labels_full_df.copy()
|
| 285 |
+
|
| 286 |
st.dataframe(label_match_df)
|
| 287 |
st.download_button(
|
| 288 |
label="Download data as CSV",
|