Paula Leonova committed
Commit 39c7695 · Parent(s): 7055ca6

Add back option for single text entry

app.py CHANGED
@@ -19,8 +19,8 @@ ex_long_text = example_long_text_load()
 
 # if __name__ == '__main__':
 st.markdown("### Long Text Summarization & Multi-Label Classification")
-st.write("This app summarizes and then classifies your long text with multiple labels using [BART Large MNLI](https://huggingface.co/facebook/bart-large-mnli). The keywords are generated using [KeyBERT](https://github.com/MaartenGr/KeyBERT).")
-st.write("__Inputs__: User enters their own custom text and labels.")
+st.write("This app summarizes and then classifies your long text(s) with multiple labels using [BART Large MNLI](https://huggingface.co/facebook/bart-large-mnli). The keywords are generated using [KeyBERT](https://github.com/MaartenGr/KeyBERT).")
+st.write("__Inputs__: User enters their own custom text(s) and labels.")
 st.write("__Outputs__: A summary of the text, likelihood percentages for each label and a downloadable csv of the results. \
 Includes additional options to generate a list of keywords and/or evaluate results against a list of ground truth labels, if available.")
@@ -110,16 +110,19 @@ with st.spinner('Loading pretrained models...'):
     kw_model = md.load_keyword_model()
     k_time = round(time.time() - start,4)
 
-st.
-
+st.spinner(f'Time taken to load various models: {k_time}s for KeyBERT model & {s_time}s for BART summarizer mnli model & {c_time}s for BART classifier mnli model.')
+# st.success(None)
 
 if submit_button or example_button:
     if len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None:
         st.error("Enter some text to generate a summary")
     else:
 
+        if len(text_input) != 0:
+            text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
+
         # OPTION A:
-
+        elif uploaded_text_files is not None:
             st.markdown("### Text Inputs")
             st.write('Files concatenated into a dataframe:')
             file_names = []
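The heart of the commit is in this hunk: a single pasted text is wrapped in a one-row dataframe with the same 'title'/'text' schema the file-upload path produces, so the downstream chunking, summarization, and keyword steps need no special casing. A minimal standalone sketch of that pattern (the example text is made up; 'sample' mirrors the committed placeholder title):

```python
import pandas as pd

# A single pasted text gets the same one-row 'title'/'text' layout that
# uploaded files are concatenated into, so later steps treat both alike.
text_input = "A long article pasted into the single text box..."  # made-up example
text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
print(text_df)
```

One caveat on the timing line added above: `st.spinner` is a context manager meant for `with st.spinner(...):` blocks, so the bare call is unlikely to render the message; `st.write` or `st.success` would display it.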
@@ -141,6 +144,10 @@ if submit_button or example_button:
         # OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
 
 
+        if len(text_input) != 0:
+            text_df = pd.DataFrame.from_dict({'title': ['sample'], 'text': [text_input]})
+
+
         with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
             # For each body of text, create text chunks of a certain token size required for the transformer
 
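This hunk repeats the single-entry fallback ahead of the chunking step, whose spinner text points at the 1024-token ceiling of the BART models; the chunking helper itself lives elsewhere in app.py and is outside this diff. A hedged sketch of one common token-window approach, assuming the Hugging Face tokenizer for the same BART checkpoint (the function is illustrative, not the app's actual helper):

```python
from transformers import AutoTokenizer

def chunk_by_tokens(text: str, tokenizer, max_tokens: int = 1024):
    # Tokenize once, slice the ids into fixed-size windows, and decode each
    # window back to text so every chunk fits under the model's token limit.
    ids = tokenizer.encode(text, add_special_tokens=False)
    return [tokenizer.decode(ids[i:i + max_tokens])
            for i in range(0, len(ids), max_tokens)]

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
print(len(chunk_by_tokens("some very long text " * 2000, tokenizer)), "chunks")
```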
@@ -165,17 +172,22 @@ if submit_button or example_button:
             for text_chunk in text_chunks_lib[key]:
                 keywords_list = md.keyword_gen(kw_model, text_chunk)
                 kw_dict[key] = dict(keywords_list)
-
+            # Display as a dataframe
             kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
             kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
             kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
-
+            if len(text_input) != 0:
+                title_element = []
+            else:
+                title_element = ['title']
+            kw_column_list = ['keyword', 'score']
+            kw_df = kw_df[kw_df['score'] > 0.1][title_element + kw_column_list].sort_values(title_element + ['score'], ascending=False).reset_index().drop(columns='index')
             st.dataframe(kw_df)
             st.download_button(
                 label="Download data as CSV",
                 data=kw_df.to_csv().encode('utf-8'),
-                file_name='
-                mime='
+                file_name='title_keywords.csv',
+                mime='title_keywords/csv',
             )
 
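The reshaping above pivots KeyBERT's per-document keyword scores into a long dataframe, and the commit's new `title_element` switch drops the title column on the single-text path before filtering out low-confidence keywords. A standalone sketch of the same steps with made-up scores:

```python
import pandas as pd

# kw_dict maps each document title to {keyword: score}, as KeyBERT returns.
kw_dict = {
    'doc_a': {'transformers': 0.62, 'tokens': 0.08},
    'doc_b': {'summarization': 0.55, 'labels': 0.31},
}
kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

single_text = False  # True on the restored single-entry path
title_element = [] if single_text else ['title']
kw_column_list = ['keyword', 'score']
kw_df = (kw_df[kw_df['score'] > 0.1][title_element + kw_column_list]
         .sort_values(title_element + ['score'], ascending=False)
         .reset_index(drop=True))
print(kw_df)
```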
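One detail worth flagging in the last hunk: `mime='title_keywords/csv'` is not a registered MIME type. Streamlit's `st.download_button` uses `mime` as the download's content type, and `'text/csv'` is the conventional value for CSV payloads:

```python
import pandas as pd
import streamlit as st

kw_df = pd.DataFrame({'keyword': ['transformers'], 'score': [0.62]})  # made-up row
st.download_button(
    label="Download data as CSV",
    data=kw_df.to_csv().encode('utf-8'),
    file_name='title_keywords.csv',
    mime='text/csv',  # standard CSV MIME type (the commit uses 'title_keywords/csv')
)
```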