⚡️ 🐛 fix issue of wrong input text, disambiguate vars
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
@@ -45,7 +45,9 @@ from aggregate import BatchAggregator
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
+    contraction_aware_tokenize,
     extract_batches,
+    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
     saves_summary,
@@ -241,10 +243,13 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
+    logging.info(
+        f"pre-truncation word count: {len(contraction_aware_tokenize(clean_text))}"
+    )
+    truncation_validated = truncate_word_count(clean_text, max_words=max_input_length)

+    if truncation_validated["was_truncated"]:
+        model_input_text = truncation_validated["processed_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -256,7 +261,7 @@ def proc_submission(
         logging.warning(msg)
         history["WARNING"] = msg
     else:
+        model_input_text = truncation_validated["processed_text"]
         msg = None

     if len(input_text) < 50:
@@ -278,7 +283,7 @@ def proc_submission(
         return msg, "<strong>No summary generated.</strong>", "", []

     _summaries = predict(
-        input_text=
+        input_text=model_input_text,
         model_name=model_name,
         token_batch_length=token_batch_length,
         **settings,
@@ -410,14 +415,14 @@ def parse_args():
         "--add_beam_option",
         type=int,
         default=None,
-        help=f"Add a beam search option to the
+        help=f"Add a beam search option to the demo UI options, default: {pp.pformat(BEAM_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-batch",
         "--token_batch_option",
         type=int,
         default=None,
-        help=f"Add a token batch
+        help=f"Add a token batch size to the demo UI options, default: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-level",
@@ -577,7 +582,7 @@ if __name__ == "__main__":
                 value="<center><i>Aggregate summary will appear here!</i></center>",
             )
             gr.Markdown(
-                "\n\n_Aggregate summary also appended to the bottom of the `.txt` file
+                "\n\n_Aggregate summary is also appended to the bottom of the `.txt` file._"
             )

             gr.Markdown("---")
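For readers skimming the diff: the substance of the fix is that both branches of the truncation check now assign model_input_text, and that variable (rather than the raw input_text) is what predict() receives. Below is a minimal, self-contained sketch of that flow; truncate_word_count and predict here are simplified stand-ins for illustration, not the app's actual implementations.

# Minimal sketch of the corrected proc_submission flow; both helpers are
# simplified stand-ins (the real truncate_word_count lives in utils.py and
# tokenizes with contraction_aware_tokenize; the real predict runs the model).
def truncate_word_count(text: str, max_words: int = 1024) -> dict:
    words = text.split()
    was_truncated = len(words) > max_words
    return {
        "was_truncated": was_truncated,
        "processed_text": " ".join(words[:max_words]) if was_truncated else text,
    }


def predict(input_text: str, **settings) -> list:
    return [f"<summary of {len(input_text.split())} words>"]  # placeholder model call


clean_text = "lorem ipsum " * 800  # 1600 words, longer than the limit below
truncation_validated = truncate_word_count(clean_text, max_words=1024)

if truncation_validated["was_truncated"]:
    model_input_text = truncation_validated["processed_text"]
    # the app also builds an HTML warning about the truncation here
else:
    model_input_text = truncation_validated["processed_text"]

# the disambiguated variable, not the raw input_text, is what reaches the model
_summaries = predict(input_text=model_input_text)
print(truncation_validated["was_truncated"], _summaries[0])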
utils.py
CHANGED
@@ -27,8 +27,8 @@ STOPWORDS = set(
 )


-def custom_tokenize(text: str) -> List[str]:
-    """
+def contraction_aware_tokenize(text: str) -> List[str]:
+    """contraction_aware_tokenize - merges words containing apostrophes as one token."""

     # Tokenize the text using the WhitespaceTokenizer
     tokenizer = WhitespaceTokenizer()
@@ -56,17 +56,21 @@ def custom_tokenize(text: str) -> List[str]:


 def remove_stopwords(
-    text: str, stopwords: List[str] = STOPWORDS,
+    text: str, stopwords: List[str] = STOPWORDS, contraction_tokenize: bool = True
 ) -> str:
     """
     remove_stopwords - Remove stopwords from text.

     :param str text: input text
     :param List[str] stopwords: list of stopwords, defaults to STOPWORDS
-    :param bool
+    :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
-    words =
+    words = (
+        contraction_aware_tokenize(text)
+        if contraction_tokenize
+        else word_tokenize(text)
+    )

     filtered_words = []
     for word in words:
@@ -204,14 +208,14 @@ def truncate_word_count(text: str, max_words=1024) -> dict:
     :param int max_words: the maximum number of words to keep, defaults to 1024
     :return: dict, the processed text
     """
-    words =
+    words = contraction_aware_tokenize(str(text))
     processed = {}
     if len(words) > max_words:
         processed["was_truncated"] = True
-        processed["
+        processed["processed_text"] = " ".join(words[:max_words])
     else:
         processed["was_truncated"] = False
-        processed["
+        processed["processed_text"] = text
     return processed
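A short usage sketch of the renamed tokenizer and the new remove_stopwords flag follows. It assumes the repo's utils.py is importable; the example sentence and the comments on expected behavior are illustrative, and actual output depends on the STOPWORDS set (the word_tokenize fallback also needs NLTK's punkt data).

# Hypothetical usage of the updated helpers; outputs depend on STOPWORDS.
from utils import contraction_aware_tokenize, remove_stopwords

text = "It's the summary we didn't know we needed."

# per the docstring, words containing apostrophes stay single tokens
print(contraction_aware_tokenize(text))

# default path uses the contraction-aware tokenizer
print(remove_stopwords(text))

# opting out falls back to nltk's word_tokenize, which splits contractions
print(remove_stopwords(text, contraction_tokenize=False))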
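Finally, a sketch of the renamed return contract of truncate_word_count, which the app.py changes above consume. The assertions simply restate what the diff shows (keys was_truncated and processed_text); they are illustrative, not taken from the repo's tests.

# Return contract of truncate_word_count after the rename (illustrative).
from utils import truncate_word_count

short_result = truncate_word_count("just a few words", max_words=1024)
assert short_result["was_truncated"] is False
assert short_result["processed_text"] == "just a few words"

long_result = truncate_word_count("lorem " * 2000, max_words=1024)
assert long_result["was_truncated"] is True
assert len(long_result["processed_text"].split()) == 1024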