Spaces:
Runtime error
Runtime error
⚰️ 🎨 clean up and rm verbose testing code
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- aggregate.py +1 -1
- app.py +2 -23
- utils.py +1 -1
aggregate.py
CHANGED
|
@@ -7,8 +7,8 @@ How it works:
|
|
| 7 |
2. The language model does it.
|
| 8 |
3. Yaay!
|
| 9 |
"""
|
| 10 |
-
import pprint as pp
|
| 11 |
import logging
|
|
|
|
| 12 |
import time
|
| 13 |
|
| 14 |
import torch
|
|
|
|
| 7 |
2. The language model does it.
|
| 8 |
3. Yaay!
|
| 9 |
"""
|
|
|
|
| 10 |
import logging
|
| 11 |
+
import pprint as pp
|
| 12 |
import time
|
| 13 |
|
| 14 |
import torch
|
app.py
CHANGED
|
@@ -19,9 +19,9 @@ import contextlib
|
|
| 19 |
import gc
|
| 20 |
import logging
|
| 21 |
import os
|
|
|
|
| 22 |
import random
|
| 23 |
import re
|
| 24 |
-
import pprint as pp
|
| 25 |
import sys
|
| 26 |
import time
|
| 27 |
from pathlib import Path
|
|
@@ -47,13 +47,12 @@ from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
|
| 47 |
from utils import (
|
| 48 |
contraction_aware_tokenize,
|
| 49 |
extract_batches,
|
| 50 |
-
extract_keywords,
|
| 51 |
load_example_filenames,
|
| 52 |
remove_stagnant_files,
|
|
|
|
| 53 |
saves_summary,
|
| 54 |
textlist2html,
|
| 55 |
truncate_word_count,
|
| 56 |
-
remove_stopwords,
|
| 57 |
)
|
| 58 |
|
| 59 |
_here = Path(__file__).parent
|
|
@@ -268,22 +267,6 @@ def proc_submission(
|
|
| 268 |
model_input_text = truncation_validated["processed_text"]
|
| 269 |
msg = None
|
| 270 |
|
| 271 |
-
if predrop_stopwords:
|
| 272 |
-
# TODO: remove this
|
| 273 |
-
|
| 274 |
-
outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
|
| 275 |
-
outdir.mkdir(parents=True, exist_ok=True)
|
| 276 |
-
keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
|
| 277 |
-
keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
|
| 278 |
-
cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
|
| 279 |
-
cln_outdir = outdir.parent / "source-text"
|
| 280 |
-
cln_outdir.mkdir(parents=True, exist_ok=True)
|
| 281 |
-
with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
|
| 282 |
-
f.write(cln_text)
|
| 283 |
-
sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
|
| 284 |
-
with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
|
| 285 |
-
f.write(model_input_text)
|
| 286 |
-
logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
|
| 287 |
if len(input_text) < 50:
|
| 288 |
# this is essentially a different case from the above
|
| 289 |
msg = f"""
|
|
@@ -326,7 +309,6 @@ def proc_submission(
|
|
| 326 |
|
| 327 |
html += ""
|
| 328 |
|
| 329 |
-
# save to file
|
| 330 |
settings["remove_stopwords"] = predrop_stopwords
|
| 331 |
settings["model_name"] = model_name
|
| 332 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
|
@@ -460,9 +442,6 @@ def parse_args():
|
|
| 460 |
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
| 461 |
help="Set the logging level",
|
| 462 |
)
|
| 463 |
-
# if "--help" in sys.argv or "-h" in sys.argv:
|
| 464 |
-
# parser.print_help()
|
| 465 |
-
# sys.exit(0)
|
| 466 |
|
| 467 |
return parser.parse_args()
|
| 468 |
|
|
|
|
| 19 |
import gc
|
| 20 |
import logging
|
| 21 |
import os
|
| 22 |
+
import pprint as pp
|
| 23 |
import random
|
| 24 |
import re
|
|
|
|
| 25 |
import sys
|
| 26 |
import time
|
| 27 |
from pathlib import Path
|
|
|
|
| 47 |
from utils import (
|
| 48 |
contraction_aware_tokenize,
|
| 49 |
extract_batches,
|
|
|
|
| 50 |
load_example_filenames,
|
| 51 |
remove_stagnant_files,
|
| 52 |
+
remove_stopwords,
|
| 53 |
saves_summary,
|
| 54 |
textlist2html,
|
| 55 |
truncate_word_count,
|
|
|
|
| 56 |
)
|
| 57 |
|
| 58 |
_here = Path(__file__).parent
|
|
|
|
| 267 |
model_input_text = truncation_validated["processed_text"]
|
| 268 |
msg = None
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
if len(input_text) < 50:
|
| 271 |
# this is essentially a different case from the above
|
| 272 |
msg = f"""
|
|
|
|
| 309 |
|
| 310 |
html += ""
|
| 311 |
|
|
|
|
| 312 |
settings["remove_stopwords"] = predrop_stopwords
|
| 313 |
settings["model_name"] = model_name
|
| 314 |
saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
|
|
|
|
| 442 |
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
| 443 |
help="Set the logging level",
|
| 444 |
)
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
return parser.parse_args()
|
| 447 |
|
utils.py
CHANGED
|
@@ -19,7 +19,7 @@ logging.basicConfig(
|
|
| 19 |
|
| 20 |
import torch
|
| 21 |
from natsort import natsorted
|
| 22 |
-
from nltk.tokenize import
|
| 23 |
from rapidfuzz import fuzz
|
| 24 |
|
| 25 |
STOPWORDS = set(
|
|
|
|
| 19 |
|
| 20 |
import torch
|
| 21 |
from natsort import natsorted
|
| 22 |
+
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize
|
| 23 |
from rapidfuzz import fuzz
|
| 24 |
|
| 25 |
STOPWORDS = set(
|