Spaces:
Runtime error
Runtime error
docs
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
|
@@ -212,6 +212,7 @@ def proc_submission(
|
|
| 212 |
length_penalty (float): the length penalty to use
|
| 213 |
repetition_penalty (float): the repetition penalty to use
|
| 214 |
no_repeat_ngram_size (int): the no repeat ngram size to use
|
|
|
|
| 215 |
max_input_length (int, optional): the maximum input length to use. Defaults to 6144.
|
| 216 |
|
| 217 |
Note:
|
|
@@ -219,7 +220,7 @@ def proc_submission(
|
|
| 219 |
environment variable APP_MAX_WORDS to a different value.
|
| 220 |
|
| 221 |
Returns:
|
| 222 |
-
|
| 223 |
"""
|
| 224 |
|
| 225 |
remove_stagnant_files() # clean up old files
|
|
@@ -257,7 +258,7 @@ def proc_submission(
|
|
| 257 |
msg = f"""
|
| 258 |
<div style="background-color: #FFA500; color: white; padding: 20px;">
|
| 259 |
<h3>Warning</h3>
|
| 260 |
-
<p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/
|
| 261 |
<p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
|
| 262 |
</div>
|
| 263 |
"""
|
|
@@ -267,6 +268,22 @@ def proc_submission(
|
|
| 267 |
model_input_text = truncation_validated["processed_text"]
|
| 268 |
msg = None
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
if len(input_text) < 50:
|
| 271 |
# this is essentially a different case from the above
|
| 272 |
msg = f"""
|
|
@@ -589,8 +606,8 @@ if __name__ == "__main__":
|
|
| 589 |
)
|
| 590 |
gr.Markdown(
|
| 591 |
f"""Aggregate the above batches into a cohesive summary.
|
| 592 |
-
-
|
| 593 |
-
-
|
| 594 |
"""
|
| 595 |
)
|
| 596 |
with gr.Column(variant="panel"):
|
|
|
|
| 212 |
length_penalty (float): the length penalty to use
|
| 213 |
repetition_penalty (float): the repetition penalty to use
|
| 214 |
no_repeat_ngram_size (int): the no repeat ngram size to use
|
| 215 |
+
predrop_stopwords (bool): whether to pre-drop stopwords before truncating/summarizing
|
| 216 |
max_input_length (int, optional): the maximum input length to use. Defaults to 6144.
|
| 217 |
|
| 218 |
Note:
|
|
|
|
| 220 |
environment variable APP_MAX_WORDS to a different value.
|
| 221 |
|
| 222 |
Returns:
|
| 223 |
+
tuple (4): a tuple containing the following:
|
| 224 |
"""
|
| 225 |
|
| 226 |
remove_stagnant_files() # clean up old files
|
|
|
|
| 258 |
msg = f"""
|
| 259 |
<div style="background-color: #FFA500; color: white; padding: 20px;">
|
| 260 |
<h3>Warning</h3>
|
| 261 |
+
<p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/input_wc:.2f}% of the original text.</p>
|
| 262 |
<p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
|
| 263 |
</div>
|
| 264 |
"""
|
|
|
|
| 268 |
model_input_text = truncation_validated["processed_text"]
|
| 269 |
msg = None
|
| 270 |
|
| 271 |
+
if predrop_stopwords:
|
| 272 |
+
# TODO: remove this
|
| 273 |
+
|
| 274 |
+
outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
|
| 275 |
+
outdir.mkdir(parents=True, exist_ok=True)
|
| 276 |
+
keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
|
| 277 |
+
keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
|
| 278 |
+
cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
|
| 279 |
+
cln_outdir = outdir.parent / "source-text"
|
| 280 |
+
cln_outdir.mkdir(parents=True, exist_ok=True)
|
| 281 |
+
with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
|
| 282 |
+
f.write(cln_text)
|
| 283 |
+
sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
|
| 284 |
+
with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
|
| 285 |
+
f.write(model_input_text)
|
| 286 |
+
logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
|
| 287 |
if len(input_text) < 50:
|
| 288 |
# this is essentially a different case from the above
|
| 289 |
msg = f"""
|
|
|
|
| 606 |
)
|
| 607 |
gr.Markdown(
|
| 608 |
f"""Aggregate the above batches into a cohesive summary.
|
| 609 |
+
- A secondary instruct-tuned LM consolidates info
|
| 610 |
+
- Current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
|
| 611 |
"""
|
| 612 |
)
|
| 613 |
with gr.Column(variant="panel"):
|