document-summarization

Runtime error

App Files Community

pszemraj committed on May 28, 2023

Commit

7e0dde7

1 Parent(s): 62a2921

📝 💄 improve docs and UI

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (3)

aggregate.py +6 -3
app.py +61 -39
utils.py +23 -21

aggregate.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-aggregate.py is a module for aggregating text from multiple sources, or multiple parts of a single source.
     Primary usage is through the BatchAggregator class.
 How it works:
@@ -29,7 +29,8 @@ class BatchAggregator:
     Usage:
     >>> from aggregate import BatchAggregator
     >>> aggregator = BatchAggregator()
-    >>> aggregator.aggregate(["This is a test", "This is another test"])
     """
     GENERIC_CONFIG = GenerationConfig(
@@ -187,7 +188,7 @@ class BatchAggregator:
         **kwargs,
     ) -> str:
         f"""
-        Generate a summary of the specified texts.
         Args:
             text_list (list): The texts to summarize.
@@ -211,6 +212,8 @@ class BatchAggregator:
         self.logger.info(
             f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
         )
         return result
     def count_tokens(self, text: str) -> int:

 """
+aggregate.py - module for aggregating text from multiple sources/multiple parts of a single source.
     Primary usage is through the BatchAggregator class.
 How it works:
     Usage:
     >>> from aggregate import BatchAggregator
     >>> aggregator = BatchAggregator()
+    >>> agg = aggregator.infer_aggregate(["This is a test", "This is another test"])
+    >>> print(agg)
     """
     GENERIC_CONFIG = GenerationConfig(
         **kwargs,
     ) -> str:
         f"""
+        infer_aggregate - infers a consolidated summary from a list of texts.
         Args:
             text_list (list): The texts to summarize.
         self.logger.info(
             f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
         )
+        self.logger.debug(f"Generated text:\n{result}")
         return result
     def count_tokens(self, text: str) -> int:

app.py CHANGED Viewed

@@ -2,7 +2,7 @@
 app.py - the main module for the gradio app for summarization
 Usage:
-    python app.py
 Environment Variables:
     USE_TORCH (str): whether to use torch (1) or not (0)
@@ -20,7 +20,6 @@ import random
 import re
 import time
 from pathlib import Path
-import pprint as pp
 os.environ["USE_TORCH"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -33,18 +32,19 @@ logging.basicConfig(
 import gradio as gr
 import nltk
 import torch
-from aggregate import BatchAggregator
 from cleantext import clean
 from doctr.models import ocr_predictor
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
     extract_batches,
     load_example_filenames,
     saves_summary,
     textlist2html,
     truncate_word_count,
-    remove_stagnant_files,
 )
 _here = Path(__file__).parent
@@ -62,12 +62,13 @@ MODEL_OPTIONS = [
 ]  # models users can choose from
 SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
-# if duplicating space,, uncomment this line to adjust the max words
 # os.environ["APP_MAX_WORDS"] = str(2048)  # set the max words to 2048
 # os.environ["APP_OCR_MAX_PAGES"] = str(40)  # set the max pages to 40
-aggregator = BatchAggregator("MBZUAI/LaMini-Flan-T5-783M")
 def aggregate_text(
@@ -77,8 +78,8 @@ def aggregate_text(
     """
     Aggregate the text from the batches.
-        NOTE: you should probably include passing the BatchAggregator object as a parameter if using this code
-        outside of this file.
     :param batches_html: The batches to aggregate, in html format
     :param text_file: The text file to append the aggregate summary to
     :return: The aggregate summary in html format
@@ -104,13 +105,13 @@ def aggregate_text(
     content_batches = [batch["content"] for batch in extracted_batches]
     full_summary = aggregator.infer_aggregate(content_batches)
-    # if a path that exists is provided, save the summary with markdown formatting
     if out_path:
         out_path = Path(out_path)
         try:
             with open(out_path, "a", encoding="utf-8") as f:
-                f.write("\n\n### Aggregate Summary\n\n")
                 f.write(
                     "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
                 )
@@ -341,9 +342,9 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
                 raw_text = f.read()
             text = clean(raw_text, lower=lower)
         elif file_path.suffix == ".pdf":
-            logger.info(f"loading as PDF file {file_path}")
             max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
-            logger.info(f"max_pages set to: {max_pages}")
             conversion_stats = convert_PDF_to_Text(
                 file_path,
                 ocr_model=ocr_model,
@@ -357,13 +358,15 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
         return text
     except Exception as e:
         logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
-        return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
 def parse_args():
     parser = argparse.ArgumentParser(
-        description="Document Summarization with Long-Document Transformers",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     parser.add_argument(
         "--share",
@@ -415,7 +418,7 @@ if __name__ == "__main__":
     with demo:
         gr.Markdown("# Document Summarization with Long-Document Transformers")
         gr.Markdown(
-            "An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://huggingface.co/datasets/kmfoda/booksum). Architectures in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
         )
         with gr.Column():
             gr.Markdown("## Load Inputs & Select Parameters")
@@ -440,7 +443,7 @@ if __name__ == "__main__":
                     load_examples_button = gr.Button(
                         "Load Example in Dropdown",
                     )
-                    load_file_button = gr.Button("Load an Uploaded File")
                 with gr.Column(variant="compact"):
                     example_name = gr.Dropdown(
                         _examples,
@@ -457,22 +460,23 @@ if __name__ == "__main__":
                 input_text = gr.Textbox(
                     lines=4,
                     max_lines=12,
-                    label="Input Text (for summarization)",
                     placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
                 )
         gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Generate Summary")
-            gr.Markdown(
-                "_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
-            )
-            summarize_button = gr.Button(
-                "Summarize!",
-                variant="primary",
-            )  # TODO: collapse button to be on same line as something else
             output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
             with gr.Column():
-                gr.Markdown("#### Results & Scores")
                 with gr.Row():
                     with gr.Column(variant="compact"):
                         gr.Markdown(
@@ -486,24 +490,42 @@ if __name__ == "__main__":
                         )
                     with gr.Column(variant="compact"):
                         gr.Markdown(
-                            "Scores represent the summary quality **roughly** as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
                         )
                         summary_scores = gr.Textbox(
                             label="Summary Scores",
                             placeholder="Summary scores will appear here",
                         )
-            with gr.Column():
-                gr.Markdown("#### **Summary Output**")
                 summary_text = gr.HTML(
-                    label="Summary", value="<i>Summary will appear here!</i>"
                 )
             with gr.Column():
-                gr.Markdown("##### **Aggregate Summary Batches**")
-                aggregate_button = gr.Button(
-                    "Aggregate!",
-                    variant="primary",
-                )  # TODO: collapse button to be on same line as something else
-                aggregated_summary = gr.HTML(label="Aggregate Summary", value="")
         gr.Markdown("---")
         with gr.Column():
@@ -539,15 +561,15 @@ if __name__ == "__main__":
                     value=3,
                 )
         with gr.Column():
-            gr.Markdown("### About")
             gr.Markdown(
-                "- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
             )
             gr.Markdown(
-                "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
             )
             gr.Markdown(
-                "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://huggingface.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
             )
             gr.Markdown("---")

 app.py - the main module for the gradio app for summarization
 Usage:
+    python app.py --help
 Environment Variables:
     USE_TORCH (str): whether to use torch (1) or not (0)
 import re
 import time
 from pathlib import Path
 os.environ["USE_TORCH"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import gradio as gr
 import nltk
 import torch
 from cleantext import clean
 from doctr.models import ocr_predictor
+from aggregate import BatchAggregator
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
     extract_batches,
     load_example_filenames,
+    remove_stagnant_files,
     saves_summary,
     textlist2html,
     truncate_word_count,
 )
 _here = Path(__file__).parent
 ]  # models users can choose from
 SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
+AGGREGATE_MODEL = "MBZUAI/LaMini-Flan-T5-783M"  # model to use for aggregation
+# if duplicating space: uncomment this line to adjust the max words
 # os.environ["APP_MAX_WORDS"] = str(2048)  # set the max words to 2048
 # os.environ["APP_OCR_MAX_PAGES"] = str(40)  # set the max pages to 40
+aggregator = BatchAggregator(AGGREGATE_MODEL)
 def aggregate_text(
     """
     Aggregate the text from the batches.
+        NOTE: you should probably include the BatchAggregator object as a fn arg if using this code
     :param batches_html: The batches to aggregate, in html format
     :param text_file: The text file to append the aggregate summary to
     :return: The aggregate summary in html format
     content_batches = [batch["content"] for batch in extracted_batches]
     full_summary = aggregator.infer_aggregate(content_batches)
+    # if a path that exists is provided, append the summary with markdown formatting
     if out_path:
         out_path = Path(out_path)
         try:
             with open(out_path, "a", encoding="utf-8") as f:
+                f.write("\n\n## Aggregate Summary\n\n")
                 f.write(
                     "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
                 )
                 raw_text = f.read()
             text = clean(raw_text, lower=lower)
         elif file_path.suffix == ".pdf":
+            logger.info(f"loading a PDF file: {file_path.name}")
             max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
+            logger.info(f"max_pages is: {max_pages}. Starting conversion...")
             conversion_stats = convert_PDF_to_Text(
                 file_path,
                 ocr_model=ocr_model,
         return text
     except Exception as e:
         logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
+        return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
 def parse_args():
+    """arguments for the command line interface"""
     parser = argparse.ArgumentParser(
+        description="Document Summarization with Long-Document Transformers Demo",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        epilog="Runs a local-only web app to summarize documents. use --share for a public link to share.",
     )
     parser.add_argument(
         "--share",
     with demo:
         gr.Markdown("# Document Summarization with Long-Document Transformers")
         gr.Markdown(
+            "An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://hf.co/datasets/kmfoda/booksum). Architectures [in this demo](https://hf.co/spaces/pszemraj/document-summarization) are [LongT5-base](https://hf.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://hf.co/pszemraj/pegasus-x-large-book-summary)."
         )
         with gr.Column():
             gr.Markdown("## Load Inputs & Select Parameters")
                     load_examples_button = gr.Button(
                         "Load Example in Dropdown",
                     )
+                    load_file_button = gr.Button("Load & Process File")
                 with gr.Column(variant="compact"):
                     example_name = gr.Dropdown(
                         _examples,
                 input_text = gr.Textbox(
                     lines=4,
                     max_lines=12,
+                    label="Text to Summarize",
                     placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
                 )
         gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Generate Summary")
+            with gr.Row():
+                summarize_button = gr.Button(
+                    "Summarize!",
+                    variant="primary",
+                )
+                gr.Markdown(
+                    "_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
+                )
             output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
             with gr.Column():
+                gr.Markdown("### Results & Scores")
                 with gr.Row():
                     with gr.Column(variant="compact"):
                         gr.Markdown(
                         )
                     with gr.Column(variant="compact"):
                         gr.Markdown(
+                            "Scores **roughly** represent the summary quality as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
                         )
                         summary_scores = gr.Textbox(
                             label="Summary Scores",
                             placeholder="Summary scores will appear here",
                         )
+            with gr.Column(variant="panel"):
+                gr.Markdown("### **Summary Output**")
                 summary_text = gr.HTML(
+                    label="Summary",
+                    value="<center><i>Summary will appear here!</i></center>",
                 )
             with gr.Column():
+                gr.Markdown("### **Aggregate Summary Batches**")
+                gr.Markdown(
+                    "_Note: this is an experimental feature. Feedback welcome in the [discussions](https://hf.co/spaces/pszemraj/document-summarization/discussions)!_"
+                )
+                with gr.Row():
+                    aggregate_button = gr.Button(
+                        "Aggregate!",
+                        variant="primary",
+                    )
+                    gr.Markdown(
+                        f"""Aggregate the above batches into a cohesive summary.
+                    - a secondary instruct-tuned LM consolidates info from the batches
+                    - current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
+                                """
+                    )
+                with gr.Column(variant="panel"):
+                    aggregated_summary = gr.HTML(
+                        label="Aggregate Summary",
+                        value="<center><i>Aggregate summary will appear here!</i></center>",
+                    )
+                    gr.Markdown(
+                        "\n\n_Aggregate summary also appended to the bottom of the `.txt` file!_"
+                    )
         gr.Markdown("---")
         with gr.Column():
                     value=3,
                 )
         with gr.Column():
+            gr.Markdown("## About")
             gr.Markdown(
+                "- Models are fine-tuned on the [🅱️ookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
             )
             gr.Markdown(
+                "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
             )
             gr.Markdown(
+                "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://hf.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
             )
             gr.Markdown("---")

utils.py CHANGED Viewed

@@ -4,6 +4,7 @@
 import logging
 import os
 import re
 import subprocess
 from collections import defaultdict, deque
 from datetime import datetime, timedelta
@@ -111,10 +112,9 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
     if not matches:
         return None
-    # Extract the parameter count and unit from the last match
     parameter_count, unit = matches[-1]
-    parameter_count = int(parameter_count)  # Convert to an integer
     # Convert to the standard form (M for million, G for billion, k for thousand)
     if unit == "G" or unit == "b":
@@ -129,7 +129,14 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
     return parameter_count > threshold
-def validate_pytorch2(torch_version: str = None):
     torch_version = torch.__version__ if torch_version is None else torch_version
     pattern = r"^2\.\d+(\.\d+)*"
@@ -140,8 +147,8 @@ def validate_pytorch2(torch_version: str = None):
 def get_timestamp(detailed=False) -> str:
     """
     get_timestamp - get a timestamp for the current time
-    Returns:
-        str, the timestamp
     """
     return (
         datetime.now().strftime("%b%d%Y_%H%M%S%f")
@@ -150,18 +157,13 @@ def get_timestamp(detailed=False) -> str:
     )
-def truncate_word_count(text, max_words=1024):
     """
-    truncate_word_count - a helper function for the gradio module
-    Parameters
-    ----------
-    text : str, required, the text to be processed
-    max_words : int, optional, the maximum number of words, default=512
-    Returns
-    -------
-    dict, the text and whether it was truncated
     """
-    # split on whitespace with regex
     words = re.split(r"\s+", text)
     processed = {}
     if len(words) > max_words:
@@ -176,8 +178,7 @@ def truncate_word_count(text, max_words=1024):
 def load_examples(src, filetypes=[".txt", ".pdf"]):
     """
     load_examples - a helper function for the gradio module to load examples
-    Returns:
-        list of str, the examples
     """
     src = Path(src)
     src.mkdir(exist_ok=True)
@@ -210,7 +211,8 @@ def load_example_filenames(example_path: str or Path):
     return examples
-def textlist2html(text_batches):
     # Step 1: Generate each summary batch as a string of HTML
     formatted_batches = [
         f"""
@@ -244,7 +246,7 @@ def textlist2html(text_batches):
     return text_html_block
-def extract_batches(html_string, pattern=None, flags=None) -> list:
     """
     Extract batches of text from an HTML string.
@@ -336,7 +338,7 @@ def extract_keywords(
 def saves_summary(
     summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
-):
     """
     saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file

 import logging
 import os
 import re
+import string
 import subprocess
 from collections import defaultdict, deque
 from datetime import datetime, timedelta
     if not matches:
         return None
+    # Extract the parameter count and unit
     parameter_count, unit = matches[-1]
+    parameter_count = int(parameter_count)
     # Convert to the standard form (M for million, G for billion, k for thousand)
     if unit == "G" or unit == "b":
     return parameter_count > threshold
+def validate_pytorch2(torch_version: str = None) -> bool:
+    """
+    validate_pytorch2 - validate that the PyTorch version is 2.0 or greater
+    :param str torch_version: the PyTorch version to validate, defaults to None
+    :return: True if the PyTorch version is 2.0 or greater, False otherwise
+    """
     torch_version = torch.__version__ if torch_version is None else torch_version
     pattern = r"^2\.\d+(\.\d+)*"
 def get_timestamp(detailed=False) -> str:
     """
     get_timestamp - get a timestamp for the current time
+    :param bool detailed: whether to include seconds and microseconds, defaults to False
+    :return: str, the timestamp
     """
     return (
         datetime.now().strftime("%b%d%Y_%H%M%S%f")
     )
+def truncate_word_count(text: str, max_words=1024) -> dict:
     """
+    truncate_word_count - truncate a text to a maximum number of words
+    :param str text: the text to truncate
+    :param int max_words: the maximum number of words to keep, defaults to 1024
+    :return: dict, the processed text
     """
     words = re.split(r"\s+", text)
     processed = {}
     if len(words) > max_words:
 def load_examples(src, filetypes=[".txt", ".pdf"]):
     """
     load_examples - a helper function for the gradio module to load examples
+    :param str src: the path to the examples
     """
     src = Path(src)
     src.mkdir(exist_ok=True)
     return examples
+def textlist2html(text_batches: List[str]) -> str:
+    """textlist2html - convert a list of text summaries into a single HTML string"""
     # Step 1: Generate each summary batch as a string of HTML
     formatted_batches = [
         f"""
     return text_html_block
+def extract_batches(html_string: str, pattern=None, flags=None) -> list:
     """
     Extract batches of text from an HTML string.
 def saves_summary(
     summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
+) -> Path:
     """
     saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file