document-summarization

Runtime error

App Files Community

pszemraj committed on Oct 4, 2022

Commit

5f2c216

1 Parent(s): 9350787

✨ add ability to load PDF

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

app.py +54 -14

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import logging
 import time
 from pathlib import Path
@@ -5,6 +6,9 @@ from pathlib import Path
 import gradio as gr
 import nltk
 from cleantext import clean
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import load_example_filenames, truncate_word_count
@@ -101,6 +105,7 @@ def proc_submission(
 def load_single_example_text(
     example_path: str or Path,
 ):
     """
     load_single_example - a helper function for the gradio module to load examples
@@ -110,14 +115,26 @@ def load_single_example_text(
     global name_to_path
     full_ex_path = name_to_path[example_path]
     full_ex_path = Path(full_ex_path)
-    # load the examples into a list
-    with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
-        raw_text = f.read()
         text = clean(raw_text, lower=False)
     return text
-def load_uploaded_file(file_obj):
     """
     load_uploaded_file - process an uploaded file
@@ -135,29 +152,52 @@ def load_uploaded_file(file_obj):
         file_obj = file_obj[0]
     file_path = Path(file_obj.name)
     try:
-        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-            raw_text = f.read()
-        text = clean(raw_text, lower=False)
         return text
     except Exception as e:
         logging.info(f"Trying to load file with path {file_path}, error: {e}")
-        return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8."
 if __name__ == "__main__":
-    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
-    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
     demo = gr.Blocks()
     with demo:
-        gr.Markdown("# Long-Form Summarization: LED & BookSum")
         gr.Markdown(
-            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
         )
         with gr.Column():

+import contextlib
 import logging
 import time
 from pathlib import Path
 import gradio as gr
 import nltk
 from cleantext import clean
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import load_example_filenames, truncate_word_count
 def load_single_example_text(
     example_path: str or Path,
+    max_pages=20,
 ):
     """
     load_single_example - a helper function for the gradio module to load examples
     global name_to_path
     full_ex_path = name_to_path[example_path]
     full_ex_path = Path(full_ex_path)
+    if full_ex_path.suffix == ".txt":
+        with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
+            raw_text = f.read()
         text = clean(raw_text, lower=False)
+    elif full_ex_path.suffix == ".pdf":
+        logging.info(f"Loading PDF file {full_ex_path}")
+        conversion_stats = convert_PDF_to_Text(
+            full_ex_path,
+            ocr_model=ocr_model,
+            max_pages=max_pages,
+        )
+        text = conversion_stats["converted_text"]
+    else:
+        logging.error(f"Unknown file type {full_ex_path.suffix}")
+        text = "ERROR - check example path"
     return text
+def load_uploaded_file(file_obj, max_pages=20):
     """
     load_uploaded_file - process an uploaded file
         file_obj = file_obj[0]
     file_path = Path(file_obj.name)
     try:
+        if file_path.suffix == ".txt":
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                raw_text = f.read()
+            text = clean(raw_text, lower=False)
+        elif file_path.suffix == ".pdf":
+            logging.info(f"Loading PDF file {file_path}")
+            conversion_stats = convert_PDF_to_Text(
+                file_path,
+                ocr_model=ocr_model,
+                max_pages=max_pages,
+            )
+            text = conversion_stats["converted_text"]
+        else:
+            logging.error(f"Unknown file type {file_path.suffix}")
+            text = "ERROR - check example path"
         return text
     except Exception as e:
         logging.info(f"Trying to load file with path {file_path}, error: {e}")
+        return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
 if __name__ == "__main__":
+    logging.info("Starting app instance")
+    logging.info("Loading summ models")
+    model, tokenizer = load_model_and_tokenizer("pszemraj/pegasus-x-large-book-summary")
+    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/long-t5-tglobal-base-16384-book-summary")
+    logging.info("Loading OCR model")
+    with contextlib.redirect_stdout(None):
+        ocr_model = ocr_predictor(
+            "db_resnet50",
+            "crnn_mobilenet_v3_large",
+            pretrained=True,
+            assume_straight_pages=True,
+        )
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
     demo = gr.Blocks()
     with demo:
+        gr.Markdown("# Document Summarization with Long-Document Transformers")
         gr.Markdown(
+            "TODO: Add a description of the model and how it works, and a link to the paper"
         )
         with gr.Column():