Spaces:
Runtime error
Runtime error
improve docs and UI
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- aggregate.py +6 -3
- app.py +61 -39
- utils.py +23 -21
aggregate.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
aggregate.py
|
| 3 |
Primary usage is through the BatchAggregator class.
|
| 4 |
|
| 5 |
How it works:
|
|
@@ -29,7 +29,8 @@ class BatchAggregator:
|
|
| 29 |
Usage:
|
| 30 |
>>> from aggregate import BatchAggregator
|
| 31 |
>>> aggregator = BatchAggregator()
|
| 32 |
-
>>> aggregator.
|
|
|
|
| 33 |
"""
|
| 34 |
|
| 35 |
GENERIC_CONFIG = GenerationConfig(
|
|
@@ -187,7 +188,7 @@ class BatchAggregator:
|
|
| 187 |
**kwargs,
|
| 188 |
) -> str:
|
| 189 |
f"""
|
| 190 |
-
|
| 191 |
|
| 192 |
Args:
|
| 193 |
text_list (list): The texts to summarize.
|
|
@@ -211,6 +212,8 @@ class BatchAggregator:
|
|
| 211 |
self.logger.info(
|
| 212 |
f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
|
| 213 |
)
|
|
|
|
|
|
|
| 214 |
return result
|
| 215 |
|
| 216 |
def count_tokens(self, text: str) -> int:
|
|
|
|
| 1 |
"""
|
| 2 |
+
aggregate.py - module for aggregating text from multiple sources/multiple parts of a single source.
|
| 3 |
Primary usage is through the BatchAggregator class.
|
| 4 |
|
| 5 |
How it works:
|
|
|
|
| 29 |
Usage:
|
| 30 |
>>> from aggregate import BatchAggregator
|
| 31 |
>>> aggregator = BatchAggregator()
|
| 32 |
+
>>> agg = aggregator.infer_aggregate(["This is a test", "This is another test"])
|
| 33 |
+
>>> print(agg)
|
| 34 |
"""
|
| 35 |
|
| 36 |
GENERIC_CONFIG = GenerationConfig(
|
|
|
|
| 188 |
**kwargs,
|
| 189 |
) -> str:
|
| 190 |
f"""
|
| 191 |
+
infer_aggregate - infers a consolidated summary from a list of texts.
|
| 192 |
|
| 193 |
Args:
|
| 194 |
text_list (list): The texts to summarize.
|
|
|
|
| 212 |
self.logger.info(
|
| 213 |
f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
|
| 214 |
)
|
| 215 |
+
self.logger.debug(f"Generated text:\n{result}")
|
| 216 |
+
|
| 217 |
return result
|
| 218 |
|
| 219 |
def count_tokens(self, text: str) -> int:
|
app.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
app.py - the main module for the gradio app for summarization
|
| 3 |
|
| 4 |
Usage:
|
| 5 |
-
python app.py
|
| 6 |
|
| 7 |
Environment Variables:
|
| 8 |
USE_TORCH (str): whether to use torch (1) or not (0)
|
|
@@ -20,7 +20,6 @@ import random
|
|
| 20 |
import re
|
| 21 |
import time
|
| 22 |
from pathlib import Path
|
| 23 |
-
import pprint as pp
|
| 24 |
|
| 25 |
os.environ["USE_TORCH"] = "1"
|
| 26 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
@@ -33,18 +32,19 @@ logging.basicConfig(
|
|
| 33 |
import gradio as gr
|
| 34 |
import nltk
|
| 35 |
import torch
|
| 36 |
-
from aggregate import BatchAggregator
|
| 37 |
from cleantext import clean
|
| 38 |
from doctr.models import ocr_predictor
|
|
|
|
|
|
|
| 39 |
from pdf2text import convert_PDF_to_Text
|
| 40 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
| 41 |
from utils import (
|
| 42 |
extract_batches,
|
| 43 |
load_example_filenames,
|
|
|
|
| 44 |
saves_summary,
|
| 45 |
textlist2html,
|
| 46 |
truncate_word_count,
|
| 47 |
-
remove_stagnant_files,
|
| 48 |
)
|
| 49 |
|
| 50 |
_here = Path(__file__).parent
|
|
@@ -62,12 +62,13 @@ MODEL_OPTIONS = [
|
|
| 62 |
] # models users can choose from
|
| 63 |
|
| 64 |
SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
|
|
|
|
| 65 |
|
| 66 |
-
# if duplicating space
|
| 67 |
# os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
|
| 68 |
# os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
|
| 69 |
|
| 70 |
-
aggregator = BatchAggregator(
|
| 71 |
|
| 72 |
|
| 73 |
def aggregate_text(
|
|
@@ -77,8 +78,8 @@ def aggregate_text(
|
|
| 77 |
"""
|
| 78 |
Aggregate the text from the batches.
|
| 79 |
|
| 80 |
-
NOTE: you should probably include
|
| 81 |
-
|
| 82 |
:param batches_html: The batches to aggregate, in html format
|
| 83 |
:param text_file: The text file to append the aggregate summary to
|
| 84 |
:return: The aggregate summary in html format
|
|
@@ -104,13 +105,13 @@ def aggregate_text(
|
|
| 104 |
content_batches = [batch["content"] for batch in extracted_batches]
|
| 105 |
full_summary = aggregator.infer_aggregate(content_batches)
|
| 106 |
|
| 107 |
-
# if a path that exists is provided,
|
| 108 |
if out_path:
|
| 109 |
out_path = Path(out_path)
|
| 110 |
|
| 111 |
try:
|
| 112 |
with open(out_path, "a", encoding="utf-8") as f:
|
| 113 |
-
f.write("\n\n
|
| 114 |
f.write(
|
| 115 |
"- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
|
| 116 |
)
|
|
@@ -341,9 +342,9 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
| 341 |
raw_text = f.read()
|
| 342 |
text = clean(raw_text, lower=lower)
|
| 343 |
elif file_path.suffix == ".pdf":
|
| 344 |
-
logger.info(f"loading
|
| 345 |
max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
|
| 346 |
-
logger.info(f"max_pages
|
| 347 |
conversion_stats = convert_PDF_to_Text(
|
| 348 |
file_path,
|
| 349 |
ocr_model=ocr_model,
|
|
@@ -357,13 +358,15 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
| 357 |
return text
|
| 358 |
except Exception as e:
|
| 359 |
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
| 360 |
-
return "Error: Could not read file.
|
| 361 |
|
| 362 |
|
| 363 |
def parse_args():
|
|
|
|
| 364 |
parser = argparse.ArgumentParser(
|
| 365 |
-
description="Document Summarization with Long-Document Transformers",
|
| 366 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
|
|
| 367 |
)
|
| 368 |
parser.add_argument(
|
| 369 |
"--share",
|
|
@@ -415,7 +418,7 @@ if __name__ == "__main__":
|
|
| 415 |
with demo:
|
| 416 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
| 417 |
gr.Markdown(
|
| 418 |
-
"An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://
|
| 419 |
)
|
| 420 |
with gr.Column():
|
| 421 |
gr.Markdown("## Load Inputs & Select Parameters")
|
|
@@ -440,7 +443,7 @@ if __name__ == "__main__":
|
|
| 440 |
load_examples_button = gr.Button(
|
| 441 |
"Load Example in Dropdown",
|
| 442 |
)
|
| 443 |
-
load_file_button = gr.Button("Load
|
| 444 |
with gr.Column(variant="compact"):
|
| 445 |
example_name = gr.Dropdown(
|
| 446 |
_examples,
|
|
@@ -457,22 +460,23 @@ if __name__ == "__main__":
|
|
| 457 |
input_text = gr.Textbox(
|
| 458 |
lines=4,
|
| 459 |
max_lines=12,
|
| 460 |
-
label="
|
| 461 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
| 462 |
)
|
| 463 |
gr.Markdown("---")
|
| 464 |
with gr.Column():
|
| 465 |
gr.Markdown("## Generate Summary")
|
| 466 |
-
gr.
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
| 473 |
output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
|
| 474 |
with gr.Column():
|
| 475 |
-
gr.Markdown("
|
| 476 |
with gr.Row():
|
| 477 |
with gr.Column(variant="compact"):
|
| 478 |
gr.Markdown(
|
|
@@ -486,24 +490,42 @@ if __name__ == "__main__":
|
|
| 486 |
)
|
| 487 |
with gr.Column(variant="compact"):
|
| 488 |
gr.Markdown(
|
| 489 |
-
"Scores represent the summary quality
|
| 490 |
)
|
| 491 |
summary_scores = gr.Textbox(
|
| 492 |
label="Summary Scores",
|
| 493 |
placeholder="Summary scores will appear here",
|
| 494 |
)
|
| 495 |
-
with gr.Column():
|
| 496 |
-
gr.Markdown("
|
| 497 |
summary_text = gr.HTML(
|
| 498 |
-
label="Summary",
|
|
|
|
| 499 |
)
|
| 500 |
with gr.Column():
|
| 501 |
-
gr.Markdown("
|
| 502 |
-
|
| 503 |
-
"
|
| 504 |
-
|
| 505 |
-
)
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
gr.Markdown("---")
|
| 509 |
with gr.Column():
|
|
@@ -539,15 +561,15 @@ if __name__ == "__main__":
|
|
| 539 |
value=3,
|
| 540 |
)
|
| 541 |
with gr.Column():
|
| 542 |
-
gr.Markdown("
|
| 543 |
gr.Markdown(
|
| 544 |
-
"- Models are fine-tuned on the [
|
| 545 |
)
|
| 546 |
gr.Markdown(
|
| 547 |
-
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://
|
| 548 |
)
|
| 549 |
gr.Markdown(
|
| 550 |
-
"Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://
|
| 551 |
)
|
| 552 |
gr.Markdown("---")
|
| 553 |
|
|
|
|
| 2 |
app.py - the main module for the gradio app for summarization
|
| 3 |
|
| 4 |
Usage:
|
| 5 |
+
python app.py --help
|
| 6 |
|
| 7 |
Environment Variables:
|
| 8 |
USE_TORCH (str): whether to use torch (1) or not (0)
|
|
|
|
| 20 |
import re
|
| 21 |
import time
|
| 22 |
from pathlib import Path
|
|
|
|
| 23 |
|
| 24 |
os.environ["USE_TORCH"] = "1"
|
| 25 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
| 32 |
import gradio as gr
|
| 33 |
import nltk
|
| 34 |
import torch
|
|
|
|
| 35 |
from cleantext import clean
|
| 36 |
from doctr.models import ocr_predictor
|
| 37 |
+
|
| 38 |
+
from aggregate import BatchAggregator
|
| 39 |
from pdf2text import convert_PDF_to_Text
|
| 40 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
| 41 |
from utils import (
|
| 42 |
extract_batches,
|
| 43 |
load_example_filenames,
|
| 44 |
+
remove_stagnant_files,
|
| 45 |
saves_summary,
|
| 46 |
textlist2html,
|
| 47 |
truncate_word_count,
|
|
|
|
| 48 |
)
|
| 49 |
|
| 50 |
_here = Path(__file__).parent
|
|
|
|
| 62 |
] # models users can choose from
|
| 63 |
|
| 64 |
SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
|
| 65 |
+
AGGREGATE_MODEL = "MBZUAI/LaMini-Flan-T5-783M" # model to use for aggregation
|
| 66 |
|
| 67 |
+
# if duplicating space: uncomment this line to adjust the max words
|
| 68 |
# os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
|
| 69 |
# os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
|
| 70 |
|
| 71 |
+
aggregator = BatchAggregator(AGGREGATE_MODEL)
|
| 72 |
|
| 73 |
|
| 74 |
def aggregate_text(
|
|
|
|
| 78 |
"""
|
| 79 |
Aggregate the text from the batches.
|
| 80 |
|
| 81 |
+
NOTE: you should probably include the BatchAggregator object as a fn arg if using this code
|
| 82 |
+
|
| 83 |
:param batches_html: The batches to aggregate, in html format
|
| 84 |
:param text_file: The text file to append the aggregate summary to
|
| 85 |
:return: The aggregate summary in html format
|
|
|
|
| 105 |
content_batches = [batch["content"] for batch in extracted_batches]
|
| 106 |
full_summary = aggregator.infer_aggregate(content_batches)
|
| 107 |
|
| 108 |
+
# if a path that exists is provided, append the summary with markdown formatting
|
| 109 |
if out_path:
|
| 110 |
out_path = Path(out_path)
|
| 111 |
|
| 112 |
try:
|
| 113 |
with open(out_path, "a", encoding="utf-8") as f:
|
| 114 |
+
f.write("\n\n## Aggregate Summary\n\n")
|
| 115 |
f.write(
|
| 116 |
"- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
|
| 117 |
)
|
|
|
|
| 342 |
raw_text = f.read()
|
| 343 |
text = clean(raw_text, lower=lower)
|
| 344 |
elif file_path.suffix == ".pdf":
|
| 345 |
+
logger.info(f"loading a PDF file: {file_path.name}")
|
| 346 |
max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
|
| 347 |
+
logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
| 348 |
conversion_stats = convert_PDF_to_Text(
|
| 349 |
file_path,
|
| 350 |
ocr_model=ocr_model,
|
|
|
|
| 358 |
return text
|
| 359 |
except Exception as e:
|
| 360 |
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
| 361 |
+
return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
|
| 362 |
|
| 363 |
|
| 364 |
def parse_args():
|
| 365 |
+
"""arguments for the command line interface"""
|
| 366 |
parser = argparse.ArgumentParser(
|
| 367 |
+
description="Document Summarization with Long-Document Transformers Demo",
|
| 368 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
| 369 |
+
epilog="Runs a local-only web app to summarize documents. use --share for a public link to share.",
|
| 370 |
)
|
| 371 |
parser.add_argument(
|
| 372 |
"--share",
|
|
|
|
| 418 |
with demo:
|
| 419 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
| 420 |
gr.Markdown(
|
| 421 |
+
"An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://hf.co/datasets/kmfoda/booksum). Architectures [in this demo](https://hf.co/spaces/pszemraj/document-summarization) are [LongT5-base](https://hf.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://hf.co/pszemraj/pegasus-x-large-book-summary)."
|
| 422 |
)
|
| 423 |
with gr.Column():
|
| 424 |
gr.Markdown("## Load Inputs & Select Parameters")
|
|
|
|
| 443 |
load_examples_button = gr.Button(
|
| 444 |
"Load Example in Dropdown",
|
| 445 |
)
|
| 446 |
+
load_file_button = gr.Button("Load & Process File")
|
| 447 |
with gr.Column(variant="compact"):
|
| 448 |
example_name = gr.Dropdown(
|
| 449 |
_examples,
|
|
|
|
| 460 |
input_text = gr.Textbox(
|
| 461 |
lines=4,
|
| 462 |
max_lines=12,
|
| 463 |
+
label="Text to Summarize",
|
| 464 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
| 465 |
)
|
| 466 |
gr.Markdown("---")
|
| 467 |
with gr.Column():
|
| 468 |
gr.Markdown("## Generate Summary")
|
| 469 |
+
with gr.Row():
|
| 470 |
+
summarize_button = gr.Button(
|
| 471 |
+
"Summarize!",
|
| 472 |
+
variant="primary",
|
| 473 |
+
)
|
| 474 |
+
gr.Markdown(
|
| 475 |
+
"_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
|
| 476 |
+
)
|
| 477 |
output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
|
| 478 |
with gr.Column():
|
| 479 |
+
gr.Markdown("### Results & Scores")
|
| 480 |
with gr.Row():
|
| 481 |
with gr.Column(variant="compact"):
|
| 482 |
gr.Markdown(
|
|
|
|
| 490 |
)
|
| 491 |
with gr.Column(variant="compact"):
|
| 492 |
gr.Markdown(
|
| 493 |
+
"Scores **roughly** represent the summary quality as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
|
| 494 |
)
|
| 495 |
summary_scores = gr.Textbox(
|
| 496 |
label="Summary Scores",
|
| 497 |
placeholder="Summary scores will appear here",
|
| 498 |
)
|
| 499 |
+
with gr.Column(variant="panel"):
|
| 500 |
+
gr.Markdown("### **Summary Output**")
|
| 501 |
summary_text = gr.HTML(
|
| 502 |
+
label="Summary",
|
| 503 |
+
value="<center><i>Summary will appear here!</i></center>",
|
| 504 |
)
|
| 505 |
with gr.Column():
|
| 506 |
+
gr.Markdown("### **Aggregate Summary Batches**")
|
| 507 |
+
gr.Markdown(
|
| 508 |
+
"_Note: this is an experimental feature. Feedback welcome in the [discussions](https://hf.co/spaces/pszemraj/document-summarization/discussions)!_"
|
| 509 |
+
)
|
| 510 |
+
with gr.Row():
|
| 511 |
+
aggregate_button = gr.Button(
|
| 512 |
+
"Aggregate!",
|
| 513 |
+
variant="primary",
|
| 514 |
+
)
|
| 515 |
+
gr.Markdown(
|
| 516 |
+
f"""Aggregate the above batches into a cohesive summary.
|
| 517 |
+
- a secondary instruct-tuned LM consolidates info from the batches
|
| 518 |
+
- current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
|
| 519 |
+
"""
|
| 520 |
+
)
|
| 521 |
+
with gr.Column(variant="panel"):
|
| 522 |
+
aggregated_summary = gr.HTML(
|
| 523 |
+
label="Aggregate Summary",
|
| 524 |
+
value="<center><i>Aggregate summary will appear here!</i></center>",
|
| 525 |
+
)
|
| 526 |
+
gr.Markdown(
|
| 527 |
+
"\n\n_Aggregate summary also appended to the bottom of the `.txt` file!_"
|
| 528 |
+
)
|
| 529 |
|
| 530 |
gr.Markdown("---")
|
| 531 |
with gr.Column():
|
|
|
|
| 561 |
value=3,
|
| 562 |
)
|
| 563 |
with gr.Column():
|
| 564 |
+
gr.Markdown("## About")
|
| 565 |
gr.Markdown(
|
| 566 |
+
"- Models are fine-tuned on the [🅱️ookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
|
| 567 |
)
|
| 568 |
gr.Markdown(
|
| 569 |
+
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
|
| 570 |
)
|
| 571 |
gr.Markdown(
|
| 572 |
+
"Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://hf.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
|
| 573 |
)
|
| 574 |
gr.Markdown("---")
|
| 575 |
|
utils.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import re
|
|
|
|
| 7 |
import subprocess
|
| 8 |
from collections import defaultdict, deque
|
| 9 |
from datetime import datetime, timedelta
|
|
@@ -111,10 +112,9 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
|
|
| 111 |
if not matches:
|
| 112 |
return None
|
| 113 |
|
| 114 |
-
# Extract the parameter count and unit
|
| 115 |
parameter_count, unit = matches[-1]
|
| 116 |
-
|
| 117 |
-
parameter_count = int(parameter_count) # Convert to an integer
|
| 118 |
|
| 119 |
# Convert to the standard form (M for million, G for billion, k for thousand)
|
| 120 |
if unit == "G" or unit == "b":
|
|
@@ -129,7 +129,14 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
|
|
| 129 |
return parameter_count > threshold
|
| 130 |
|
| 131 |
|
| 132 |
-
def validate_pytorch2(torch_version: str = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
torch_version = torch.__version__ if torch_version is None else torch_version
|
| 134 |
|
| 135 |
pattern = r"^2\.\d+(\.\d+)*"
|
|
@@ -140,8 +147,8 @@ def validate_pytorch2(torch_version: str = None):
|
|
| 140 |
def get_timestamp(detailed=False) -> str:
|
| 141 |
"""
|
| 142 |
get_timestamp - get a timestamp for the current time
|
| 143 |
-
|
| 144 |
-
|
| 145 |
"""
|
| 146 |
return (
|
| 147 |
datetime.now().strftime("%b%d%Y_%H%M%S%f")
|
|
@@ -150,18 +157,13 @@ def get_timestamp(detailed=False) -> str:
|
|
| 150 |
)
|
| 151 |
|
| 152 |
|
| 153 |
-
def truncate_word_count(text, max_words=1024):
|
| 154 |
"""
|
| 155 |
-
truncate_word_count - a
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
max_words : int, optional, the maximum number of words, default=512
|
| 160 |
-
Returns
|
| 161 |
-
-------
|
| 162 |
-
dict, the text and whether it was truncated
|
| 163 |
"""
|
| 164 |
-
# split on whitespace with regex
|
| 165 |
words = re.split(r"\s+", text)
|
| 166 |
processed = {}
|
| 167 |
if len(words) > max_words:
|
|
@@ -176,8 +178,7 @@ def truncate_word_count(text, max_words=1024):
|
|
| 176 |
def load_examples(src, filetypes=[".txt", ".pdf"]):
|
| 177 |
"""
|
| 178 |
load_examples - a helper function for the gradio module to load examples
|
| 179 |
-
|
| 180 |
-
list of str, the examples
|
| 181 |
"""
|
| 182 |
src = Path(src)
|
| 183 |
src.mkdir(exist_ok=True)
|
|
@@ -210,7 +211,8 @@ def load_example_filenames(example_path: str or Path):
|
|
| 210 |
return examples
|
| 211 |
|
| 212 |
|
| 213 |
-
def textlist2html(text_batches):
|
|
|
|
| 214 |
# Step 1: Generate each summary batch as a string of HTML
|
| 215 |
formatted_batches = [
|
| 216 |
f"""
|
|
@@ -244,7 +246,7 @@ def textlist2html(text_batches):
|
|
| 244 |
return text_html_block
|
| 245 |
|
| 246 |
|
| 247 |
-
def extract_batches(html_string, pattern=None, flags=None) -> list:
|
| 248 |
"""
|
| 249 |
Extract batches of text from an HTML string.
|
| 250 |
|
|
@@ -336,7 +338,7 @@ def extract_keywords(
|
|
| 336 |
|
| 337 |
def saves_summary(
|
| 338 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
| 339 |
-
):
|
| 340 |
"""
|
| 341 |
saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
|
| 342 |
|
|
|
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
+
import string
|
| 8 |
import subprocess
|
| 9 |
from collections import defaultdict, deque
|
| 10 |
from datetime import datetime, timedelta
|
|
|
|
| 112 |
if not matches:
|
| 113 |
return None
|
| 114 |
|
| 115 |
+
# Extract the parameter count and unit
|
| 116 |
parameter_count, unit = matches[-1]
|
| 117 |
+
parameter_count = int(parameter_count)
|
|
|
|
| 118 |
|
| 119 |
# Convert to the standard form (M for million, G for billion, k for thousand)
|
| 120 |
if unit == "G" or unit == "b":
|
|
|
|
| 129 |
return parameter_count > threshold
|
| 130 |
|
| 131 |
|
| 132 |
+
def validate_pytorch2(torch_version: str = None) -> bool:
|
| 133 |
+
"""
|
| 134 |
+
validate_pytorch2 - validate that the PyTorch version is 2.0 or greater
|
| 135 |
+
|
| 136 |
+
:param str torch_version: the PyTorch version to validate, defaults to None
|
| 137 |
+
:return: True if the PyTorch version is 2.0 or greater, False otherwise
|
| 138 |
+
"""
|
| 139 |
+
|
| 140 |
torch_version = torch.__version__ if torch_version is None else torch_version
|
| 141 |
|
| 142 |
pattern = r"^2\.\d+(\.\d+)*"
|
|
|
|
| 147 |
def get_timestamp(detailed=False) -> str:
|
| 148 |
"""
|
| 149 |
get_timestamp - get a timestamp for the current time
|
| 150 |
+
:param bool detailed: whether to include seconds and microseconds, defaults to False
|
| 151 |
+
:return: str, the timestamp
|
| 152 |
"""
|
| 153 |
return (
|
| 154 |
datetime.now().strftime("%b%d%Y_%H%M%S%f")
|
|
|
|
| 157 |
)
|
| 158 |
|
| 159 |
|
| 160 |
+
def truncate_word_count(text: str, max_words=1024) -> dict:
|
| 161 |
"""
|
| 162 |
+
truncate_word_count - truncate a text to a maximum number of words
|
| 163 |
+
:param str text: the text to truncate
|
| 164 |
+
:param int max_words: the maximum number of words to keep, defaults to 1024
|
| 165 |
+
:return: dict, the processed text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
"""
|
|
|
|
| 167 |
words = re.split(r"\s+", text)
|
| 168 |
processed = {}
|
| 169 |
if len(words) > max_words:
|
|
|
|
| 178 |
def load_examples(src, filetypes=[".txt", ".pdf"]):
|
| 179 |
"""
|
| 180 |
load_examples - a helper function for the gradio module to load examples
|
| 181 |
+
:param str src: the path to the examples
|
|
|
|
| 182 |
"""
|
| 183 |
src = Path(src)
|
| 184 |
src.mkdir(exist_ok=True)
|
|
|
|
| 211 |
return examples
|
| 212 |
|
| 213 |
|
| 214 |
+
def textlist2html(text_batches: List[str]) -> str:
|
| 215 |
+
"""textlist2html - convert a list of text summaries into a single HTML string"""
|
| 216 |
# Step 1: Generate each summary batch as a string of HTML
|
| 217 |
formatted_batches = [
|
| 218 |
f"""
|
|
|
|
| 246 |
return text_html_block
|
| 247 |
|
| 248 |
|
| 249 |
+
def extract_batches(html_string: str, pattern=None, flags=None) -> list:
|
| 250 |
"""
|
| 251 |
Extract batches of text from an HTML string.
|
| 252 |
|
|
|
|
| 338 |
|
| 339 |
def saves_summary(
|
| 340 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
| 341 |
+
) -> Path:
|
| 342 |
"""
|
| 343 |
saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
|
| 344 |
|