Update extract_text_from_pdf.py
extract_text_from_pdf.py  CHANGED  +7 -7
@@ -2,7 +2,7 @@

import os
import torch
-import spaces
+#import spaces
from PyPDF2 import PdfReader
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -18,7 +18,7 @@ class PDFTextExtractor:
    """
    A class to handle PDF text extraction and preprocessing for podcast preparation.
    """
-
+    #@spaces.GPU
    def __init__(self, pdf_path, output_path):
        """
        Initialize the PDFTextExtractor with paths and model details.

@@ -49,7 +49,7 @@ class PDFTextExtractor:
        Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
        Here is the text:
        """
-
+    #@spaces.GPU
    def validate_pdf(self):
        """Check if the file exists and is a valid PDF."""
        if not os.path.exists(self.pdf_path):

@@ -60,7 +60,7 @@ class PDFTextExtractor:
            return False
        return True

-
+    #@spaces.GPU
    def extract_text(self):
        """Extract text from the PDF, limited by max_chars."""
        if not self.validate_pdf():

@@ -91,7 +91,7 @@ class PDFTextExtractor:
        final_text = '\n'.join(extracted_text)
        print(f"Extraction complete! Total characters: {len(final_text)}")
        return final_text
-
+    #@spaces.GPU
    def create_word_bounded_chunks(self, text):
        """Split text into chunks around the target size."""
        words = text.split()

@@ -114,7 +114,7 @@ class PDFTextExtractor:

        return chunks

-
+    #@spaces.GPU(duration=120)
    def process_chunk(self, text_chunk):
        """Process a text chunk with the model and return the cleaned text."""
        conversation = [

@@ -130,7 +130,7 @@ class PDFTextExtractor:

        processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
        return processed_text
-
+    #@spaces.GPU
    def clean_and_save_text(self):
        """Extract, clean, and save processed text to a file."""
        extracted_text = self.extract_text()
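Note: the diff above disables the Hugging Face ZeroGPU hooks by commenting out the `spaces` import and the `@spaces.GPU` decorators. A minimal sketch of an alternative, not part of this commit, would guard the import and fall back to a no-op decorator so the same class runs both on ZeroGPU Spaces and on plain hardware; the `gpu_decorator` name below is hypothetical:

# Sketch only (assumption, not from the commit): make the ZeroGPU dependency optional
# instead of commenting the decorators out by hand.
try:
    import spaces                      # available on Hugging Face ZeroGPU Spaces
    gpu_decorator = spaces.GPU         # use the real decorator when the package exists
except ImportError:
    def gpu_decorator(*args, **kwargs):
        # No-op fallback that supports both @gpu_decorator and @gpu_decorator(duration=120).
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]
        def wrap(func):
            return func
        return wrap


class PDFTextExtractor:
    @gpu_decorator(duration=120)
    def process_chunk(self, text_chunk):
        ...

With this pattern the decorators would not need to be commented out when the Space (or a local run) has no ZeroGPU support.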
|