Update extract_text_from_pdf.py
extract_text_from_pdf.py  CHANGED  +7 -7
@@ -2,7 +2,7 @@

import os
import torch
-import spaces
+#import spaces
from PyPDF2 import PdfReader
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -18,7 +18,7 @@ class PDFTextExtractor:
    """
    A class to handle PDF text extraction and preprocessing for podcast preparation.
    """
-
+    #@spaces.GPU
    def __init__(self, pdf_path, output_path):
        """
        Initialize the PDFTextExtractor with paths and model details.

@@ -49,7 +49,7 @@ class PDFTextExtractor:
        Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
        Here is the text:
        """
-
+    #@spaces.GPU
    def validate_pdf(self):
        """Check if the file exists and is a valid PDF."""
        if not os.path.exists(self.pdf_path):

@@ -60,7 +60,7 @@ class PDFTextExtractor:
            return False
        return True

-
+    #@spaces.GPU
    def extract_text(self):
        """Extract text from the PDF, limited by max_chars."""
        if not self.validate_pdf():

@@ -91,7 +91,7 @@ class PDFTextExtractor:
        final_text = '\n'.join(extracted_text)
        print(f"Extraction complete! Total characters: {len(final_text)}")
        return final_text
-
+    #@spaces.GPU
    def create_word_bounded_chunks(self, text):
        """Split text into chunks around the target size."""
        words = text.split()

@@ -114,7 +114,7 @@ class PDFTextExtractor:

        return chunks

-
+    #@spaces.GPU(duration=120)
    def process_chunk(self, text_chunk):
        """Process a text chunk with the model and return the cleaned text."""
        conversation = [

@@ -130,7 +130,7 @@ class PDFTextExtractor:

        processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
        return processed_text
-
+    #@spaces.GPU
    def clean_and_save_text(self):
        """Extract, clean, and save processed text to a file."""
        extracted_text = self.extract_text()
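Note: the diff above disables the Hugging Face ZeroGPU hooks by commenting out the `spaces` import and the `@spaces.GPU` decorators. A minimal sketch of an alternative, not part of this commit, would guard the import and fall back to a no-op decorator so the same class runs both on ZeroGPU Spaces and on plain hardware; the `gpu_decorator` name below is hypothetical:

# Sketch only (assumption, not from the commit): make the ZeroGPU dependency optional
# instead of commenting the decorators out by hand.
try:
    import spaces                      # available on Hugging Face ZeroGPU Spaces
    gpu_decorator = spaces.GPU         # use the real decorator when the package exists
except ImportError:
    def gpu_decorator(*args, **kwargs):
        # No-op fallback that supports both @gpu_decorator and @gpu_decorator(duration=120).
        if len(args) == 1 and callable(args[0]) and not kwargs:
            return args[0]
        def wrap(func):
            return func
        return wrap


class PDFTextExtractor:
    @gpu_decorator(duration=120)
    def process_chunk(self, text_chunk):
        ...

With this pattern the decorators would not need to be commented out when the Space (or a local run) has no ZeroGPU support.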
|