Spaces:

Srikesh
/

pdf_chat

Sleeping

App Files Files Community

Srikesh committed on Oct 20

Commit

07f9cb7

verified ·

1 Parent(s): 30c2a19

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -10

app.py CHANGED Viewed

@@ -1,12 +1,20 @@
 import gradio as gr
 import fitz  # PyMuPDF
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import pipeline
-from io import BytesIO
-# ✅ Lightweight models
 embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
@@ -14,27 +22,35 @@ chunks = []
 vectors = None
 def extract_text_from_pdf(pdf_file):
-    """Safely extract text from PDF using PyMuPDF."""
-    # Handle Gradio NamedString or file object
     if hasattr(pdf_file, "read"):
         file_bytes = pdf_file.read()
     else:
         file_bytes = BytesIO(pdf_file.encode("utf-8")).read()
     text = ""
     with fitz.open(stream=file_bytes, filetype="pdf") as doc:
         for page in doc:
-            text += page.get_text("text") + "\n"
     return text.strip()
 def create_embeddings(text):
-    """Split text and create embeddings."""
     global chunks, vectors
     chunk_size = 800
     chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
     vectors = embedding_model.encode(chunks)
 def chat_with_pdf(query):
-    """Answer questions using semantic similarity."""
     if not chunks:
         return "❗ Please upload a PDF first."
     q_vec = embedding_model.encode([query])
@@ -49,15 +65,15 @@ def handle_pdf_upload(pdf_file):
     try:
         text = extract_text_from_pdf(pdf_file)
         if not text:
-            return "❌ No readable text found (scanned or image-based PDF)."
         create_embeddings(text)
         return "✅ PDF processed successfully. You can now chat!"
     except Exception as e:
         return f"❌ Error: {str(e)}"
-# 🎨 Gradio Interface
 with gr.Blocks() as app:
-    gr.Markdown("## 🤖 Chat with Your PDF — Lightweight & Fast")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     status_box = gr.Textbox(label="Status", interactive=False)
     pdf_input.change(fn=handle_pdf_upload, inputs=pdf_input, outputs=status_box)

 import gradio as gr
 import fitz  # PyMuPDF
 import numpy as np
+from io import BytesIO
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import pipeline
+# Optional OCR
+try:
+    import pytesseract
+    from PIL import Image
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+# Models
 embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
 vectors = None
 def extract_text_from_pdf(pdf_file):
+    """Extract text; fallback to OCR if no text found."""
+    # Handle NamedString or file object
     if hasattr(pdf_file, "read"):
         file_bytes = pdf_file.read()
     else:
         file_bytes = BytesIO(pdf_file.encode("utf-8")).read()
     text = ""
     with fitz.open(stream=file_bytes, filetype="pdf") as doc:
         for page in doc:
+            page_text = page.get_text("text")
+            text += page_text + "\n"
+        # If no text found, try OCR
+        if not text.strip() and OCR_AVAILABLE:
+            for page in doc:
+                pix = page.get_pixmap()
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                text += pytesseract.image_to_string(img) + "\n"
     return text.strip()
 def create_embeddings(text):
     global chunks, vectors
     chunk_size = 800
     chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
     vectors = embedding_model.encode(chunks)
 def chat_with_pdf(query):
     if not chunks:
         return "❗ Please upload a PDF first."
     q_vec = embedding_model.encode([query])
     try:
         text = extract_text_from_pdf(pdf_file)
         if not text:
+            return "❌ No readable text found. If this is a scanned PDF, install pytesseract for OCR."
         create_embeddings(text)
         return "✅ PDF processed successfully. You can now chat!"
     except Exception as e:
         return f"❌ Error: {str(e)}"
+# Gradio UI
 with gr.Blocks() as app:
+    gr.Markdown("## 🤖 Chat with Your PDF — OCR fallback for scanned PDFs")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     status_box = gr.Textbox(label="Status", interactive=False)
     pdf_input.change(fn=handle_pdf_upload, inputs=pdf_input, outputs=status_box)