Srikesh committed on
Commit 07f9cb7 · verified
1 Parent(s): 30c2a19

Update app.py

Files changed (1)
  1. app.py +26 -10
app.py CHANGED
@@ -1,12 +1,20 @@
 import gradio as gr
 import fitz # PyMuPDF
 import numpy as np
+from io import BytesIO
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import pipeline
-from io import BytesIO
 
-# ✅ Lightweight models
+# Optional OCR
+try:
+    import pytesseract
+    from PIL import Image
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+
+# Models
 embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 qa_model = pipeline("text2text-generation", model="google/flan-t5-base")
 
@@ -14,27 +22,35 @@ chunks = []
 vectors = None
 
 def extract_text_from_pdf(pdf_file):
-    """Safely extract text from PDF using PyMuPDF."""
-    # Handle Gradio NamedString or file object
+    """Extract text; fallback to OCR if no text found."""
+    # Handle NamedString or file object
     if hasattr(pdf_file, "read"):
         file_bytes = pdf_file.read()
     else:
         file_bytes = BytesIO(pdf_file.encode("utf-8")).read()
+
     text = ""
     with fitz.open(stream=file_bytes, filetype="pdf") as doc:
         for page in doc:
-            text += page.get_text("text") + "\n"
+            page_text = page.get_text("text")
+            text += page_text + "\n"
+
+        # If no text found, try OCR
+        if not text.strip() and OCR_AVAILABLE:
+            for page in doc:
+                pix = page.get_pixmap()
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                text += pytesseract.image_to_string(img) + "\n"
+
     return text.strip()
 
 def create_embeddings(text):
-    """Split text and create embeddings."""
     global chunks, vectors
     chunk_size = 800
     chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
     vectors = embedding_model.encode(chunks)
 
 def chat_with_pdf(query):
-    """Answer questions using semantic similarity."""
     if not chunks:
         return "❗ Please upload a PDF first."
     q_vec = embedding_model.encode([query])
@@ -49,15 +65,15 @@ def handle_pdf_upload(pdf_file):
     try:
         text = extract_text_from_pdf(pdf_file)
         if not text:
-            return "❌ No readable text found (scanned or image-based PDF)."
+            return "❌ No readable text found. If this is a scanned PDF, install pytesseract for OCR."
         create_embeddings(text)
         return "✅ PDF processed successfully. You can now chat!"
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-# 🎨 Gradio Interface
+# Gradio UI
 with gr.Blocks() as app:
-    gr.Markdown("## 🤖 Chat with Your PDF — Lightweight & Fast")
+    gr.Markdown("## 🤖 Chat with Your PDF — OCR fallback for scanned PDFs")
     pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
     status_box = gr.Textbox(label="Status", interactive=False)
     pdf_input.change(fn=handle_pdf_upload, inputs=pdf_input, outputs=status_box)
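
Note on the OCR path: it only activates when both pytesseract and Pillow import successfully, and pytesseract additionally needs the Tesseract binary on the system PATH; without it, image_to_string raises TesseractNotFoundError, which handle_pdf_upload surfaces as a generic "❌ Error" message. A minimal sketch of the extra dependency files, assuming this runs on a Hugging Face Space (which installs Debian packages listed in packages.txt); the exact file contents are an assumption, not part of this commit:

# requirements.txt (additions for the optional OCR path)
pytesseract
Pillow

# packages.txt (Debian packages installed by the Space runtime)
tesseract-ocr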
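The last hunk of chat_with_pdf ends at the query embedding, so the unchanged retrieval and answer-generation code is not shown in this diff. As a rough, hypothetical sketch of how a cosine-similarity lookup feeding flan-t5 typically continues from q_vec, given the imports above (the top-k of 3, the prompt wording, and max_length=200 are illustrative assumptions, not the file's actual code):

# Hypothetical continuation inside chat_with_pdf; not part of this commit
sims = cosine_similarity(q_vec, vectors)[0]      # similarity of the query to every chunk
top_idx = np.argsort(sims)[-3:][::-1]            # indices of the 3 most similar chunks (assumed k)
context = "\n".join(chunks[i] for i in top_idx)
prompt = f"Answer the question using the context.\n\nContext:\n{context}\n\nQuestion: {query}"
result = qa_model(prompt, max_length=200)[0]["generated_text"]
return result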