Spaces:

Srikesh
/

pdf_chat

Sleeping

App Files Files Community

Srikesh commited on Oct 20

Commit

c53d978

verified ·

1 Parent(s): 2ceabc6

Create app.py

Browse files

Files changed (1) hide show

app.py +206 -0

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import os
+import gradio as gr
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+import faiss
+import numpy as np
+import math
+import time
+# ---------- CONFIG ----------
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+GEN_MODEL_NAME = "google/flan-t5-base"  # fast & capable
+CHUNK_SIZE = 500             # characters per chunk (approx 250-350 tokens)
+CHUNK_OVERLAP = 100          # overlap between chunks to preserve context
+TOP_K = 4                    # number of chunks retrieved
+MAX_NEW_TOKENS = 150         # generation length (keep small for speed)
+GEN_TEMPERATURE = 0.0        # deterministic, faster
+NORMALIZE_EMB = True
+# ----------------------------
+# Global state
+embedder = SentenceTransformer(EMBED_MODEL_NAME)
+tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
+gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
+# Use the pipeline for convenience (it wraps tokenizer+model)
+qa_pipeline = pipeline(
+    "text2text-generation",
+    model=gen_model,
+    tokenizer=tokenizer,
+    device=-1,  # CPU (Spaces default). If GPU available, change to 0.
+)
+faiss_index = None
+pdf_chunks = []          # list[str]
+pdf_embeddings = None    # numpy array (N, dim)
+last_loaded_filename = None
+last_loaded_at = None
+# ---------- utilities ----------
+def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+    if not text:
+        return []
+    chunks = []
+    start = 0
+    length = len(text)
+    while start < length:
+        end = start + chunk_size
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        start = end - overlap  # move with overlap
+        if start < 0:
+            start = 0
+    return chunks
+def build_faiss_index(embeddings: np.ndarray):
+    dim = embeddings.shape[1]
+    # IndexFlatIP with normalized vectors -> cosine similarity
+    index = faiss.IndexFlatIP(dim)
+    faiss.normalize_L2(embeddings)
+    index.add(embeddings)
+    return index
+def embed_texts(texts):
+    # sentence-transformers returns numpy arrays
+    embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+    if NORMALIZE_EMB:
+        faiss.normalize_L2(embeddings)
+    return embeddings
+# ---------- Gradio functions ----------
+def process_pdf(pdf_file):
+    """
+    Upload and process PDF. Builds FAISS index and stores chunks & embeddings in memory.
+    Returns status message and basic metadata.
+    """
+    global faiss_index, pdf_chunks, pdf_embeddings, last_loaded_filename, last_loaded_at
+    if pdf_file is None:
+        return "⚠️ No file uploaded."
+    try:
+        # Extract text
+        reader = PdfReader(pdf_file.name)
+        full_text = []
+        for p in reader.pages:
+            text = p.extract_text()
+            if text:
+                full_text.append(text)
+        text = "\n".join(full_text).strip()
+        if not text:
+            return "⚠️ No readable text found in PDF."
+        # Chunk text
+        pdf_chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
+        # Embed chunks (batch)
+        pdf_embeddings = embed_texts(pdf_chunks)
+        # Build FAISS index
+        faiss_index = build_faiss_index(np.copy(pdf_embeddings))
+        last_loaded_filename = os.path.basename(pdf_file.name)
+        last_loaded_at = time.time()
+        return f"✅ PDF processed. {len(pdf_chunks)} chunks indexed. Ready for Q&A."
+    except Exception as e:
+        return f"❌ Error processing PDF: {e}"
+def chat_with_pdf(query):
+    """
+    Retrieve relevant chunks and generate an answer using the generator model.
+    Designed for low-latency responses.
+    """
+    global faiss_index, pdf_chunks, pdf_embeddings
+    if faiss_index is None or pdf_chunks is None or len(pdf_chunks) == 0:
+        return "⚠️ Please upload and process a PDF first."
+    if not query or not query.strip():
+        return "⚠️ Please enter a question."
+    query = query.strip()
+    # Embed query
+    q_emb = embedder.encode([query], convert_to_numpy=True)
+    if NORMALIZE_EMB:
+        faiss.normalize_L2(q_emb)
+    # Search top-k
+    top_k = min(TOP_K, len(pdf_chunks))
+    distances, indices = faiss_index.search(q_emb, top_k)
+    indices = indices[0].tolist()
+    # Compose context from retrieved chunks (concatenate, truncate if too long)
+    retrieved = [pdf_chunks[i] for i in indices]
+    context = "\n\n".join(retrieved)
+    # Build prompt - be concise and reference context
+    system_prompt = (
+        "You are a helpful assistant that answers questions using only the provided context. "
+        "If the answer is not contained in the context, say 'I don't know based on the document.' "
+        "Be concise and factual."
+    )
+    prompt = (
+        f"{system_prompt}\n\n"
+        f"Context:\n{context}\n\n"
+        f"Question: {query}\n\n"
+        f"Answer:"
+    )
+    # Limit prompt size by truncating context from the left if it's too long
+    # Keep the question + system prompt + rightmost part of context
+    max_prompt_chars = 3000  # heuristic to keep generation fast
+    if len(prompt) > max_prompt_chars:
+        # keep the question and system prompt, then rightmost slice of context
+        right_context = context[-2000:]
+        prompt = f"{system_prompt}\n\nContext:\n{right_context}\n\nQuestion: {query}\n\nAnswer:"
+    # Generate
+    try:
+        out = qa_pipeline(
+            prompt,
+            max_new_tokens=MAX_NEW_TOKENS,
+            do_sample=False,
+            temperature=GEN_TEMPERATURE,
+            num_return_sequences=1,
+        )
+        answer = out[0]["generated_text"].strip()
+        # Safety: if model hallucinates beyond context, keep it short
+        return answer
+    except Exception as e:
+        return f"❌ Generation error: {e}"
+# ---------- Gradio UI ----------
+with gr.Blocks(title="PDF Chat (fast, retrieval-augmented)") as demo:
+    gr.Markdown("# 📚 Chat with your PDF — optimized for speed")
+    gr.Markdown(
+        "Upload a PDF, click **Process PDF**, then ask questions. "
+        "This app uses semantic search (FAISS) + a lightweight generator for quick responses."
+    )
+    with gr.Row():
+        file_in = gr.File(label="Upload PDF (PDF only)")
+        process_btn = gr.Button("Process PDF")
+        status = gr.Textbox(label="Status", interactive=False)
+    process_btn.click(fn=process_pdf, inputs=[file_in], outputs=[status])
+    gr.Markdown("---")
+    query = gr.Textbox(label="Ask a question about the PDF", placeholder="e.g. What is the main conclusion?")
+    ask_btn = gr.Button("Ask")
+    answer = gr.Textbox(label="Answer", lines=6)
+    ask_btn.click(fn=chat_with_pdf, inputs=[query], outputs=[answer])
+    gr.Markdown(
+        "Notes:\n"
+        "- The app keeps the processed PDF in memory for the session (no DB).\n"
+        "- Designed for low latency; tune CHUNK_SIZE/TOP_K/MAX_NEW_TOKENS for speed/quality tradeoffs."
+    )
+if __name__ == "__main__":
+    demo.launch()