Spaces: Sleeping
hoshoo21 committed
Commit 7a837d4 · 1 Parent(s): 6b4e3a3
deployment
Browse files
- .flaskenv +2 -0
- .gitignore +210 -0
- DockerFile +0 -0
- Local.py +104 -0
- app.py +63 -0
- book_title_extractor.py +63 -0
- duplicate_detector.py +62 -0
- persiststorage.db +0 -0
- rag_engine.py +148 -0
- requirements.txt +0 -0
.flaskenv
ADDED
@@ -0,0 +1,2 @@
+FLASK_APP=app.py
+FLASK_ENV=development
.gitignore
ADDED
@@ -0,0 +1,210 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+chroma_temp/
+# Distribution / packaging
+.Python
+build/
+rag-frontend/
+develop-eggs/
+dist/
+qen/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
DockerFile
ADDED
File without changes
Local.py
ADDED
@@ -0,0 +1,104 @@
+import os
+from dotenv import load_dotenv
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_ollama import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain_community.llms import Ollama
+
+load_dotenv()
+
+DATA_PATH = 'data/'
+FILE_NAME = 'ABriefHistoryofTime.pdf'
+CHROMA_PATH = "chroma_db"
+
+def load_documents():
+    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
+    loader = PyPDFLoader(pdf_path)
+    documents = loader.load()
+    documents = [doc for doc in documents if doc.page_content.strip() != ""]
+    print(type(documents[0]))
+    print(f"Loaded {len(documents)} pages from PDF {pdf_path}")
+
+    return documents
+
+def split_documents(documents):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False
+    )
+
+    all_splits = text_splitter.split_documents(documents)
+    print(f"Split into {len(all_splits)} chunks")
+    return all_splits
+
+def get_embedding_functions(model_name="nomic-embed-text"):
+    embeddings = OllamaEmbeddings(model=model_name)
+    print(f"Initialized embeddings with model {model_name}")
+    return embeddings
+
+def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
+    vectorstore = Chroma(
+        persist_directory=persist_directory,
+        embedding_function=embedding_function
+    )
+    print(f"Vector store initialized/loaded from: {persist_directory}")
+    return vectorstore
+
+def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
+    print(f"Indexing {len(chunks)} chunks")
+    vectorstore = Chroma.from_documents(
+        documents=chunks,
+        embedding=embedding_function,
+        persist_directory=persist_directory
+    )
+    vectorstore.persist()
+    print(f"Indexing complete. Data saved to: {persist_directory}")
+    return vectorstore
+
+loaded_docs = load_documents()
+print(f"Document type: {type(loaded_docs)}")       # should be a list
+print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
+print(f"First item type: {type(loaded_docs[0])}")  # should be langchain.docstore.document.Document
+
+for i, doc in enumerate(loaded_docs[:3]):
+    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")
+
+chunks = split_documents(loaded_docs)
+if chunks:
+    print("Sample split:", chunks[0].page_content[:300])
+embedding_function = get_embedding_functions()
+
+vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)
+
+def load_llm(model_name="qwen:1.8b"):
+    llm = Ollama(model=model_name)
+    print(f"Loaded LLM: {model_name}")
+    return llm
+
+def create_qa_chain(llm, vector_store):
+    retriever = vector_store.as_retriever()
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        return_source_documents=True  # Optional: to see context
+    )
+    print("QA chain initialized")
+    return qa_chain
+
+def ask_question(qa_chain, question):
+    print(f"\nQuestion: {question}")
+    result = qa_chain({"query": question})
+    print(f"\nAnswer:\n{result['result']}")
+    return result
+
+llm = load_llm()
+
+qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
+
+ask_question(qa_chain, "What is the main idea of the first chapter?")
+ask_question(qa_chain, "Who is the author of A Brief History of Time?")
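The script above re-indexes the PDF on every run. As a rough sketch of reuse (assuming a previous run already created chroma_db and the same Ollama models are pulled), the persisted index can be reopened without repeating the indexing step:

# Sketch: reopen the persisted Chroma index from a prior Local.py run (no re-indexing)
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

embeddings = OllamaEmbeddings(model="nomic-embed-text")
store = Chroma(persist_directory="chroma_db", embedding_function=embeddings)

qa = RetrievalQA.from_chain_type(
    llm=Ollama(model="qwen:1.8b"),
    retriever=store.as_retriever(),
    return_source_documents=True,
)
print(qa({"query": "Who wrote this book?"})["result"])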
app.py
ADDED
@@ -0,0 +1,63 @@
+from flask import Flask, Response, request, jsonify
+from werkzeug.utils import secure_filename
+import os
+from rag_engine import RagEngine
+from flask_cors import CORS, cross_origin
+
+app = Flask(__name__)
+cors = CORS(app)
+app.config["CORS_HEADERS"] = 'Content-Type'
+
+app.config["UPLOAD_FOLDER"] = "uploads"
+os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
+
+rag = RagEngine()
+
+@app.route("/upload", methods=["POST"])
+@cross_origin()
+def upload_pdf():
+    file = request.files.get('file')
+    if not file or not file.filename.endswith(".pdf"):
+        return jsonify({"error": "Only PDF files are supported"}), 400
+    filename = secure_filename(file.filename)
+    filepath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+    file.save(filepath)
+
+    try:
+        rag.index_pdf(filepath)
+    except ValueError as ve:
+        return jsonify({"error": str(ve)}), 400
+
+    return jsonify({"message": f"file {filename} uploaded and indexed successfully"})
+
+
+@app.route("/stream", methods=["POST"])
+@cross_origin()
+def stream_answer():
+    question = request.json.get("question", "")
+    if not question.strip():
+        return jsonify({"error": "Empty question"}), 400
+
+    def generate():
+        for token in rag.ask_question_stream(question):
+            yield token
+
+    return Response(generate(), mimetype='text/plain')
+
+
+@app.route("/ask", methods=["POST"])
+@cross_origin()
+def ask():
+    question = request.json.get("question", "")
+    if not question.strip():
+        return jsonify({"error": "Empty question"}), 400
+    try:
+        answer = rag.ask_question(question)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+    return jsonify({"message": answer})
+
+
+if __name__ == "__main__":
+    app.run(debug=True, port=6000)
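A minimal way to exercise these endpoints, assuming the app is running locally on port 6000 and the requests package is installed (sample.pdf is only a placeholder path):

# Hypothetical client for the Flask routes above (assumes a local server on port 6000)
import requests

BASE = "http://localhost:6000"

# /upload expects a multipart form field named "file"
with open("sample.pdf", "rb") as f:  # placeholder PDF path
    print(requests.post(f"{BASE}/upload", files={"file": f}).json())

# /ask returns the full answer as JSON
print(requests.post(f"{BASE}/ask", json={"question": "What is the book about?"}).json())

# /stream returns plain-text tokens; stream=True prints them as they arrive
with requests.post(f"{BASE}/stream", json={"question": "Summarize chapter one"}, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)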
book_title_extractor.py
ADDED
@@ -0,0 +1,63 @@
+import re
+from langchain_community.document_loaders import PyPDFLoader
+
+class BookTitleExtractor:
+    def __init__(self, llm=None):
+        self.llm = llm
+
+    def extract_title(self, pdf_path, max_pages=5):
+        title = self._extract_with_heuristics(pdf_path, max_pages)
+        if title:
+            return title
+        if self.llm:
+            return self._extract_with_llm(pdf_path)
+        return "Unknown Title"
+
+    def _extract_with_heuristics(self, pdf_path, max_pages):
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()[:max_pages]
+
+        for page in pages:
+            text = page.page_content.strip()
+            if not text:
+                continue
+            # Heuristic 1: ALL CAPS title
+            matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+            if matches:
+                return matches[0].strip()
+            # Heuristic 2: First significant line
+            lines = [line.strip() for line in text.split('\n') if len(line.strip()) > 10]
+            if lines:
+                return lines[0]
+        return None
+
+    def extract_book_title_from_documents(self, documents, max_docs=5):
+        for doc in documents[:max_docs]:
+            text = doc.page_content.strip()
+            if not text:
+                continue
+
+            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
+            matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+            if matches:
+                return matches[0].strip()
+
+            # Heuristic 2: First non-empty, title-cased line
+            for line in text.split("\n"):
+                line = line.strip()
+                if len(line) > 10 and line.istitle():
+                    return line
+        return "Unknown Title"
+
+    def _extract_with_llm(self, pdf_path):
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        if not pages:
+            return "Unknown Title"
+        sample_text = pages[0].page_content.strip()[:1000]
+        prompt = (
+            "Identify the book title from the following text:\n\n"
+            f"{sample_text}\n\nOnly return the book title."
+        )
+        response = self.llm.invoke(prompt)
+        # Chat models return a message object; plain LLMs return a string
+        return getattr(response, "content", response).strip()
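A short usage sketch (the PDF path and Ollama model are placeholders): the extractor tries the heuristics first and only calls the LLM fallback when they find nothing.

# Sketch: heuristic-first title extraction with an optional LLM fallback
from langchain_community.llms import Ollama
from book_title_extractor import BookTitleExtractor

extractor = BookTitleExtractor(llm=Ollama(model="qwen:1.8b"))  # llm=None disables the fallback
print(extractor.extract_title("uploads/sample.pdf", max_pages=5))  # placeholder path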
duplicate_detector.py
ADDED
@@ -0,0 +1,62 @@
+import hashlib
+import os
+import sqlite3
+
+from langchain_community.document_loaders import PyPDFLoader
+
+class DuplicateDetector:
+    def __init__(self, db_path="persiststorage.db", max_pages=10):
+        self.fingerprints_seen = set()
+        self.db_path = db_path
+        self.max_pages = max_pages
+        self._init_db()
+
+    def _init_db(self):
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS documents (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                filename TEXT,
+                fingerprint TEXT UNIQUE,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        ''')
+        conn.commit()
+        conn.close()
+
+    def is_duplicate(self, pdf_path):
+        fingerprint = self.generate_fingerprints(pdf_path)
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT id FROM documents WHERE fingerprint = ?", (fingerprint,))
+        exists = cursor.fetchone() is not None
+        conn.close()
+        return exists
+
+    def store_fingerprints(self, pdf_path):
+        fingerprint = self.generate_fingerprints(pdf_path)
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        try:
+            cursor.execute(
+                "INSERT INTO documents (filename, fingerprint) VALUES (?, ?)",
+                (os.path.basename(pdf_path), fingerprint)
+            )
+            conn.commit()
+        except sqlite3.IntegrityError:
+            # Fingerprint already stored; ignore the duplicate insert
+            pass
+        finally:
+            conn.close()
+
+    def generate_fingerprints(self, pdf_path):
+        try:
+            loader = PyPDFLoader(pdf_path)
+            docs = loader.load()
+            text = "".join(doc.page_content for doc in docs[:self.max_pages])
+            fingerprint = hashlib.sha256(text.encode("utf-8")).hexdigest()
+            return fingerprint
+        except Exception as e:
+            raise ValueError(f"Failed to fingerprint PDF: {e}")
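A quick sketch of the intended flow (paths are placeholders): fingerprint a PDF once, store it, and a second check on the same file reports a duplicate.

# Sketch: detect a re-upload of the same PDF via its SHA-256 fingerprint
from duplicate_detector import DuplicateDetector

detector = DuplicateDetector(db_path="persiststorage.db", max_pages=10)
pdf = "uploads/sample.pdf"  # placeholder path

if not detector.is_duplicate(pdf):
    detector.store_fingerprints(pdf)
    print("Stored fingerprint:", detector.generate_fingerprints(pdf))
else:
    print("Duplicate detected, skipping.")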
persiststorage.db
ADDED
Binary file (16.4 kB)
rag_engine.py
ADDED
@@ -0,0 +1,148 @@
+import os
+import shutil
+import tempfile
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_ollama import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain_community.llms import Ollama
+from book_title_extractor import BookTitleExtractor
+from duplicate_detector import DuplicateDetector
+from langchain_core.callbacks.base import BaseCallbackHandler
+from langchain_community.chat_models import ChatOllama
+
+class StreamingHandler(BaseCallbackHandler):
+    def __init__(self):
+        self.buffer = []
+        self.token_callback = None
+
+    def on_llm_new_token(self, token: str, **kwargs):
+        self.buffer.append(token)
+        if self.token_callback:
+            self.token_callback(token)
+
+
+class RagEngine:
+    def __init__(self, embed_model="nomic-embed-text", llm_model="qwen:1.8b", temp_dir="chroma_temp"):
+        self.embed_model = embed_model
+        self.llm_model = llm_model
+        self.embedding = OllamaEmbeddings(model=self.embed_model)
+        self.vectorstore = None
+        self.qa_chain = None
+        self.handler = StreamingHandler()
+        self.llm = ChatOllama(model=self.llm_model, streaming=True, callbacks=[self.handler])
+
+        self.temp_dir = temp_dir
+
+        os.makedirs(self.temp_dir, exist_ok=True)
+        self.title_extractor = BookTitleExtractor(llm=self.llm)
+        self.duplicate_detector = DuplicateDetector()
+        if os.path.exists(os.path.join(self.temp_dir, "chroma.sqlite3")):
+            print("Loading existing Chroma vectorstore...")
+            self.vectorstore = Chroma(
+                persist_directory=self.temp_dir,
+                embedding_function=self.embedding
+            )
+            self.qa_chain = RetrievalQA.from_chain_type(
+                llm=self.llm,
+                retriever=self.vectorstore.as_retriever(),
+                return_source_documents=True
+            )
+            print("Vectorstore and QA chain restored.")
+
+    def clear_temp(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        os.makedirs(self.temp_dir, exist_ok=True)
+
+    def index_pdf(self, pdf_path):
+        if self.duplicate_detector.is_duplicate(pdf_path):
+            raise ValueError(f"duplicate book detected, skipping index of: {pdf_path}")
+        self.duplicate_detector.store_fingerprints(pdf_path)
+        self.clear_temp()
+        filename = os.path.basename(pdf_path)
+        loader = PyPDFLoader(pdf_path)
+        documents = loader.load()
+        title = self.title_extractor.extract_book_title_from_documents(documents, max_docs=10)
+
+        for doc in documents:
+            doc.metadata["source"] = title
+        documents = [doc for doc in documents if doc.page_content.strip()]
+        if not documents:
+            raise ValueError("No readable text in uploaded PDF")
+
+        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
+        chunks = splitter.split_documents(documents)
+        if self.vectorstore is None:
+            self.vectorstore = Chroma.from_documents(
+                documents=chunks,
+                embedding=self.embedding,
+                persist_directory=self.temp_dir
+            )
+            self.vectorstore.persist()
+        else:
+            self.vectorstore.add_documents(chunks)
+
+        self.qa_chain = RetrievalQA.from_chain_type(
+            llm=self.llm,
+            retriever=self.vectorstore.as_retriever(),
+            return_source_documents=True
+        )
+
+    def ask_question(self, question):
+        print(question)
+        if not self.qa_chain:
+            return "please upload and index a PDF document first"
+        result = self.qa_chain({"query": question})
+        answer = result["result"]
+        sources = []
+        for doc in result["source_documents"]:
+            source = doc.metadata.get("source", "Unknown")
+            sources.append(source)
+        print(answer)
+        return {
+            "answer": answer,
+            "sources": list(set(sources))  # Remove duplicates
+        }
+
+    def ask_question_stream(self, question: str):
+        if not self.qa_chain:
+            yield "Please upload and index a PDF document first."
+            return
+        from queue import Queue, Empty
+        import threading
+        q = Queue()
+
+        def token_callback(token):
+            q.put(token)
+
+        self.handler.buffer = []
+        self.handler.token_callback = token_callback
+
+        def run():
+            result = self.qa_chain.invoke({"query": question})
+            print(result)
+            self._latest_result = result
+            q.put(None)
+
+        threading.Thread(target=run).start()
+
+        print("Threading started", flush=True)
+        while True:
+            try:
+                token = q.get(timeout=30)
+                if token is None:
+                    print("Stream finished", flush=True)
+                    break
+                yield token
+            except Empty:
+                print("Timed out waiting for token", flush=True)
+                break
+        sources = []
+        for doc in self._latest_result.get("source_documents", []):
+            source = doc.metadata.get("source", "Unknown")
+            sources.append(source)
+
+        if sources:
+            yield "\n\n**Sources:**\n"
+            for i, src in enumerate(set(sources)):
+                yield f"[{i+1}] {src}\n"
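Putting the pieces together outside Flask, a rough end-to-end sketch (assuming Ollama is running with the nomic-embed-text and qwen:1.8b models pulled; the PDF path is a placeholder):

# Sketch: drive RagEngine directly, without the Flask layer
from rag_engine import RagEngine

engine = RagEngine()                    # builds embeddings, streaming LLM, and duplicate detector
engine.index_pdf("uploads/sample.pdf")  # placeholder path; raises ValueError on duplicate or empty PDFs

result = engine.ask_question("What is the main argument of the book?")
print(result["answer"], result["sources"])

# Streaming variant: tokens are yielded as the model generates them
for token in engine.ask_question_stream("Summarize the introduction"):
    print(token, end="", flush=True)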
requirements.txt
ADDED
Binary file (7.48 kB)