Spaces: Sleeping
hoshoo21 committed
Commit 7a837d4 · 1 Parent(s): 6b4e3a3
deployment
Browse files
- .flaskenv +2 -0
- .gitignore +210 -0
- DockerFile +0 -0
- Local.py +104 -0
- app.py +63 -0
- book_title_extractor.py +63 -0
- duplicate_detector.py +62 -0
- persiststorage.db +0 -0
- rag_engine.py +148 -0
- requirements.txt +0 -0
.flaskenv
ADDED
@@ -0,0 +1,2 @@
+FLASK_APP=app.py
+FLASK_ENV=development
.gitignore
ADDED
@@ -0,0 +1,210 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+chroma_temp/
+# Distribution / packaging
+.Python
+build/
+rag-frontend/
+develop-eggs/
+dist/
+qen/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+#uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+# refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
DockerFile
ADDED
File without changes
Local.py
ADDED
@@ -0,0 +1,104 @@
+import os
+from dotenv import load_dotenv
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_ollama import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain_community.llms import Ollama
+
+load_dotenv()
+
+DATA_PATH = 'data/'
+FILE_NAME = 'ABriefHistoryofTime.pdf'
+CHROMA_PATH = "chroma_db"
+
+def load_documents():
+    pdf_path = os.path.join(DATA_PATH, FILE_NAME)
+    loader = PyPDFLoader(pdf_path)
+    documents = loader.load()
+    documents = [doc for doc in documents if doc.page_content.strip() != ""]
+    print(type(documents[0]))
+    print(f"Loaded {len(documents)} pages from PDF {pdf_path}")
+
+    return documents
+
+def split_documents(documents):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False
+    )
+
+    all_splits = text_splitter.split_documents(documents)
+    print(f"Split into {len(all_splits)} chunks")
+    return all_splits
+
+def get_embedding_functions(model_name="nomic-embed-text"):
+    embeddings = OllamaEmbeddings(model=model_name)
+    print(f"Initialized embeddings with model {model_name}")
+    return embeddings
+
+def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
+    vectorstore = Chroma(
+        persist_directory=persist_directory,
+        embedding_function=embedding_function
+    )
+    print(f"Vector store initialized/loaded from: {persist_directory}")
+    return vectorstore
+
+def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
+    print(f"Indexing {len(chunks)} chunks")
+    vectorstore = Chroma.from_documents(
+        documents=chunks,
+        embedding=embedding_function,
+        persist_directory=persist_directory
+    )
+    vectorstore.persist()
+    print(f"Indexing complete. Data saved to: {persist_directory}")
+    return vectorstore
+
+loaded_docs = load_documents()
+print(f"Document type: {type(loaded_docs)}")       # should be a list
+print(f"Number of docs: {len(loaded_docs)}")       # should be > 0
+print(f"First item type: {type(loaded_docs[0])}")  # should be langchain.docstore.document.Document
+
+for i, doc in enumerate(loaded_docs[:3]):
+    print(f"\nDoc {i} content preview:\n{doc.page_content[:300]}")
+
+chunks = split_documents(loaded_docs)
+if chunks:
+    print("Sample split:", chunks[0].page_content[:300])
+embedding_function = get_embedding_functions()
+
+vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)
+
+def load_llm(model_name="qwen:1.8b"):
+    llm = Ollama(model=model_name)
+    print(f"Loaded LLM: {model_name}")
+    return llm
+
+def create_qa_chain(llm, vector_store):
+    retriever = vector_store.as_retriever()
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        return_source_documents=True  # Optional: to see context
+    )
+    print("QA chain initialized")
+    return qa_chain
+
+def ask_question(qa_chain, question):
+    print(f"\nQuestion: {question}")
+    result = qa_chain({"query": question})
+    print(f"\nAnswer:\n{result['result']}")
+    return result
+
+llm = load_llm()
+
+qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
+
+ask_question(qa_chain, "What is the main idea of the first chapter?")
+ask_question(qa_chain, "Who is the author of A Brief History of Time?")
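The script above re-indexes the PDF on every run. As a rough sketch of reuse (assuming a previous run already created chroma_db and the same Ollama models are pulled), the persisted index can be reopened without repeating the indexing step:

# Sketch: reopen the persisted Chroma index from a prior Local.py run (no re-indexing)
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

embeddings = OllamaEmbeddings(model="nomic-embed-text")
store = Chroma(persist_directory="chroma_db", embedding_function=embeddings)

qa = RetrievalQA.from_chain_type(
    llm=Ollama(model="qwen:1.8b"),
    retriever=store.as_retriever(),
    return_source_documents=True,
)
print(qa({"query": "Who wrote this book?"})["result"])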
app.py
ADDED
@@ -0,0 +1,63 @@
+from flask import Flask, Response, request, jsonify
+from werkzeug.utils import secure_filename
+import os
+from rag_engine import RagEngine
+from flask_cors import CORS, cross_origin
+
+app = Flask(__name__)
+cors = CORS(app)
+app.config["CORS_HEADERS"] = 'Content-Type'
+
+app.config["UPLOAD_FOLDER"] = "uploads"
+os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
+
+rag = RagEngine()
+
+@app.route("/upload", methods=["POST"])
+@cross_origin()
+def upload_pdf():
+    file = request.files.get('file')
+    if not file or not file.filename.endswith(".pdf"):
+        return jsonify({"error": "Only PDF files are supported"}), 400
+    filename = secure_filename(file.filename)
+    filepath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+    file.save(filepath)
+
+    try:
+        rag.index_pdf(filepath)
+    except ValueError as ve:
+        return jsonify({"error": str(ve)}), 400
+
+    return jsonify({"message": f"file {filename} uploaded and indexed successfully"})
+
+
+@app.route("/stream", methods=["POST"])
+@cross_origin()
+def stream_answer():
+    question = request.json.get("question", "")
+    if not question.strip():
+        return jsonify({"error": "Empty question"}), 400
+
+    def generate():
+        for token in rag.ask_question_stream(question):
+            yield token
+
+    return Response(generate(), mimetype='text/plain')
+
+
+@app.route("/ask", methods=["POST"])
+@cross_origin()
+def ask():
+    question = request.json.get("question", "")
+    if not question.strip():
+        return jsonify({"error": "Empty question"}), 400
+    try:
+        answer = rag.ask_question(question)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+    return jsonify({"message": answer})
+
+
+if __name__ == "__main__":
+    app.run(debug=True, port=6000)
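A minimal way to exercise these endpoints, assuming the app is running locally on port 6000 and the requests package is installed (sample.pdf is only a placeholder path):

# Hypothetical client for the Flask routes above (assumes a local server on port 6000)
import requests

BASE = "http://localhost:6000"

# /upload expects a multipart form field named "file"
with open("sample.pdf", "rb") as f:  # placeholder PDF path
    print(requests.post(f"{BASE}/upload", files={"file": f}).json())

# /ask returns the full answer as JSON
print(requests.post(f"{BASE}/ask", json={"question": "What is the book about?"}).json())

# /stream returns plain-text tokens; stream=True prints them as they arrive
with requests.post(f"{BASE}/stream", json={"question": "Summarize chapter one"}, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)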
book_title_extractor.py
ADDED
@@ -0,0 +1,63 @@
+import re
+from langchain_community.document_loaders import PyPDFLoader
+
+class BookTitleExtractor:
+    def __init__(self, llm=None):
+        self.llm = llm
+
+    def extract_title(self, pdf_path, max_pages=5):
+        title = self._extract_with_heuristics(pdf_path, max_pages)
+        if title:
+            return title
+        if self.llm:
+            return self._extract_with_llm(pdf_path)
+        return "Unknown Title"
+
+    def _extract_with_heuristics(self, pdf_path, max_pages):
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()[:max_pages]
+
+        for page in pages:
+            text = page.page_content.strip()
+            if not text:
+                continue
+            # Heuristic 1: ALL CAPS title
+            matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+            if matches:
+                return matches[0].strip()
+            # Heuristic 2: First significant line
+            lines = [line.strip() for line in text.split('\n') if len(line.strip()) > 10]
+            if lines:
+                return lines[0]
+        return None
+
+    def extract_book_title_from_documents(self, documents, max_docs=5):
+        for doc in documents[:max_docs]:
+            text = doc.page_content.strip()
+            if not text:
+                continue
+
+            # Heuristic 1: Lines with ALL CAPS (title pages often use this)
+            matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+            if matches:
+                return matches[0].strip()
+
+            # Heuristic 2: First non-empty, title-cased line
+            for line in text.split("\n"):
+                line = line.strip()
+                if len(line) > 10 and line.istitle():
+                    return line
+        return "Unknown Title"
+
+    def _extract_with_llm(self, pdf_path):
+        loader = PyPDFLoader(pdf_path)
+        pages = loader.load()
+        if not pages:
+            return "Unknown Title"
+        sample_text = pages[0].page_content.strip()[:1000]
+        prompt = (
+            "Identify the book title from the following text:\n\n"
+            f"{sample_text}\n\nOnly return the book title."
+        )
+        response = self.llm.invoke(prompt)
+        # Chat models return a message object; plain LLMs return a string
+        return getattr(response, "content", response).strip()
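A short usage sketch (the PDF path and Ollama model are placeholders): the extractor tries the heuristics first and only calls the LLM fallback when they find nothing.

# Sketch: heuristic-first title extraction with an optional LLM fallback
from langchain_community.llms import Ollama
from book_title_extractor import BookTitleExtractor

extractor = BookTitleExtractor(llm=Ollama(model="qwen:1.8b"))  # llm=None disables the fallback
print(extractor.extract_title("uploads/sample.pdf", max_pages=5))  # placeholder path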
duplicate_detector.py
ADDED
@@ -0,0 +1,62 @@
+import hashlib
+import os
+import sqlite3
+
+from langchain_community.document_loaders import PyPDFLoader
+
+class DuplicateDetector:
+    def __init__(self, db_path="persiststorage.db", max_pages=10):
+        self.fingerprints_seen = set()
+        self.db_path = db_path
+        self.max_pages = max_pages
+        self._init_db()
+
+    def _init_db(self):
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS documents (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                filename TEXT,
+                fingerprint TEXT UNIQUE,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        ''')
+        conn.commit()
+        conn.close()
+
+    def is_duplicate(self, pdf_path):
+        fingerprint = self.generate_fingerprints(pdf_path)
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT id FROM documents WHERE fingerprint = ?", (fingerprint,))
+        exists = cursor.fetchone() is not None
+        conn.close()
+        return exists
+
+    def store_fingerprints(self, pdf_path):
+        fingerprint = self.generate_fingerprints(pdf_path)
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        try:
+            cursor.execute(
+                "INSERT INTO documents (filename, fingerprint) VALUES (?, ?)",
+                (os.path.basename(pdf_path), fingerprint)
+            )
+            conn.commit()
+        except sqlite3.IntegrityError:
+            # Fingerprint already stored; ignore the duplicate insert
+            pass
+        finally:
+            conn.close()
+
+    def generate_fingerprints(self, pdf_path):
+        try:
+            loader = PyPDFLoader(pdf_path)
+            docs = loader.load()
+            text = "".join(doc.page_content for doc in docs[:self.max_pages])
+            fingerprint = hashlib.sha256(text.encode("utf-8")).hexdigest()
+            return fingerprint
+        except Exception as e:
+            raise ValueError(f"Failed to fingerprint PDF: {e}")
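A quick sketch of the intended flow (paths are placeholders): fingerprint a PDF once, store it, and a second check on the same file reports a duplicate.

# Sketch: detect a re-upload of the same PDF via its SHA-256 fingerprint
from duplicate_detector import DuplicateDetector

detector = DuplicateDetector(db_path="persiststorage.db", max_pages=10)
pdf = "uploads/sample.pdf"  # placeholder path

if not detector.is_duplicate(pdf):
    detector.store_fingerprints(pdf)
    print("Stored fingerprint:", detector.generate_fingerprints(pdf))
else:
    print("Duplicate detected, skipping.")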
persiststorage.db
ADDED
Binary file (16.4 kB)
rag_engine.py
ADDED
@@ -0,0 +1,148 @@
+import os
+import shutil
+import tempfile
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_ollama import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain_community.llms import Ollama
+from book_title_extractor import BookTitleExtractor
+from duplicate_detector import DuplicateDetector
+from langchain_core.callbacks.base import BaseCallbackHandler
+from langchain_community.chat_models import ChatOllama
+
+class StreamingHandler(BaseCallbackHandler):
+    def __init__(self):
+        self.buffer = []
+        self.token_callback = None
+
+    def on_llm_new_token(self, token: str, **kwargs):
+        self.buffer.append(token)
+        if self.token_callback:
+            self.token_callback(token)
+
+
+class RagEngine:
+    def __init__(self, embed_model="nomic-embed-text", llm_model="qwen:1.8b", temp_dir="chroma_temp"):
+        self.embed_model = embed_model
+        self.llm_model = llm_model
+        self.embedding = OllamaEmbeddings(model=self.embed_model)
+        self.vectorstore = None
+        self.qa_chain = None
+        self.handler = StreamingHandler()
+        self.llm = ChatOllama(model=self.llm_model, streaming=True, callbacks=[self.handler])
+
+        self.temp_dir = temp_dir
+
+        os.makedirs(self.temp_dir, exist_ok=True)
+        self.title_extractor = BookTitleExtractor(llm=self.llm)
+        self.duplicate_detector = DuplicateDetector()
+        if os.path.exists(os.path.join(self.temp_dir, "chroma.sqlite3")):
+            print("Loading existing Chroma vectorstore...")
+            self.vectorstore = Chroma(
+                persist_directory=self.temp_dir,
+                embedding_function=self.embedding
+            )
+            self.qa_chain = RetrievalQA.from_chain_type(
+                llm=self.llm,
+                retriever=self.vectorstore.as_retriever(),
+                return_source_documents=True
+            )
+            print("Vectorstore and QA chain restored.")
+
+    def clear_temp(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        os.makedirs(self.temp_dir, exist_ok=True)
+
+    def index_pdf(self, pdf_path):
+        if self.duplicate_detector.is_duplicate(pdf_path):
+            raise ValueError(f"duplicate book detected, skipping index of: {pdf_path}")
+        self.duplicate_detector.store_fingerprints(pdf_path)
+        self.clear_temp()
+        filename = os.path.basename(pdf_path)
+        loader = PyPDFLoader(pdf_path)
+        documents = loader.load()
+        title = self.title_extractor.extract_book_title_from_documents(documents, max_docs=10)
+
+        for doc in documents:
+            doc.metadata["source"] = title
+        documents = [doc for doc in documents if doc.page_content.strip()]
+        if not documents:
+            raise ValueError("No readable text in uploaded PDF")
+
+        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
+        chunks = splitter.split_documents(documents)
+        if self.vectorstore is None:
+            self.vectorstore = Chroma.from_documents(
+                documents=chunks,
+                embedding=self.embedding,
+                persist_directory=self.temp_dir
+            )
+            self.vectorstore.persist()
+        else:
+            self.vectorstore.add_documents(chunks)
+
+        self.qa_chain = RetrievalQA.from_chain_type(
+            llm=self.llm,
+            retriever=self.vectorstore.as_retriever(),
+            return_source_documents=True
+        )
+
+    def ask_question(self, question):
+        print(question)
+        if not self.qa_chain:
+            return "please upload and index a PDF document first"
+        result = self.qa_chain({"query": question})
+        answer = result["result"]
+        sources = []
+        for doc in result["source_documents"]:
+            source = doc.metadata.get("source", "Unknown")
+            sources.append(source)
+        print(answer)
+        return {
+            "answer": answer,
+            "sources": list(set(sources))  # Remove duplicates
+        }
+
+    def ask_question_stream(self, question: str):
+        if not self.qa_chain:
+            yield "Please upload and index a PDF document first."
+            return
+        from queue import Queue, Empty
+        import threading
+        q = Queue()
+
+        def token_callback(token):
+            q.put(token)
+
+        self.handler.buffer = []
+        self.handler.token_callback = token_callback
+
+        def run():
+            result = self.qa_chain.invoke({"query": question})
+            print(result)
+            self._latest_result = result
+            q.put(None)
+
+        threading.Thread(target=run).start()
+
+        print("Threading started", flush=True)
+        while True:
+            try:
+                token = q.get(timeout=30)
+                if token is None:
+                    print("Stream finished", flush=True)
+                    break
+                yield token
+            except Empty:
+                print("Timed out waiting for token", flush=True)
+                break
+        sources = []
+        for doc in self._latest_result.get("source_documents", []):
+            source = doc.metadata.get("source", "Unknown")
+            sources.append(source)
+
+        if sources:
+            yield "\n\n**Sources:**\n"
+            for i, src in enumerate(set(sources)):
+                yield f"[{i+1}] {src}\n"
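Putting the pieces together outside Flask, a rough end-to-end sketch (assuming Ollama is running with the nomic-embed-text and qwen:1.8b models pulled; the PDF path is a placeholder):

# Sketch: drive RagEngine directly, without the Flask layer
from rag_engine import RagEngine

engine = RagEngine()                    # builds embeddings, streaming LLM, and duplicate detector
engine.index_pdf("uploads/sample.pdf")  # placeholder path; raises ValueError on duplicate or empty PDFs

result = engine.ask_question("What is the main argument of the book?")
print(result["answer"], result["sources"])

# Streaming variant: tokens are yielded as the model generates them
for token in engine.ask_question_stream("Summarize the introduction"):
    print(token, end="", flush=True)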
requirements.txt
ADDED
Binary file (7.48 kB)