hoshoo21 committed
Commit
7a837d4
1 Parent(s): 6b4e3a3

deployment

Files changed (10)
  1. .flaskenv +2 -0
  2. .gitignore +210 -0
  3. DockerFile +0 -0
  4. Local.py +104 -0
  5. app.py +63 -0
  6. book_title_extractor.py +63 -0
  7. duplicate_detector.py +62 -0
  8. persiststorage.db +0 -0
  9. rag_engine.py +148 -0
  10. requirements.txt +0 -0
.flaskenv ADDED
@@ -0,0 +1,2 @@
+ FLASK_APP=app.py
+ FLASK_ENV=development
.gitignore ADDED
@@ -0,0 +1,210 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ chroma_temp/
+ # Distribution / packaging
+ .Python
+ build/
+ rag-frontend/
+ develop-eggs/
+ dist/
+ qen/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
DockerFile ADDED
File without changes
Local.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ from dotenv import load_dotenv
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_ollama import OllamaEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.chains import RetrievalQA
+ from langchain_community.llms import Ollama
+
+ load_dotenv()
+
+ DATA_PATH = 'data/'
+ FILE_NAME = 'ABriefHistoryofTime.pdf'
+ CHROMA_PATH = "chroma_db"
+
+ def load_documents():
+     pdf_path = os.path.join(DATA_PATH, FILE_NAME)
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()
+     documents = [doc for doc in documents if doc.page_content.strip() != ""]
+     print(type(documents[0]))
+     print(f"Loaded {len(documents)} pages from pdf {pdf_path}")
+     return documents
+
+ def split_documents(documents):
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len,
+         is_separator_regex=False
+     )
+     all_splits = text_splitter.split_documents(documents)
+     print(f"Split into {len(all_splits)} chunks")
+     return all_splits
+
+ def get_embedding_functions(model_name="nomic-embed-text"):
+     embeddings = OllamaEmbeddings(model=model_name)
+     print(f"Initialized embeddings with model {model_name}")
+     return embeddings
+
+ def get_vector_store(embedding_function, persist_directory=CHROMA_PATH):
+     vectorstore = Chroma(
+         persist_directory=persist_directory,
+         embedding_function=embedding_function
+     )
+     print(f"Vector store initialized/loaded from: {persist_directory}")
+     return vectorstore
+
+ def index_documents(chunks, embedding_function, persist_directory=CHROMA_PATH):
+     print(f"Indexing {len(chunks)} chunks")
+     vectorstore = Chroma.from_documents(
+         documents=chunks,
+         embedding=embedding_function,
+         persist_directory=persist_directory
+     )
+     vectorstore.persist()
+     print(f"Indexing complete. Data saved to: {persist_directory}")
+     return vectorstore
+
+ loaded_docs = load_documents()
+ print(f"Document type: {type(loaded_docs)}")        # should be a list
+ print(f"Number of docs: {len(loaded_docs)}")        # should be > 0
+ print(f"First item type: {type(loaded_docs[0])}")   # should be a langchain Document
+
+ for i, doc in enumerate(loaded_docs[:3]):
+     print(f"\nπŸ“„ Doc {i} content preview:\n{doc.page_content[:300]}")
+
+ chunks = split_documents(loaded_docs)
+ if chunks:
+     print("Sample split:", chunks[0].page_content[:300])
+ embedding_function = get_embedding_functions()
+
+ vector_chroma_store = index_documents(chunks, embedding_function=embedding_function)
+
+ def load_llm(model_name="qwen:1.8b"):
+     llm = Ollama(model=model_name)
+     print(f"βœ… Loaded LLM: {model_name}")
+     return llm
+
+ def create_qa_chain(llm, vector_store):
+     retriever = vector_store.as_retriever()
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         retriever=retriever,
+         return_source_documents=True  # Optional: to see context
+     )
+     print("βœ… QA Chain initialized")
+     return qa_chain
+
+ def ask_question(qa_chain, question):
+     print(f"\n❓ Question: {question}")
+     result = qa_chain({"query": question})
+     print(f"\nπŸ’¬ Answer:\n{result['result']}")
+     return result
+
+ llm = load_llm()
+
+ qa_chain = create_qa_chain(llm, vector_store=vector_chroma_store)
+
+ ask_question(qa_chain, "What is the main idea of the first chapter?")
+ ask_question(qa_chain, "Who is the author of A Brief History of Time?")
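
Local.py is a standalone local experiment: at module level it indexes data/ABriefHistoryofTime.pdf into the chroma_db directory and then asks two sample questions. As a rough sketch (assuming Ollama is serving the same nomic-embed-text and qwen:1.8b models), a later session could reopen that persisted index without re-parsing the PDF:

from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

# Reopen the index that Local.py persisted to chroma_db.
store = Chroma(persist_directory="chroma_db",
               embedding_function=OllamaEmbeddings(model="nomic-embed-text"))
qa = RetrievalQA.from_chain_type(llm=Ollama(model="qwen:1.8b"),
                                 retriever=store.as_retriever(),
                                 return_source_documents=True)
print(qa({"query": "What does the book say about black holes?"})["result"])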
app.py ADDED
@@ -0,0 +1,63 @@
+ from flask import Flask, Response, request, jsonify
+ from werkzeug.utils import secure_filename
+ import os
+ from rag_engine import RagEngine
+ from flask_cors import CORS, cross_origin
+
+ app = Flask(__name__)
+ cors = CORS(app)
+ app.config["CORS_HEADERS"] = 'Content-Type'
+
+ app.config["UPLOAD_FOLDER"] = "uploads"
+ os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
+
+ rag = RagEngine()
+
+ @app.route("/upload", methods=["POST"])
+ @cross_origin()
+ def upload_pdf():
+     file = request.files.get('file')
+     if file is None or not file.filename.endswith(".pdf"):
+         return jsonify({"error": "Only PDF files are supported"}), 400
+     filename = secure_filename(file.filename)
+     filepath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+     file.save(filepath)
+
+     try:
+         rag.index_pdf(filepath)
+     except ValueError as ve:
+         return jsonify({"error": str(ve)}), 400
+
+     return jsonify({"message": f"file {filename} uploaded and indexed successfully"})
+
+
+ @app.route("/stream", methods=["POST"])
+ @cross_origin()
+ def stream_answer():
+     question = request.json.get("question", "")
+     if not question.strip():
+         return jsonify({"error": "Empty question"}), 400
+
+     def generate():
+         for token in rag.ask_question_stream(question):
+             yield token
+
+     return Response(generate(), mimetype='text/plain')
+
+
+ @app.route("/ask", methods=["POST"])
+ @cross_origin()
+ def ask():
+     question = request.json.get("question", "")
+     if not question.strip():
+         return jsonify({"error": "Empty question"}), 400
+     try:
+         answer = rag.ask_question(question)
+     except Exception as e:
+         return jsonify({"error": str(e)}), 500
+     return jsonify({"message": answer})
+
+
+ if __name__ == "__main__":
+     app.run(debug=True, port=6000)
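
app.py exposes three endpoints: POST /upload (multipart field file, PDF only), POST /ask (JSON {"question": ...}), and POST /stream (same payload, streamed plain text). A small client-side sketch, assuming the dev server from app.run(debug=True, port=6000) is running locally; the requests library and the sample file name are assumptions, not part of the repository:

import requests

BASE = "http://127.0.0.1:6000"  # assumes the Flask dev server above

# Upload and index a PDF; the field name 'file' matches request.files.get('file').
with open("sample.pdf", "rb") as f:  # placeholder path
    print(requests.post(f"{BASE}/upload", files={"file": f}).json())

# Ask a question against the indexed document.
print(requests.post(f"{BASE}/ask", json={"question": "Who wrote this book?"}).json())

# Stream an answer token by token.
with requests.post(f"{BASE}/stream", json={"question": "Summarize chapter one."}, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None):
        print(chunk.decode("utf-8", errors="replace"), end="", flush=True)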
book_title_extractor.py ADDED
@@ -0,0 +1,63 @@
+ import re
+ from langchain_community.document_loaders import PyPDFLoader
+
+ class BookTitleExtractor:
+     def __init__(self, llm=None):
+         self.llm = llm
+
+     def extract_title(self, pdf_path, max_pages=5):
+         title = self._extract_with_heuristics(pdf_path, max_pages)
+         if title:
+             return title
+         if self.llm:
+             return self._extract_with_llm(pdf_path)
+         return "Unknown Title"
+
+     def _extract_with_heuristics(self, pdf_path, max_pages):
+         loader = PyPDFLoader(pdf_path)
+         pages = loader.load()[:max_pages]
+
+         for page in pages:
+             text = page.page_content.strip()
+             if not text:
+                 continue
+             # Heuristic 1: ALL CAPS title
+             matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+             if matches:
+                 return matches[0].strip()
+             # Heuristic 2: First significant line
+             lines = [line.strip() for line in text.split('\n') if len(line.strip()) > 10]
+             if lines:
+                 return lines[0]
+         return None
+
+     def extract_book_title_from_documents(self, documents, max_docs=5):
+         for doc in documents[:max_docs]:
+             text = doc.page_content.strip()
+             if not text:
+                 continue
+
+             # Heuristic 1: Lines with ALL CAPS (title pages often use this)
+             matches = re.findall(r'^[A-Z][A-Z\s\-:,]{5,}$', text, re.MULTILINE)
+             if matches:
+                 return matches[0].strip()
+
+             # Heuristic 2: First non-empty, title-cased line
+             for line in text.split("\n"):
+                 line = line.strip()
+                 if len(line) > 10 and line.istitle():
+                     return line
+         return "Unknown Title"
+
+     def _extract_with_llm(self, pdf_path):
+         loader = PyPDFLoader(pdf_path)
+         pages = loader.load()
+         if not pages:
+             return "Unknown Title"
+         sample_text = pages[0].page_content.strip()[:1000]
+         prompt = (
+             "Identify the book title from the following text:\n\n"
+             f"{sample_text}\n\nOnly return the book title."
+         )
+         return self.llm.invoke(prompt).strip()
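
BookTitleExtractor tries regex heuristics first (ALL-CAPS lines, then the first significant line) and only falls back to the LLM when one is supplied. A quick standalone sketch; the PDF path is a placeholder:

from book_title_extractor import BookTitleExtractor

extractor = BookTitleExtractor()                       # heuristics only, no LLM fallback
title = extractor.extract_title("uploads/sample.pdf")  # placeholder path
print(title)                                           # "Unknown Title" if nothing matches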
duplicate_detector.py ADDED
@@ -0,0 +1,62 @@
+ import hashlib
+ import os
+ import sqlite3
+
+ from langchain_community.document_loaders import PyPDFLoader
+
+ class DuplicateDetector:
+     def __init__(self, db_path="persiststorage.db", max_pages=10):
+         self.fingerprints_seen = set()
+         self.db_path = db_path
+         self.max_pages = max_pages
+         self._init_db()
+
+     def _init_db(self):
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+         cursor.execute('''
+             CREATE TABLE IF NOT EXISTS documents (
+                 id INTEGER PRIMARY KEY AUTOINCREMENT,
+                 filename TEXT,
+                 fingerprint TEXT UNIQUE,
+                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+             )
+         ''')
+         conn.commit()
+         conn.close()
+
+     def is_duplicate(self, pdf_path):
+         fingerprint = self.generate_fingerprints(pdf_path)
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+         cursor.execute("SELECT id FROM documents WHERE fingerprint = ?", (fingerprint,))
+         exists = cursor.fetchone() is not None
+         conn.close()
+         return exists
+
+     def store_fingerprints(self, pdf_path):
+         fingerprint = self.generate_fingerprints(pdf_path)
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+         try:
+             cursor.execute("INSERT INTO documents (filename, fingerprint) VALUES (?, ?)",
+                            (os.path.basename(pdf_path), fingerprint))
+             conn.commit()
+         except sqlite3.IntegrityError:
+             # Fingerprint already stored (UNIQUE constraint); nothing to do.
+             pass
+         finally:
+             conn.close()
+
+     def generate_fingerprints(self, pdf_path):
+         try:
+             loader = PyPDFLoader(pdf_path)
+             docs = loader.load()
+             text = "".join(doc.page_content for doc in docs[:self.max_pages])
+             return hashlib.sha256(text.encode("utf-8")).hexdigest()
+         except Exception as e:
+             raise ValueError(f"Failed to fingerprint PDF: {e}")
persiststorage.db ADDED
Binary file (16.4 kB)
 
rag_engine.py ADDED
@@ -0,0 +1,148 @@
+ import os
+ import shutil
+ import tempfile
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_ollama import OllamaEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain.chains import RetrievalQA
+ from langchain_community.llms import Ollama
+ from book_title_extractor import BookTitleExtractor
+ from duplicate_detector import DuplicateDetector
+ from langchain_core.callbacks.base import BaseCallbackHandler
+ from langchain_community.chat_models import ChatOllama
+
+ class StreamingHandler(BaseCallbackHandler):
+     def __init__(self):
+         self.buffer = []
+         self.token_callback = None
+
+     def on_llm_new_token(self, token: str, **kwargs):
+         self.buffer.append(token)
+         if self.token_callback:
+             self.token_callback(token)
+
+
+ class RagEngine:
+     def __init__(self, embed_model="nomic-embed-text", llm_model="qwen:1.8b", temp_dir="chroma_temp"):
+         self.embed_model = embed_model
+         self.llm_model = llm_model
+         self.embedding = OllamaEmbeddings(model=self.embed_model)
+         self.vectorstore = None
+         self.qa_chain = None
+         self.handler = StreamingHandler()
+         self.llm = ChatOllama(model=self.llm_model, streaming=True, callbacks=[self.handler])
+
+         self.temp_dir = temp_dir
+         os.makedirs(self.temp_dir, exist_ok=True)
+         self.title_extractor = BookTitleExtractor(llm=self.llm)
+         self.duplicate_detector = DuplicateDetector()
+         # Reload a previously persisted index so uploads survive restarts.
+         if os.path.exists(os.path.join(self.temp_dir, "chroma.sqlite3")):
+             print("πŸ” Loading existing Chroma vectorstore...")
+             self.vectorstore = Chroma(
+                 persist_directory=self.temp_dir,
+                 embedding_function=self.embedding
+             )
+             self.qa_chain = RetrievalQA.from_chain_type(
+                 llm=self.llm,
+                 retriever=self.vectorstore.as_retriever(),
+                 return_source_documents=True
+             )
+             print("Vectorstore and QA chain restored.")
+
+     def clear_temp(self):
+         shutil.rmtree(self.temp_dir, ignore_errors=True)
+         os.makedirs(self.temp_dir, exist_ok=True)
+
+     def index_pdf(self, pdf_path):
+         if self.duplicate_detector.is_duplicate(pdf_path):
+             raise ValueError(f"duplicate book detected, skipping index of: {pdf_path}")
+         self.duplicate_detector.store_fingerprints(pdf_path)
+         self.clear_temp()
+         filename = os.path.basename(pdf_path)
+         loader = PyPDFLoader(pdf_path)
+         documents = loader.load()
+         title = self.title_extractor.extract_book_title_from_documents(documents, max_docs=10)
+
+         # Tag every page with the extracted book title so answers can cite it.
+         for doc in documents:
+             doc.metadata["source"] = title
+         documents = [doc for doc in documents if doc.page_content.strip()]
+         if not documents:
+             raise ValueError("No readable text in uploaded PDF")
+
+         splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
+         chunks = splitter.split_documents(documents)
+         if self.vectorstore is None:
+             self.vectorstore = Chroma.from_documents(
+                 documents=chunks,
+                 embedding=self.embedding,
+                 persist_directory=self.temp_dir
+             )
+             self.vectorstore.persist()
+         else:
+             self.vectorstore.add_documents(chunks)
+
+         self.qa_chain = RetrievalQA.from_chain_type(
+             llm=self.llm,
+             retriever=self.vectorstore.as_retriever(),
+             return_source_documents=True
+         )
+
+     def ask_question(self, question):
+         print(question)
+         if not self.qa_chain:
+             return "please upload and index a pdf document first"
+         result = self.qa_chain({"query": question})
+         answer = result["result"]
+         sources = []
+         for doc in result["source_documents"]:
+             source = doc.metadata.get("source", "Unknown")
+             sources.append(source)
+         print(answer)
+         return {
+             "answer": answer,
+             "sources": list(set(sources))  # Remove duplicates
+         }
+
+     def ask_question_stream(self, question: str):
+         if not self.qa_chain:
+             yield "❗ Please upload and index a PDF document first."
+             return
+         from queue import Queue, Empty
+         import threading
+
+         q = Queue()
+
+         def token_callback(token):
+             q.put(token)
+
+         self.handler.buffer = []
+         self.handler.token_callback = token_callback
+
+         # Run the chain in a background thread; the callback pushes tokens onto the queue.
+         def run():
+             result = self.qa_chain.invoke({"query": question})
+             print(result)
+             self._latest_result = result
+             q.put(None)
+
+         threading.Thread(target=run).start()
+
+         print("Threading started", flush=True)
+         while True:
+             try:
+                 token = q.get(timeout=30)
+                 if token is None:
+                     print("Stream finished", flush=True)
+                     break
+                 yield token
+             except Empty:
+                 print("Timed out waiting for token", flush=True)
+                 break
+
+         sources = []
+         for doc in getattr(self, "_latest_result", {}).get("source_documents", []):
+             source = doc.metadata.get("source", "Unknown")
+             sources.append(source)
+
+         if sources:
+             yield "\n\nπŸ“š **Sources:**\n"
+             for i, src in enumerate(set(sources)):
+                 yield f"[{i+1}] {src}\n"
requirements.txt ADDED
Binary file (7.48 kB)