Merge branch 'main' into fix-conversational-memory
Files changed:
- document_qa/document_qa_engine.py +30 -10
- document_qa/grobid_processors.py +1 -1
- streamlit_app.py +2 -1
document_qa/document_qa_engine.py  CHANGED

@@ -3,6 +3,7 @@ import os
 from pathlib import Path
 from typing import Union, Any
 
+from document_qa.grobid_processors import GrobidProcessor
 from grobid_client.grobid_client import GrobidClient
 from langchain.chains import create_extraction_chain, ConversationChain, ConversationalRetrievalChain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
@@ -14,8 +15,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
-from document_qa.grobid_processors import GrobidProcessor
-
 
 class DocumentQAEngine:
     llm = None
@@ -188,8 +187,10 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
 
-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
-        """
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
+        """
+        Extract text from documents using Grobid; if chunk_size is < 0 it keeps each paragraph separately
+        """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
@@ -204,6 +205,7 @@ class DocumentQAEngine:
         texts = []
         metadatas = []
         ids = []
+
         if chunk_size < 0:
             for passage in structure['passages']:
                 biblio_copy = copy.copy(biblio)
@@ -227,10 +229,25 @@ class DocumentQAEngine:
             metadatas = [biblio for _ in range(len(texts))]
             ids = [id for id, t in enumerate(texts)]
 
+        if "biblio" in include:
+            biblio_metadata = copy.copy(biblio)
+            biblio_metadata['type'] = "biblio"
+            biblio_metadata['section'] = "header"
+            for key in ['title', 'authors', 'publication_year']:
+                if key in biblio_metadata:
+                    texts.append("{}: {}".format(key, biblio_metadata[key]))
+                    metadatas.append(biblio_metadata)
+                    ids.append(key)
+
         return texts, metadatas, ids
 
-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
-        texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
+        include = ["biblio"] if include_biblio else []
+        texts, metadata, ids = self.get_text_from_document(
+            pdf_path,
+            chunk_size=chunk_size,
+            perc_overlap=perc_overlap,
+            include=include)
         if doc_id:
             hash = doc_id
         else:
@@ -252,7 +269,7 @@ class DocumentQAEngine:
 
         return hash
 
-    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
+    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
        input_files = []
        for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
            for file_ in files:
@@ -269,9 +286,12 @@ class DocumentQAEngine:
             if os.path.exists(data_path):
                 print(data_path, "exists. Skipping it ")
                 continue
-
-            texts, metadata, ids = self.get_text_from_document(
-                input_file, chunk_size=chunk_size, perc_overlap=perc_overlap)
+            include = ["biblio"] if include_biblio else []
+            texts, metadata, ids = self.get_text_from_document(
+                input_file,
+                chunk_size=chunk_size,
+                perc_overlap=perc_overlap,
+                include=include)
             filename = metadata[0]['filename']
 
             vector_db_document = Chroma.from_texts(texts,
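For context, a minimal usage sketch of the new include_biblio flag. Only the create_memory_embeddings signature and the embedded header fields come from this diff; the engine construction below is a hypothetical setup:

    from langchain.llms import OpenAI
    from langchain.embeddings import OpenAIEmbeddings
    from document_qa.document_qa_engine import DocumentQAEngine

    # Assumption: the engine takes an LLM and an embedding function;
    # check the class constructor for the exact arguments.
    engine = DocumentQAEngine(OpenAI(), OpenAIEmbeddings())

    # With include_biblio=True, the bibliographic header fields (title,
    # authors, publication_year) are embedded alongside the passages, so
    # follow-up questions about the paper itself can be answered from memory.
    doc_id = engine.create_memory_embeddings("paper.pdf",
                                             chunk_size=500,
                                             perc_overlap=0.1,
                                             include_biblio=True)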
document_qa/grobid_processors.py  CHANGED

@@ -171,7 +171,7 @@ class GrobidProcessor(BaseProcessor):
         }
         try:
             year = dateparser.parse(doc_biblio.header.date).year
-            biblio["
+            biblio["publication_year"] = year
         except:
             pass
 
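A side note on the fixed line: dateparser.parse returns a datetime object, or None when the string cannot be interpreted, which is one of the failure modes the surrounding try/except absorbs. A standalone sketch of the same pattern:

    import dateparser

    # dateparser handles many formats ("2023-05-12", "May 2023", ...).
    date = dateparser.parse("May 2023")
    if date is not None:  # parse returns None on unrecognised input
        print(date.year)  # 2023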
streamlit_app.py  CHANGED

@@ -288,7 +288,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
         # hash = get_file_hash(tmp_file.name)[:10]
         st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                      chunk_size=chunk_size,
-                                                                                                     perc_overlap=0.1)
+                                                                                                     perc_overlap=0.1,
+                                                                                                     include_biblio=True)
         st.session_state['loaded_embeddings'] = True
         st.session_state.messages = []
 