Update retrival.py

retrival.py CHANGED (+53 -51)

@@ -13,14 +13,14 @@ import pytesseract
 import os
 import re
 import uuid
+from langchain.schema import Document
 from collections import defaultdict
-
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')

 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
-IMAGE_DB_FOLDER = "./Images"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)

@@ -31,7 +31,7 @@ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
     processed_documents = []
-    element_content = []
+    #element_content = []
     table_document = []
     #having different process for the pdf
     for root, _, files in os.walk(data_path):
@@ -44,7 +44,7 @@ def load_document(data_path):
             try:
                 # Determine the file type based on extension
                 filename, file_extension = os.path.splitext(file.lower())
-                image_output = f"
+                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
                 # Use specific partition techniques based on file extension
                 if file_extension == ".pdf":
                     elements = partition_pdf(
@@ -73,10 +73,9 @@ def load_document(data_path):
                 categorized_content = {
                     "tables": {"content": [], "Metadata": []},
                     "images": {"content": [], "Metadata": []},
-                    "text": {"content": [], "Metadata": []},
-                    "text2": {"content": [], "Metadata": []}
+                    "text": {"content": [], "Metadata": []},
                 }
-                element_content.append(elements)
+                #element_content.append(elements)
                 CNT=1
                 for chunk in elements:
                     # Safely extract metadata and text
@@ -136,7 +135,6 @@ def load_document(data_path):

     # Loop over tables and match text from the same document and page

-    '''
     for doc in processed_documents:
         cnt=1 # count for storing number of the table
         for table_metadata in doc.get("tables", {}).get("Metadata", []):
@@ -181,7 +179,6 @@ def load_document(data_path):
                     }
                 )
             )
-    '''

     # Initialize a structure to group content by doc_id
     grouped_by_doc_id = defaultdict(lambda: {
@@ -203,10 +200,10 @@ def load_document(data_path):
         metadata = metadata_list[0] # Assuming metadata is consistent
         grouped_by_doc_id[doc_id]["metadata"] = {
             "source": source,
-            "filetype": metadata.get("filetype"),
+            #"filetype": metadata.get("filetype"),
             "file_directory": metadata.get("file_directory"),
             "filename": metadata.get("filename"),
-            "languages": str(metadata.get("languages")),
+            #"languages": str(metadata.get("languages")),
         }

     # Convert grouped content into Document objects
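Note on the grouping step above: each doc_id accumulates its chunk texts plus a single metadata record before Document objects are built. A minimal, self-contained sketch of that pattern (the input triples, field names, and sample values below are illustrative only, not taken from retrival.py):

from collections import defaultdict
from langchain.schema import Document

# Illustrative input: (doc_id, text, metadata) triples as a partitioning step might produce.
chunk_records = [
    ("doc-1", "First paragraph.", {"source": "a.pdf", "file_directory": "./uploads", "filename": "a.pdf"}),
    ("doc-1", "Second paragraph.", {"source": "a.pdf", "file_directory": "./uploads", "filename": "a.pdf"}),
]

grouped = defaultdict(lambda: {"content": [], "metadata": None})
for doc_id, text, meta in chunk_records:
    grouped[doc_id]["content"].append(text)
    # Assume metadata is consistent within a doc_id and keep only the first record.
    grouped[doc_id]["metadata"] = grouped[doc_id]["metadata"] or {
        "source": meta.get("source"),
        "file_directory": meta.get("file_directory"),
        "filename": meta.get("filename"),
    }

grouped_documents = [
    Document(page_content="\n".join(v["content"]), metadata=v["metadata"])
    for v in grouped.values()
]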
@@ -221,12 +218,11 @@ def load_document(data_path):
         )

     # Output the grouped documents
-    for document in grouped_documents:
-
+    # for document in grouped_documents:
+    #     print(document)


     #Dirctory loader for loading the text data only to specific db
-    '''
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()

@@ -237,9 +233,9 @@ def load_document(data_path):
         path=doc.metadata.get("source")
         match = re.search(r'([^\\]+\.[^\\]+)$', path)
         doc.metadata.update({"filename":match.group(1)})
-
-
-
+
+    return grouped_documents,documents,table_document
+    #grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)


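The filename is recovered from the stored source path with a backslash-oriented regex. A quick standalone check of that pattern (the sample path is invented):

import re

# Invented Windows-style path of the kind stored in doc.metadata["source"].
path = r"H:\DEV PATEL\2025\RAG Project\test_data\google data\report.pdf"

match = re.search(r'([^\\]+\.[^\\]+)$', path)
print(match.group(1))  # -> report.pdf

For forward-slash paths (as produced on a Linux host) the same pattern matches the entire path rather than just the file name, so os.path.basename(path) would be the more portable choice if that case can occur.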
@@ -249,8 +245,8 @@ def load_document(data_path):

 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=
-        chunk_overlap=
+        chunk_size=2000,
+        chunk_overlap=600,
         length_function=len,
         add_start_index=True,
     )
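With chunk_size=2000 and chunk_overlap=600, neighbouring chunks share up to 600 characters and each chunk records its starting offset in its metadata. A self-contained sketch of the splitter as configured above (the sample document is invented):

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,       # maximum characters per chunk
    chunk_overlap=600,     # characters shared between neighbouring chunks
    length_function=len,
    add_start_index=True,  # store each chunk's start offset in metadata
)

# Invented single-document corpus, long enough to force several chunks.
docs = [Document(page_content="lorem ipsum " * 1000, metadata={"filename": "sample.txt"})]
chunks = text_splitter.split_documents(docs)
print(len(chunks), chunks[0].metadata.get("start_index"))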
@@ -265,41 +261,47 @@ def split_text(documents: list[Document]):
 ########################################################################################################################################################

 #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
-def save_to_chroma(chunks: list[Document], name: str):
+async def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-
-
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)

     try:
         # Load the embedding model
-        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",show_progress=True)
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
         print("Creating document vector database...")
-        db =
-
-
-
-
+        db =Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+
+        )
+
+        print("Persisting the document database...")
+        db.persist()
         print("Document database successfully saved.")
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables !=[]:
+            print("Creating table vector database...")
+            tdb =Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Persisting the table database...")
+            db.persist()
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+
+        return db, tdb
+        #return db

     except Exception as e:
         print("Error while saving to Chroma:", e)
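For reference, the reworked save_to_chroma builds one Chroma store for document chunks and one for tables, each persisted to its own directory. A stripped-down synchronous sketch of that flow, assuming langchain_community-style imports (match them to whatever retrival.py already imports; the directory names and sample documents are invented, and the sketch persists the table store through its own handle, tdb):

from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Invented sample content.
chunks = [Document(page_content="example text chunk", metadata={"filename": "a.pdf"})]
tables = [Document(page_content="col1 | col2\n1 | 2", metadata={"filename": "a.pdf"})]

# Document store.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory="./VectorDB/chroma_example",  # invented path
)
db.persist()

# Table store, persisted via its own handle.
tdb = Chroma.from_documents(
    documents=tables,
    embedding=embedding_function,
    persist_directory="./TableDB/chroma_example",   # invented path
)
tdb.persist()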
@@ -394,30 +396,30 @@ def save_to_chroma(chunks: list[Document], name: str):
 ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 ########################################################################################################################################################

-def generate_data_store(file_path, db_name):
+async def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")

     try:
-
-
+        documents,processed_documents,table_document = load_document(file_path)
+        #grouped_document,document = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
         return

     try:
-        chunks = split_text(
+        chunks = split_text(documents)
         print(f"Text split into {len(chunks)} chunks.")
     except Exception as e:
         print(f"Error splitting text: {e}")
         return

     try:
-
-        asyncio.run(save_to_chroma(chunks, db_name))
+        await save_to_chroma(chunks, db_name, table_document)
+        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-
+
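Because generate_data_store is now a coroutine that awaits save_to_chroma directly, callers need an event loop rather than a plain function call. A minimal usage sketch with an invented upload path and database name:

import asyncio

# From synchronous code (e.g. a CLI entry point):
asyncio.run(generate_data_store("./uploads/sample.pdf", "sample"))

# From an already-async context (e.g. an async web handler):
# await generate_data_store("./uploads/sample.pdf", "sample")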
|
|