Spaces:
Running
Running
Update retrival.py
Browse files- retrival.py +5 -13
retrival.py
CHANGED
|
@@ -21,14 +21,16 @@ pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
|
|
| 21 |
# Configurations
|
| 22 |
UPLOAD_FOLDER = "./uploads"
|
| 23 |
VECTOR_DB_FOLDER = "./VectorDB"
|
|
|
|
| 24 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 25 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
|
|
|
| 26 |
|
| 27 |
########################################################################################################################################################
|
| 28 |
####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
|
| 29 |
########################################################################################################################################################
|
| 30 |
# Loaders for loading Document text, tables and images from any file format.
|
| 31 |
-
|
| 32 |
def load_document(data_path):
|
| 33 |
processed_documents = []
|
| 34 |
#element_content = []
|
|
@@ -44,7 +46,7 @@ def load_document(data_path):
|
|
| 44 |
try:
|
| 45 |
# Determine the file type based on extension
|
| 46 |
filename, file_extension = os.path.splitext(file.lower())
|
| 47 |
-
image_output = f"
|
| 48 |
# Use specific partition techniques based on file extension
|
| 49 |
if file_extension == ".pdf":
|
| 50 |
elements = partition_pdf(
|
|
@@ -217,11 +219,6 @@ def load_document(data_path):
|
|
| 217 |
)
|
| 218 |
)
|
| 219 |
|
| 220 |
-
# Output the grouped documents
|
| 221 |
-
# for document in grouped_documents:
|
| 222 |
-
# print(document)
|
| 223 |
-
|
| 224 |
-
|
| 225 |
#Directory loader for loading the text data only to specific db
|
| 226 |
loader = DirectoryLoader(data_path, glob="*.*")
|
| 227 |
documents = loader.load()
|
|
@@ -235,6 +232,7 @@ def load_document(data_path):
|
|
| 235 |
doc.metadata.update({"filename":match.group(1)})
|
| 236 |
|
| 237 |
return grouped_documents,documents,table_document
|
|
|
|
| 238 |
#grouped_documents = load_document(data_path)
|
| 239 |
#documents,processed_documents,table_document = load_document(data_path)
|
| 240 |
|
|
@@ -307,12 +305,6 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
|
|
| 307 |
print("Error while saving to Chroma:", e)
|
| 308 |
return None
|
| 309 |
|
| 310 |
-
# def get_unique_sources(chroma_path):
|
| 311 |
-
# db = Chroma(persist_directory=chroma_path)
|
| 312 |
-
# metadata_list = db.get()["metadatas"]
|
| 313 |
-
# unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
|
| 314 |
-
# return list(unique_sources)
|
| 315 |
-
|
| 316 |
########################################################################################################################################################
|
| 317 |
####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
|
| 318 |
########################################################################################################################################################
|
|
|
|
| 21 |
# Configurations
|
| 22 |
UPLOAD_FOLDER = "./uploads"
|
| 23 |
VECTOR_DB_FOLDER = "./VectorDB"
|
| 24 |
+
IMAGE_DB_FOLDER = "./ImageDB"
|
| 25 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 26 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
| 27 |
+
os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)
|
| 28 |
|
| 29 |
########################################################################################################################################################
|
| 30 |
####-------------------------------------------------------------- Document Loader ---------------------------------------------------------------####
|
| 31 |
########################################################################################################################################################
|
| 32 |
# Loaders for loading Document text, tables and images from any file format.
|
| 33 |
+
|
| 34 |
def load_document(data_path):
|
| 35 |
processed_documents = []
|
| 36 |
#element_content = []
|
|
|
|
| 46 |
try:
|
| 47 |
# Determine the file type based on extension
|
| 48 |
filename, file_extension = os.path.splitext(file.lower())
|
| 49 |
+
image_output = f"./ImageDB/{filename}/"
|
| 50 |
# Use specific partition techniques based on file extension
|
| 51 |
if file_extension == ".pdf":
|
| 52 |
elements = partition_pdf(
|
|
|
|
| 219 |
)
|
| 220 |
)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
#Directory loader for loading the text data only to specific db
|
| 223 |
loader = DirectoryLoader(data_path, glob="*.*")
|
| 224 |
documents = loader.load()
|
|
|
|
| 232 |
doc.metadata.update({"filename":match.group(1)})
|
| 233 |
|
| 234 |
return grouped_documents,documents,table_document
|
| 235 |
+
|
| 236 |
#grouped_documents = load_document(data_path)
|
| 237 |
#documents,processed_documents,table_document = load_document(data_path)
|
| 238 |
|
|
|
|
| 305 |
print("Error while saving to Chroma:", e)
|
| 306 |
return None
|
| 307 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
########################################################################################################################################################
|
| 309 |
####----------------------------------------------------------- Updating Existing Data in Vector DB -----------------------------------------------####
|
| 310 |
########################################################################################################################################################
|