Spaces:

WebashalarForML
/

RAG_AI_V2

Running

App Files Files Community

WebashalarForML committed on Feb 5

Commit

f33b573

verified ·

1 Parent(s): dfd51c8

Update retrival.py

Browse files

Files changed (1) hide show

retrival.py +5 -13

retrival.py CHANGED Viewed

@@ -21,14 +21,16 @@ pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 ########################################################################################################################################################
 ####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
 ########################################################################################################################################################
 # Loaders for loading Document text, tables and images from any file format.
-#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
     processed_documents = []
     #element_content = []
@@ -44,7 +46,7 @@ def load_document(data_path):
             try:
                 # Determine the file type based on extension
                 filename, file_extension = os.path.splitext(file.lower())
-                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
                 # Use specific partition techniques based on file extension
                 if file_extension == ".pdf":
                     elements = partition_pdf(
@@ -217,11 +219,6 @@ def load_document(data_path):
             )
         )
-    # Output the grouped documents
-    # for document in grouped_documents:
-    #     print(document)
     #Directory loader for loading only the text data into a specific db
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()
@@ -235,6 +232,7 @@ def load_document(data_path):
         doc.metadata.update({"filename":match.group(1)})
     return grouped_documents,documents,table_document
 #grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)
@@ -307,12 +305,6 @@ async def save_to_chroma(chunks: list[Document], name: str, tables: list[Documen
         print("Error while saving to Chroma:", e)
         return None
-# def get_unique_sources(chroma_path):
-#     db = Chroma(persist_directory=chroma_path)
-#     metadata_list = db.get()["metadatas"]
-#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
-#     return list(unique_sources)
 ########################################################################################################################################################
 ####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
 ########################################################################################################################################################

 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
+IMAGE_DB_FOLDER = "./ImageDB"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+os.makedirs(IMAGE_DB_FOLDER, exist_ok=True)
 ########################################################################################################################################################
 ####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
 ########################################################################################################################################################
 # Loaders for loading Document text, tables and images from any file format.
 def load_document(data_path):
     processed_documents = []
     #element_content = []
             try:
                 # Determine the file type based on extension
                 filename, file_extension = os.path.splitext(file.lower())
+                image_output = f"./ImageDB/{filename}/"
                 # Use specific partition techniques based on file extension
                 if file_extension == ".pdf":
                     elements = partition_pdf(
             )
         )
     #Directory loader for loading only the text data into a specific db
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()
         doc.metadata.update({"filename":match.group(1)})
     return grouped_documents,documents,table_document
 #grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)
         print("Error while saving to Chroma:", e)
         return None
 ########################################################################################################################################################
 ####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
 ########################################################################################################################################################