Update retrival.py

retrival.py CHANGED (+53 -51)

@@ -13,14 +13,14 @@ import pytesseract
 import os
 import re
 import uuid
+from langchain.schema import Document
 from collections import defaultdict
-
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')

 # Configurations
 UPLOAD_FOLDER = "./uploads"
 VECTOR_DB_FOLDER = "./VectorDB"
-IMAGE_DB_FOLDER = "./Images"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)

@@ -31,7 +31,7 @@ os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 #data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
     processed_documents = []
-    element_content = []
+    #element_content = []
     table_document = []
     #having different process for the pdf
     for root, _, files in os.walk(data_path):
@@ -44,7 +44,7 @@ def load_document(data_path):
             try:
                 # Determine the file type based on extension
                 filename, file_extension = os.path.splitext(file.lower())
-                image_output = f"
+                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
                 # Use specific partition techniques based on file extension
                 if file_extension == ".pdf":
                     elements = partition_pdf(
@@ -73,10 +73,9 @@ def load_document(data_path):
                 categorized_content = {
                     "tables": {"content": [], "Metadata": []},
                     "images": {"content": [], "Metadata": []},
-                    "text": {"content": [], "Metadata": []},
-                    "text2": {"content": [], "Metadata": []}
+                    "text": {"content": [], "Metadata": []},
                 }
-                element_content.append(elements)
+                #element_content.append(elements)
                 CNT=1
                 for chunk in elements:
                     # Safely extract metadata and text
@@ -136,7 +135,6 @@ def load_document(data_path):

     # Loop over tables and match text from the same document and page

-    '''
     for doc in processed_documents:
         cnt=1 # count for storing number of the table
         for table_metadata in doc.get("tables", {}).get("Metadata", []):
@@ -181,7 +179,6 @@ def load_document(data_path):
                     }
                 )
             )
-    '''

     # Initialize a structure to group content by doc_id
     grouped_by_doc_id = defaultdict(lambda: {
@@ -203,10 +200,10 @@ def load_document(data_path):
         metadata = metadata_list[0] # Assuming metadata is consistent
         grouped_by_doc_id[doc_id]["metadata"] = {
             "source": source,
-            "filetype": metadata.get("filetype"),
+            #"filetype": metadata.get("filetype"),
             "file_directory": metadata.get("file_directory"),
             "filename": metadata.get("filename"),
-            "languages": str(metadata.get("languages")),
+            #"languages": str(metadata.get("languages")),
         }

     # Convert grouped content into Document objects
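Note on the grouping step above: each doc_id accumulates its chunk texts plus a single metadata record before Document objects are built. A minimal, self-contained sketch of that pattern (the input triples, field names, and sample values below are illustrative only, not taken from retrival.py):

from collections import defaultdict
from langchain.schema import Document

# Illustrative input: (doc_id, text, metadata) triples as a partitioning step might produce.
chunk_records = [
    ("doc-1", "First paragraph.", {"source": "a.pdf", "file_directory": "./uploads", "filename": "a.pdf"}),
    ("doc-1", "Second paragraph.", {"source": "a.pdf", "file_directory": "./uploads", "filename": "a.pdf"}),
]

grouped = defaultdict(lambda: {"content": [], "metadata": None})
for doc_id, text, meta in chunk_records:
    grouped[doc_id]["content"].append(text)
    # Assume metadata is consistent within a doc_id and keep only the first record.
    grouped[doc_id]["metadata"] = grouped[doc_id]["metadata"] or {
        "source": meta.get("source"),
        "file_directory": meta.get("file_directory"),
        "filename": meta.get("filename"),
    }

grouped_documents = [
    Document(page_content="\n".join(v["content"]), metadata=v["metadata"])
    for v in grouped.values()
]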
@@ -221,12 +218,11 @@ def load_document(data_path):
         )

     # Output the grouped documents
-    for document in grouped_documents:
-
+    # for document in grouped_documents:
+    #     print(document)


     #Dirctory loader for loading the text data only to specific db
-    '''
     loader = DirectoryLoader(data_path, glob="*.*")
     documents = loader.load()

@@ -237,9 +233,9 @@ def load_document(data_path):
         path=doc.metadata.get("source")
         match = re.search(r'([^\\]+\.[^\\]+)$', path)
         doc.metadata.update({"filename":match.group(1)})
-
-
-
+
+    return grouped_documents,documents,table_document
+    #grouped_documents = load_document(data_path)
 #documents,processed_documents,table_document = load_document(data_path)


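The filename is recovered from the stored source path with a backslash-oriented regex. A quick standalone check of that pattern (the sample path is invented):

import re

# Invented Windows-style path of the kind stored in doc.metadata["source"].
path = r"H:\DEV PATEL\2025\RAG Project\test_data\google data\report.pdf"

match = re.search(r'([^\\]+\.[^\\]+)$', path)
print(match.group(1))  # -> report.pdf

For forward-slash paths (as produced on a Linux host) the same pattern matches the entire path rather than just the file name, so os.path.basename(path) would be the more portable choice if that case can occur.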
@@ -249,8 +245,8 @@ def load_document(data_path):

 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=
-        chunk_overlap=
+        chunk_size=2000,
+        chunk_overlap=600,
         length_function=len,
         add_start_index=True,
     )
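With chunk_size=2000 and chunk_overlap=600, neighbouring chunks share up to 600 characters and each chunk records its starting offset in its metadata. A self-contained sketch of the splitter as configured above (the sample document is invented):

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,       # maximum characters per chunk
    chunk_overlap=600,     # characters shared between neighbouring chunks
    length_function=len,
    add_start_index=True,  # store each chunk's start offset in metadata
)

# Invented single-document corpus, long enough to force several chunks.
docs = [Document(page_content="lorem ipsum " * 1000, metadata={"filename": "sample.txt"})]
chunks = text_splitter.split_documents(docs)
print(len(chunks), chunks[0].metadata.get("start_index"))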
@@ -265,41 +261,47 @@ def split_text(documents: list[Document]):
 ########################################################################################################################################################

 #def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
-def save_to_chroma(chunks: list[Document], name: str):
+async def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-
-
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)

     try:
         # Load the embedding model
-        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2",show_progress=True)
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
         print("Creating document vector database...")
-        db =
-
-
-
-
+        db =Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+
+        )
+
+        print("Persisting the document database...")
+        db.persist()
         print("Document database successfully saved.")
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables !=[]:
+            print("Creating table vector database...")
+            tdb =Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Persisting the table database...")
+            db.persist()
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+
+        return db, tdb
+        #return db

     except Exception as e:
         print("Error while saving to Chroma:", e)
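For reference, the reworked save_to_chroma builds one Chroma store for document chunks and one for tables, each persisted to its own directory. A stripped-down synchronous sketch of that flow, assuming langchain_community-style imports (match them to whatever retrival.py already imports; the directory names and sample documents are invented, and the sketch persists the table store through its own handle, tdb):

from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Invented sample content.
chunks = [Document(page_content="example text chunk", metadata={"filename": "a.pdf"})]
tables = [Document(page_content="col1 | col2\n1 | 2", metadata={"filename": "a.pdf"})]

# Document store.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory="./VectorDB/chroma_example",  # invented path
)
db.persist()

# Table store, persisted via its own handle.
tdb = Chroma.from_documents(
    documents=tables,
    embedding=embedding_function,
    persist_directory="./TableDB/chroma_example",   # invented path
)
tdb.persist()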
@@ -394,30 +396,30 @@ def save_to_chroma(chunks: list[Document], name: str):
 ####------------------------------------------------------- Combine Process of Load, Chunk and Store ----------------------------------------------####
 ########################################################################################################################################################

-def generate_data_store(file_path, db_name):
+async def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
     print(f"Filepath ===> {file_path} DB Name ====> {db_name}")

     try:
-
-
+        documents,processed_documents,table_document = load_document(file_path)
+        #grouped_document,document = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
         return

     try:
-        chunks = split_text(
+        chunks = split_text(documents)
         print(f"Text split into {len(chunks)} chunks.")
     except Exception as e:
         print(f"Error splitting text: {e}")
         return

     try:
-
-        asyncio.run(save_to_chroma(chunks, db_name))
+        await save_to_chroma(chunks, db_name, table_document)
+        #await asyncio.run(save_to_chroma(chunks, db_name,table_document))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-
+
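Because generate_data_store is now a coroutine that awaits save_to_chroma directly, callers need an event loop rather than a plain function call. A minimal usage sketch with an invented upload path and database name:

import asyncio

# From synchronous code (e.g. a CLI entry point):
asyncio.run(generate_data_store("./uploads/sample.pdf", "sample"))

# From an already-async context (e.g. an async web handler):
# await generate_data_store("./uploads/sample.pdf", "sample")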
|
|