RAG_AI_V2

Sleeping

App Files Files Community

WebashalarForML committed on Jan 27

Commit

263ccc7

verified ·

1 Parent(s): 5f2ea7c

Update retrival.py

Browse files

Files changed (1) hide show

retrival.py +358 -75

retrival.py CHANGED Viewed

@@ -1,17 +1,19 @@
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings # for embedding task
-from langchain.text_splitter import RecursiveCharacterTextSplitter # for converting the large documents into smaller chunks
-from langchain.schema import Document
 from langchain_core.documents import Document
-from langchain_openai import OpenAIEmbeddings
 from langchain_community.vectorstores import Chroma
-import openai
-import openai
 import os
 import shutil
 import uuid
-import asyncio # async
 # Configurations
 UPLOAD_FOLDER = "./uploads"
@@ -19,86 +21,375 @@ VECTOR_DB_FOLDER = "./VectorDB"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
 def load_document(data_path):
-    # Load documents
     loader = DirectoryLoader(data_path, glob="*.*")
-    print("loader",loader)
-    document = loader.load()
-    return document
-# Creating the chunks of Data from the knowledge
 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size = 1000,
-        chunk_overlap = 500,
-        length_function = len,
         add_start_index=True,
-    )
-    chunks = text_splitter.split_documents(documents)
     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
     return chunks
-# # Chroma for creating the vector db whcch we will use for the searching relvant data.
-# def save_to_chroma(chunks: list[Document],name: str):
-#     print
-#     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-#     # Clear out the database first.
-#     if os.path.exists(CHROMA_PATH):
-#         shutil.rmtree(CHROMA_PATH)
-#     # Initialize SBERT embedding function
-#     embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-#     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-#     # Add documents and persist the database
-#     db.add_documents(chunks)
-#     db.persist()
-#     # Return the database instance or a success status
-#     return db
-def save_to_chroma(chunks: list[Document], name: str):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
-    # Clear out the database first
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
     try:
-        # Initialize SBERT embedding function
         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-        # Add documents and persist the database
-        print("Adding documents to the database...")
         db.add_documents(chunks)
-        print("Persisting the database...")
         db.persist()
-        print("Database successfully saved.")
-        return db
     except Exception as e:
-        print("Error while saving to Chroma:", e)
-        return None
-def get_unique_sources(chroma_path):
-    # Load the Chroma database
-    db = Chroma(persist_directory=chroma_path)
-    # Retrieve all metadata from the database
-    metadata_list = db.get()['metadatas']
-    # Extract unique sources from metadata
-    unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
-    return list(unique_sources)
-def generate_data_store(file_path,db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
-    print(f"filepath===>{file_path}  db_name =====>{db_name}")
     try:
-        documents = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
@@ -112,17 +403,9 @@ def generate_data_store(file_path,db_name):
         return
     try:
-        asyncio.run(save_to_chroma(chunks, db_name))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return
-# def main():
-#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
-#     db_name = "Product_data"
-#     generate_data_store(data_path,db_name)
-# if __name__ == "__main__":
-#     main()

 from langchain_community.document_loaders import DirectoryLoader
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
 from langchain_core.documents import Document
 from langchain_community.vectorstores import Chroma
 import os
 import shutil
+import asyncio
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.auto import partition
+import pytesseract
+import os
+import re
 import uuid
+pytesseract.pytesseract.tesseract_cmd = (r'/usr/bin/tesseract')
 # Configurations
 UPLOAD_FOLDER = "./uploads"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+########################################################################################################################################################
+####--------------------------------------------------------------  Document Loader  ---------------------------------------------------------------####
+########################################################################################################################################################
+# Loaders for loading Document text, tables and images from any file format.
+#data_path=r"H:\DEV PATEL\2025\RAG Project\test_data\google data"
 def load_document(data_path):
+    processed_documents = []
+    element_content = []
+    table_document = []
+    #having different process for the pdf
+    for root, _, files in os.walk(data_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            doc_id = str(uuid.uuid4())  # Generate a unique ID for the document
+            print(f"Processing document ID: {doc_id}, Path: {file_path}")
+            try:
+                # Determine the file type based on extension
+                filename, file_extension = os.path.splitext(file.lower())
+                image_output = f"H:/DEV PATEL/2025/RAG Project/Images/{filename}/"
+                # Use specific partition techniques based on file extension
+                if file_extension == ".pdf":
+                    elements = partition_pdf(
+                        filename=file_path,
+                        strategy="hi_res",  # Use layout detection
+                        infer_table_structure=True,
+                        hi_res_model_name="yolox",
+                        extract_images_in_pdf=True,
+                        extract_image_block_types=["Image","Table"],
+                        extract_image_block_output_dir=image_output,
+                        show_progress=True,
+                        #chunking_strategy="by_title",
+                    )
+                else:
+                    # Default to auto partition if no specific handler is found
+                    elements = partition(
+                        filename=file_path,
+                        strategy="hi_res",
+                        infer_table_structure=True,
+                        show_progress=True,
+                        #chunking_strategy="by_title"
+                    )
+            except Exception as e:
+                print(f"Failed to process document {file_path}: {e}")
+                continue
+            categorized_content = {
+                "tables": {"content": [], "Metadata": []},
+                "images": {"content": [], "Metadata": []},
+                "text": {"content": [], "Metadata": []},
+                "text2": {"content": [], "Metadata": []}
+            }
+            element_content.append(elements)
+            CNT=1
+            for chunk in elements:
+                # Safely extract metadata and text
+                chunk_type = str(type(chunk))
+                chunk_metadata = chunk.metadata.to_dict() if chunk.metadata else {}
+                chunk_text = getattr(chunk, "text", None)
+                # Separate content into categories
+                #if "Table" in chunk_type:
+                if any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "Table",
+                        "TableChunk"]):
+                    categorized_content["tables"]["content"].append(chunk_text)
+                    categorized_content["tables"]["Metadata"].append(chunk_metadata)
+                    #test1
+                    TABLE_DATA=f"Table number {CNT} "+chunk_metadata.get("text_as_html", "")+" "
+                    CNT+=1
+                    categorized_content["text"]["content"].append(TABLE_DATA)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+                elif "Image" in chunk_type:
+                    categorized_content["images"]["content"].append(chunk_text)
+                    categorized_content["images"]["Metadata"].append(chunk_metadata)
+                elif any(
+                    keyword in chunk_type
+                    for keyword in [
+                        "CompositeElement",
+                        "Text",
+                        "NarrativeText",
+                        "Title",
+                        "Header",
+                        "Footer",
+                        "FigureCaption",
+                        "ListItem",
+                        "UncategorizedText",
+                        "Formula",
+                        "CodeSnippet",
+                        "Address",
+                        "EmailAddress",
+                        "PageBreak",
+                    ]
+                ):
+                    categorized_content["text"]["content"].append(chunk_text)
+                    categorized_content["text"]["Metadata"].append(chunk_metadata)
+                else:
+                    continue
+            # Append processed document
+            processed_documents.append({
+                "doc_id": doc_id,
+                "source": file_path,
+                **categorized_content,
+            })
+    # Loop over tables and match text from the same document and page
+    for doc in processed_documents:
+        cnt=1 # count for storing number of the table
+        for table_metadata in doc.get("tables", {}).get("Metadata", []):
+            page_number = table_metadata.get("page_number")
+            source = doc.get("source")
+            page_content = ""
+            for text_metadata, text_content in zip(
+                doc.get("text", {}).get("Metadata", []),
+                doc.get("text", {}).get("content", [])
+            ):
+                page_number2 = text_metadata.get("page_number")
+                source2 = doc.get("source")
+                if source == source2 and page_number == page_number2:
+                    print(f"Matching text found for source: {source}, page: {page_number}")
+                    page_content += f"{text_content} "  # Concatenate text with a space
+            # Add the matched content to the table metadata
+            table_metadata["page_content"] =f"Table number {cnt} "+table_metadata.get("text_as_html", "")+" "+page_content.strip()  # Remove trailing spaces and have the content proper here
+            table_metadata["text_as_html"] = table_metadata.get("text_as_html", "") # we are also storing it seperatly
+            table_metadata["Table_number"] = cnt  # addiing the table number it will be use in retrival
+            cnt+=1
+            # Custom loader of document which will store the table along with the text on that page specifically
+            # making document of each table with its content
+            unique_id = str(uuid.uuid4())
+            table_document.append(
+                Document(
+                    id =unique_id, # Add doc_id directly
+                    page_content=table_metadata.get("page_content", ""),  # Get page_content from metadata, default to empty string if missing
+                    metadata={
+                        "source": doc["source"],
+                        "text_as_html": table_metadata.get("text_as_html", ""),
+                        "filetype": table_metadata.get("filetype", ""),
+                        "page_number": str(table_metadata.get("page_number", 0)),  # Default to 0 if missing
+                        "image_path": table_metadata.get("image_path", ""),
+                        "file_directory": table_metadata.get("file_directory", ""),
+                        "filename": table_metadata.get("filename", ""),
+                        "Table_number": str(table_metadata.get("Table_number", 0))  # Default to 0 if missing
+                    }
+                )
+            )
+    # Initialize a structure to group content by doc_id
+    grouped_by_doc_id = defaultdict(lambda: {
+        "text_content": [],
+        "metadata": None,  # Metadata will only be set once per doc_id
+    })
+    for doc in processed_documents:
+        doc_id = doc.get("doc_id")
+        source = doc.get("source")
+        text_content = doc.get("text", {}).get("content", [])
+        metadata_list = doc.get("text", {}).get("Metadata", [])
+        # Merge text content
+        grouped_by_doc_id[doc_id]["text_content"].extend(text_content)
+        # Set metadata (if not already set)
+        if grouped_by_doc_id[doc_id]["metadata"] is None and metadata_list:
+            metadata = metadata_list[0]  # Assuming metadata is consistent
+            grouped_by_doc_id[doc_id]["metadata"] = {
+                "source": source,
+                "filetype": metadata.get("filetype"),
+                "file_directory": metadata.get("file_directory"),
+                "filename": metadata.get("filename"),
+                "languages": str(metadata.get("languages")),
+            }
+    # Convert grouped content into Document objects
+    grouped_documents = []
+    for doc_id, data in grouped_by_doc_id.items():
+        grouped_documents.append(
+            Document(
+                id=doc_id,
+                page_content=" ".join(data["text_content"]).strip(),
+                metadata=data["metadata"],
+            )
+        )
+    # Output the grouped documents
+    for document in grouped_documents:
+        print(document)
+    #Dirctory loader for loading the text data only to specific db
     loader = DirectoryLoader(data_path, glob="*.*")
+    documents = loader.load()
+    # update the metadata adding filname to the met
+    for doc in documents:
+        unique_id = str(uuid.uuid4())
+        doc.id = unique_id
+        path=doc.metadata.get("source")
+        match = re.search(r'([^\\]+\.[^\\]+)$', path)
+        doc.metadata.update({"filename":match.group(1)})
+    return documents,grouped_documents
+#documents,processed_documents,table_document = load_document(data_path)
+########################################################################################################################################################
+####-------------------------------------------------------------- Chunking the Text  --------------------------------------------------------------####
+########################################################################################################################################################
 def split_text(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=500,
+        length_function=len,
         add_start_index=True,
+    )
+    chunks = text_splitter.split_documents(documents) # splitting the document into chunks
+    for index in chunks:
+        index.metadata["start_index"]=str(index.metadata["start_index"]) # the converstion of int metadata to str was done to store it in sqlite3
     print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
     return chunks
+########################################################################################################################################################
+####---------------------------------------------------- Creating and Storing Data in Vector DB  ---------------------------------------------------####
+########################################################################################################################################################
+def save_to_chroma(chunks: list[Document], name: str, tables: list[Document]):
     CHROMA_PATH = f"./VectorDB/chroma_{name}"
+    TABLE_PATH = f"./TableDB/chroma_{name}"
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
+    if os.path.exists(TABLE_PATH):
+        shutil.rmtree(TABLE_PATH)
+    try:
+        # Load the embedding model
+        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+        # Create Chroma DB for documents using from_documents [NOTE: Some of the data is converted to string because int and float show null if added]
+        print("Creating document vector database...")
+        db = Chroma.from_documents(
+            documents=chunks,
+            embedding=embedding_function,
+            persist_directory=CHROMA_PATH,
+        )
+        print("Document database successfully saved.")
+        # Create Chroma DB for tables if available [NOTE: Some of the data is converted to string because int and float show null if added]
+        if tables:
+            print("Creating table vector database...")
+            tdb = Chroma.from_documents(
+                documents=tables,
+                embedding=embedding_function,
+                persist_directory=TABLE_PATH,
+            )
+            print("Table database successfully saved.")
+        else:
+            tdb = None
+        return db, tdb
+    except Exception as e:
+        print("Error while saving to Chroma:", e)
+        return None
+# def get_unique_sources(chroma_path):
+#     db = Chroma(persist_directory=chroma_path)
+#     metadata_list = db.get()["metadatas"]
+#     unique_sources = {metadata["source"] for metadata in metadata_list if "source" in metadata}
+#     return list(unique_sources)
+########################################################################################################################################################
+####----------------------------------------------------------- Updating Existing Data in Vector DB  -----------------------------------------------####
+########################################################################################################################################################
+def add_document_to_existing_db(new_documents: list[Document], db_name: str):
+    CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+    if not os.path.exists(CHROMA_PATH):
+        print(f"Database '{db_name}' does not exist. Please create it first.")
+        return
     try:
         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+        print("Adding new documents to the existing database...")
+        chunks = split_text(new_documents)
         db.add_documents(chunks)
         db.persist()
+        print("New documents added and database updated successfully.")
     except Exception as e:
+        print("Error while adding documents to existing database:", e)
+def delete_chunks_by_source(chroma_path, source_to_delete):
+    if not os.path.exists(chroma_path):
+        print(f"Database at path '{chroma_path}' does not exist.")
+        return
+    try:
+        #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+        db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)
+        print(f"Retrieving all metadata to identify chunks with source '{source_to_delete}'...")
+        metadata_list = db.get()["metadatas"]
+        # Identify indices of chunks to delete
+        indices_to_delete = [
+            idx for idx, metadata in enumerate(metadata_list) if metadata.get("source") == source_to_delete
+        ]
+        if not indices_to_delete:
+            print(f"No chunks found with source '{source_to_delete}'.")
+            return
+        print(f"Deleting {len(indices_to_delete)} chunks with source '{source_to_delete}'...")
+        db.delete(indices=indices_to_delete)
+        db.persist()
+        print("Chunks deleted and database updated successfully.")
+    except Exception as e:
+        print(f"Error while deleting chunks by source: {e}")
+# # update a data store
+# def update_data_store(file_path, db_name):
+#     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+#     print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
+#     try:
+#         documents,table_document = load_document(file_path)
+#         print("Documents loaded successfully.")
+#     except Exception as e:
+#         print(f"Error loading documents: {e}")
+#         return
+#     try:
+#         chunks = split_text(documents)
+#         print(f"Text split into {len(chunks)} chunks.")
+#     except Exception as e:
+#         print(f"Error splitting text: {e}")
+#         return
+#     try:
+#         asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
+#         print(f"Data saved to Chroma for database {db_name}.")
+#     except Exception as e:
+#         print(f"Error saving to Chroma: {e}")
+#         return
+########################################################################################################################################################
+####------------------------------------------------------- Combine Process of Load, Chunk and Store  ----------------------------------------------####
+########################################################################################################################################################
+def generate_data_store(file_path, db_name):
     CHROMA_PATH = f"./VectorDB/chroma_{db_name}"
+    print(f"Filepath ===> {file_path}  DB Name ====> {db_name}")
     try:
+        documents,processed_documents,table_document = load_document(file_path)
         print("Documents loaded successfully.")
     except Exception as e:
         print(f"Error loading documents: {e}")
         return
     try:
+        asyncio.run(save_to_chroma(save_to_chroma(chunks, db_name, table_document)))
         print(f"Data saved to Chroma for database {db_name}.")
     except Exception as e:
         print(f"Error saving to Chroma: {e}")
         return