"""Load pre-chunked documents from a pickle file and embed them into a
persistent Chroma vectorstore in batches.

Run chunk_and_embed.py first to produce CHUNKS_PATH.
"""

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle
import os

CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
DB_DIR = "E:/courses/LangChain Project/main root/db"
BATCH_SIZE = 100  # You can tune this depending on average token size per chunk


def main() -> None:
    """Embed all chunks and persist them to the Chroma DB directory.

    Raises:
        FileNotFoundError: if the chunks pickle has not been generated yet.
    """
    if not os.path.exists(CHUNKS_PATH):
        raise FileNotFoundError("Run chunk_and_embed.py first")

    # NOTE(review): pickle.load on untrusted files is unsafe — acceptable here
    # only because the file is produced locally by chunk_and_embed.py.
    with open(CHUNKS_PATH, "rb") as f:
        chunks = pickle.load(f)

    embedding = OpenAIEmbeddings(model="text-embedding-3-small")

    # Create or load the vectorstore; with persist_directory set, Chroma
    # persists automatically (explicit .persist() is no longer needed).
    vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)

    print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")

    # Ceiling division: correct even when len(chunks) is an exact multiple
    # of BATCH_SIZE (the old `len // BATCH_SIZE + 1` over-counted by one).
    total_batches = -(-len(chunks) // BATCH_SIZE)

    # Add documents in batches to avoid hitting token limits
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i:i + BATCH_SIZE]
        vectorstore.add_documents(batch)
        print(f"✅ Added batch {i // BATCH_SIZE + 1} of {total_batches}")

    print(f"✅ Vectorstore saved to {DB_DIR}")


# Guard so importing this module doesn't trigger file I/O and embedding calls.
if __name__ == "__main__":
    main()