sanatan_ai / copy_chromadb.py
vikramvasudevan's picture
Upload folder using huggingface_hub
ec6e2b2 verified
raw
history blame
5.96 kB
import argparse
import chromadb
from tqdm import tqdm # Optional: For progress bar
db_config = {
"youtube_db": {
"source_db_path": "../youtube_surfer_ai_agent/youtube_db",
"source_collection_name": "yt_metadata",
"destination_collection_name": "yt_metadata",
},
"divya_prabandham": {
"source_db_path": "../uveda_analyzer/chromadb_store",
"source_collection_name": "divya_prabandham",
"destination_collection_name": "divya_prabandham",
},
"divya_prabandham_taniyans": {
"source_db_path": "../uveda_analyzer/chromadb_store",
"source_collection_name": "divya_prabandham_taniyans",
"destination_collection_name": "divya_prabandham_taniyans",
},
"vishnu_sahasranamam": {
"source_db_path": "../vishnu_sahasranamam_ai/output/chroma_store",
"source_collection_name": "vishnu_sahasranamam",
"destination_collection_name": "vishnu_sahasranamam_openai",
},
"bhagavat_gita": {
"source_db_path": "../bhagavat_gita_chat/chromadb_store",
"source_collection_name": "bhagavat_gita",
"destination_collection_name": "bhagavat_gita_openai",
},
"pancha_sooktham": {
"source_db_path": "../sooktham_ai/chromadb_store",
"source_collection_name": "pancha_sooktham",
"destination_collection_name": "pancha_sooktham",
},
"taitriya_upanishad": {
"source_db_path": "../taitriya_upanishad_ai/chromadb_store",
"source_collection_name": "taitriya_upanishad",
"destination_collection_name": "taitriya_upanishad",
},
"shanthi_panchakam": {
"source_db_path": "../shanthi_panchakam_ai/chromadb_store",
"source_collection_name": "shanthi_panchakam",
"destination_collection_name": "shanthi_panchakam",
},
"taitriya_samhitha": {
"source_db_path": "../taitriya_samhitha_ai/chromadb_store",
"source_collection_name": "taitriya_samhitha",
"destination_collection_name": "taitriya_samhitha",
},
"taitriya_brahmanam": {
"source_db_path": "../taitriya_brahmanam_ai/chromadb_store",
"source_collection_name": "taitriya_brahmanam",
"destination_collection_name": "taitriya_brahmanam",
},
"katakam": {
"source_db_path": "../taitriya_brahmanam_ai/chromadb_store",
"source_collection_name": "katakam",
"destination_collection_name": "katakam",
},
"sri_stavam": {
"source_db_path": "../vedam_ai/chromadb-store",
"source_collection_name": "sri_stavam",
"destination_collection_name": "sri_stavam",
},
"taitriya_aranyakam": {
"source_db_path": "../taitriya_aranyakam_ai/chromadb_store",
"source_collection_name": "taitriya_aranyakam",
"destination_collection_name": "taitriya_aranyakam",
},
"brahma_sutra": {
"source_db_path": "../brahma_sutra_ai/chromadb_store",
"source_collection_name": "brahma_sutra",
"destination_collection_name": "brahma_sutra",
},
"valmiki_ramayanam": {
"source_db_path": "../valmiki_ramayanam_ai/chromadb_store",
"source_collection_name": "valmiki_ramayanam",
"destination_collection_name": "valmiki_ramayanam_openai",
},
"sri_vachana_bhushanam": {
"source_db_path": "../sri_vachana_bhushanam_ai/chromadb_store",
"source_collection_name": "sri_vachana_bhushanam",
"destination_collection_name": "sri_vachana_bhushanam",
},
}
parser = argparse.ArgumentParser(description="My app with database parameter")
parser.add_argument(
"--db",
type=str,
required=True,
choices=list(db_config.keys()),
help=f"Id of the database to use. allowed_values : {', '.join(db_config.keys())}",
)
args = parser.parse_args()
db_id = args.db
if db_id is None:
raise Exception(f"No db provided!")
if db_id not in db_config:
raise Exception(f"db with id {db_id} not found!")
# Connect to source and destination local persistent clients
source_client = chromadb.PersistentClient(path=db_config[db_id]["source_db_path"])
destination_client = chromadb.PersistentClient(path="./chromadb-store")
source_collection_name = db_config[db_id]["source_collection_name"]
destination_collection_name = db_config[db_id]["destination_collection_name"]
# Get the source collection
source_collection = source_client.get_collection(source_collection_name)
# Retrieve all data from the source collection
source_data = source_collection.get(include=["documents", "metadatas", "embeddings"])
# Create or get the destination collection
if destination_client.get_or_create_collection(destination_collection_name):
print("Deleting existing collection", destination_collection_name)
destination_client.delete_collection(destination_collection_name)
destination_collection = destination_client.get_or_create_collection(
destination_collection_name,
metadata=source_collection.metadata, # Copy metadata if needed
)
# Add data to the destination collection in batches
BATCH_SIZE = 500
total_records = len(source_data["ids"])
print(f"Copying {total_records} records in batches of {BATCH_SIZE}...")
for i in tqdm(range(0, total_records, BATCH_SIZE)):
batch_ids = source_data["ids"][i : i + BATCH_SIZE]
batch_docs = source_data["documents"][i : i + BATCH_SIZE]
batch_metas = source_data["metadatas"][i : i + BATCH_SIZE]
batch_embeds = (
source_data["embeddings"][i : i + BATCH_SIZE]
if "embeddings" in source_data and source_data["embeddings"] is not None
else None
)
destination_collection.add(
ids=batch_ids,
documents=batch_docs,
metadatas=batch_metas,
embeddings=batch_embeds,
)
print("✅ Collection copied successfully!")
print("Total records in source collection = ", source_collection.count())
print("Total records in destination collection = ", destination_collection.count())