vikramvasudevan committed on
Commit
0aef7d0
·
verified ·
1 Parent(s): 5f4344d

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. config.py +2 -1
  2. db.py +51 -2
  3. modules/db/reembed.py +4 -0
config.py CHANGED
@@ -16,7 +16,8 @@ class SanatanConfig:
16
  "name": "vishnu_puranam",
17
  "title": "Sri Vishnu Puranam",
18
  "output_dir": "./output/vishnu_puranam",
19
- "collection_name": "vishnu_puranam",
 
20
  "metadata_fields": [
21
  {"name": "file", "datatype": "str"},
22
  {"name": "num_chars", "datatype": "str"},
 
16
  "name": "vishnu_puranam",
17
  "title": "Sri Vishnu Puranam",
18
  "output_dir": "./output/vishnu_puranam",
19
+ "collection_name": "vishnu_puranam_openai",
20
+ "collection_embedding_fn": "openai",
21
  "metadata_fields": [
22
  {"name": "file", "datatype": "str"},
23
  {"name": "num_chars", "datatype": "str"},
db.py CHANGED
@@ -7,6 +7,7 @@ from pydantic import BaseModel
7
 
8
  from metadata import MetadataFilter, MetadataWhereClause
9
  from modules.db.relevance import validate_relevance_queryresult
 
10
 
11
  logger = logging.getLogger(__name__)
12
  logger.setLevel(logging.INFO)
@@ -53,7 +54,7 @@ class SanatanDatabase:
53
  )
54
 
55
  validated_response = validate_relevance_queryresult(query, response)
56
-
57
  return validated_response["result"]
58
 
59
  def search_for_literal(
@@ -175,6 +176,54 @@ class SanatanDatabase:
175
  count = self.count(collection_name=scripture["collection_name"])
176
  if count == 0:
177
  raise Exception(f"No data in collection {scripture["collection_name"]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
 
180
  if __name__ == "__main__":
@@ -213,4 +262,4 @@ if __name__ == "__main__":
213
  # print("Document: ")
214
  # print(response["documents"][0][0])
215
  # print("Metadata: ")
216
- # print(response["metadatas"][0][0])
 
7
 
8
  from metadata import MetadataFilter, MetadataWhereClause
9
  from modules.db.relevance import validate_relevance_queryresult
10
+ from tqdm import tqdm
11
 
12
  logger = logging.getLogger(__name__)
13
  logger.setLevel(logging.INFO)
 
54
  )
55
 
56
  validated_response = validate_relevance_queryresult(query, response)
57
+
58
  return validated_response["result"]
59
 
60
  def search_for_literal(
 
176
  count = self.count(collection_name=scripture["collection_name"])
177
  if count == 0:
178
  raise Exception(f"No data in collection {scripture["collection_name"]}")
179
+
180
+ def reembed_collection_openai(self, collection_name: str, batch_size: int = 50):
181
+ """
182
+ Deletes and recreates a Chroma collection with OpenAI text-embedding-3-large embeddings.
183
+ All existing documents are re-embedded and inserted into the new collection.
184
+
185
+ Args:
186
+ collection_name: The name of the collection to delete/recreate.
187
+ batch_size: Number of documents to process per batch.
188
+ """
189
+ # Step 1: Fetch old collection data (if exists)
190
+ try:
191
+ old_collection = self.chroma_client.get_collection(name=collection_name)
192
+ old_data = old_collection.get(include=["documents", "metadatas"])
193
+ documents = old_data["documents"]
194
+ metadatas = old_data["metadatas"]
195
+ ids = old_data["ids"]
196
+ print(f"Fetched {len(documents)} documents from old collection.")
197
+
198
+ # Step 2: Delete old collection
199
+ # self.chroma_client.delete_collection(collection_name)
200
+ # print(f"Deleted old collection '{collection_name}'.")
201
+ except chromadb.errors.NotFoundError:
202
+ print(f"No existing collection named '{collection_name}', starting fresh.")
203
+ documents, metadatas, ids = [], [], []
204
+
205
+ # Step 3: Create new collection with correct embedding dimension
206
+ new_collection = self.chroma_client.create_collection(
207
+ name=f"{collection_name}_openai",
208
+ embedding_function=None, # embeddings will be provided manually
209
+ )
210
+ print(f"Created new collection '{collection_name}_openai' with embedding_dim=3072.")
211
+
212
+ # Step 4: Re-embed and insert documents in batches
213
+ for i in tqdm(range(0, len(documents), batch_size), desc="Re-embedding batches"):
214
+ batch_docs = documents[i:i+batch_size]
215
+ batch_metadatas = metadatas[i:i+batch_size]
216
+ batch_ids = ids[i:i+batch_size]
217
+
218
+ embeddings = get_embedding(batch_docs, backend="openai")
219
+
220
+ new_collection.add(
221
+ ids=batch_ids,
222
+ documents=batch_docs,
223
+ metadatas=batch_metadatas,
224
+ embeddings=embeddings
225
+ )
226
+ print("All documents re-embedded and added to new collection successfully!")
227
 
228
 
229
  if __name__ == "__main__":
 
262
  # print("Document: ")
263
  # print(response["documents"][0][0])
264
  # print("Metadata: ")
265
+ # print(response["metadatas"][0][0])
modules/db/reembed.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from db import SanatanDatabase

# One-shot maintenance script: re-embed the vishnu_puranam collection
# into a parallel OpenAI-embedded collection.
if __name__ == "__main__":
    database = SanatanDatabase()
    database.reembed_collection_openai(collection_name="vishnu_puranam")