Spaces:
Runtime error
Runtime error
Add batching to db load
Browse files - models/etl.py (+17 -9)
models/etl.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import chromadb
|
| 3 |
from datetime import datetime
|
|
|
|
| 4 |
|
| 5 |
from utils.general_utils import timeit
|
| 6 |
from utils.embedding_utils import MyEmbeddingFunction
|
|
@@ -96,16 +97,23 @@ def load_data_to_db(db_path, data):
|
|
| 96 |
|
| 97 |
collection = client.get_collection("huberman_videos")
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
print(f"Data loaded to database at {db_path}.")
|
| 110 |
|
| 111 |
|
|
|
|
| 1 |
import json
|
| 2 |
import chromadb
|
| 3 |
from datetime import datetime
|
| 4 |
+
import math
|
| 5 |
|
| 6 |
from utils.general_utils import timeit
|
| 7 |
from utils.embedding_utils import MyEmbeddingFunction
|
|
|
|
| 97 |
|
| 98 |
collection = client.get_collection("huberman_videos")
|
| 99 |
|
| 100 |
+
num_rows = len(data)
|
| 101 |
+
batch_size = 5461
|
| 102 |
+
num_batches = math.ceil(num_rows / batch_size)
|
| 103 |
+
|
| 104 |
+
for i in range(num_batches):
|
| 105 |
+
batch_data = data[i * batch_size : (i + 1) * batch_size]
|
| 106 |
+
documents = [segment['text'] for segment in batch_data]
|
| 107 |
+
metadata = [segment['metadata'] for segment in batch_data]
|
| 108 |
+
ids = [segment['metadata']['segment_id'] for segment in batch_data]
|
| 109 |
|
| 110 |
+
collection.add(
|
| 111 |
+
documents=documents,
|
| 112 |
+
metadatas=metadata,
|
| 113 |
+
ids=ids
|
| 114 |
+
)
|
| 115 |
+
print(f"Batch {i+1} of {num_batches} loaded to database.")
|
| 116 |
+
|
| 117 |
print(f"Data loaded to database at {db_path}.")
|
| 118 |
|
| 119 |
|