Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload folder using huggingface_hub
Browse files
config.py
CHANGED
|
@@ -352,6 +352,15 @@ class SanatanConfig:
|
|
| 352 |
"output_dir": "./output/kamba_ramayanam",
|
| 353 |
"collection_name": "kamba_ramayanam_en",
|
| 354 |
"unit": "verse",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
"metadata_fields": [
|
| 356 |
{
|
| 357 |
"name": "kandam",
|
|
@@ -668,7 +677,6 @@ class SanatanConfig:
|
|
| 668 |
):
|
| 669 |
canonical_doc["text"] = canonical_doc["document"]
|
| 670 |
canonical_doc["document"] = "-"
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
)
|
| 674 |
return canonical_doc
|
|
|
|
| 352 |
"output_dir": "./output/kamba_ramayanam",
|
| 353 |
"collection_name": "kamba_ramayanam_en",
|
| 354 |
"unit": "verse",
|
| 355 |
+
"unit_field": "verse_number",
|
| 356 |
+
"field_mapping": {
|
| 357 |
+
"chapter_name": lambda doc: f"{doc.get('kandam','')}",
|
| 358 |
+
"title": lambda doc: f"{doc.get('padalam_ta','')} - {doc.get('padalam_en','')}".strip(),
|
| 359 |
+
"author": lambda doc: "Kamban",
|
| 360 |
+
"unit_index": "verse_number",
|
| 361 |
+
"verse": lambda doc: int(doc.get("verse_number", "0")),
|
| 362 |
+
"relative_path": lambda doc: f"{doc.get('padalam_ta','')} - {doc.get('padalam_en','')}".strip(),
|
| 363 |
+
},
|
| 364 |
"metadata_fields": [
|
| 365 |
{
|
| 366 |
"name": "kandam",
|
|
|
|
| 677 |
):
|
| 678 |
canonical_doc["text"] = canonical_doc["document"]
|
| 679 |
canonical_doc["document"] = "-"
|
| 680 |
+
verse = resolve_field(config.get("unit_field", config.get("unit")))
|
| 681 |
+
canonical_doc["verse"] = int(verse) if verse else 0
|
|
|
|
| 682 |
return canonical_doc
|
db.py
CHANGED
|
@@ -112,7 +112,7 @@ class SanatanDatabase:
|
|
| 112 |
n_results=n_results,
|
| 113 |
)
|
| 114 |
|
| 115 |
-
def fetch_document_by_index(self, collection_name: str, index: int, unit_name
|
| 116 |
"""
|
| 117 |
Fetch one document at a time from a ChromaDB collection using pagination (index = 0-based).
|
| 118 |
|
|
@@ -133,18 +133,11 @@ class SanatanDatabase:
|
|
| 133 |
collection = self.chroma_client.get_or_create_collection(name=collection_name)
|
| 134 |
|
| 135 |
try:
|
| 136 |
-
# show a sample data record
|
| 137 |
-
response = collection.get(
|
| 138 |
-
limit=2,
|
| 139 |
-
# offset=index, # pagination via offset
|
| 140 |
-
include=["metadatas", "documents"],
|
| 141 |
-
)
|
| 142 |
-
print(response)
|
| 143 |
response = collection.get(
|
| 144 |
limit=1,
|
| 145 |
# offset=index, # pagination via offset
|
| 146 |
include=["metadatas", "documents"],
|
| 147 |
-
where={unit_name: index}
|
| 148 |
)
|
| 149 |
except Exception as e:
|
| 150 |
logger.error("Error fetching document: %s", e)
|
|
@@ -158,11 +151,19 @@ class SanatanDatabase:
|
|
| 158 |
result = {"document": documents[0]}
|
| 159 |
if metadatas:
|
| 160 |
result.update(metadatas[0])
|
|
|
|
| 161 |
return result
|
| 162 |
else:
|
| 163 |
print("No data available")
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
|
|
|
| 166 |
|
| 167 |
def search_semantic(
|
| 168 |
self,
|
|
|
|
| 112 |
n_results=n_results,
|
| 113 |
)
|
| 114 |
|
| 115 |
+
def fetch_document_by_index(self, collection_name: str, index: int, unit_name: str):
|
| 116 |
"""
|
| 117 |
Fetch one document at a time from a ChromaDB collection using pagination (index = 0-based).
|
| 118 |
|
|
|
|
| 133 |
collection = self.chroma_client.get_or_create_collection(name=collection_name)
|
| 134 |
|
| 135 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
response = collection.get(
|
| 137 |
limit=1,
|
| 138 |
# offset=index, # pagination via offset
|
| 139 |
include=["metadatas", "documents"],
|
| 140 |
+
where={"$or": [{unit_name: index}, {unit_name: str(index)}]},
|
| 141 |
)
|
| 142 |
except Exception as e:
|
| 143 |
logger.error("Error fetching document: %s", e)
|
|
|
|
| 151 |
result = {"document": documents[0]}
|
| 152 |
if metadatas:
|
| 153 |
result.update(metadatas[0])
|
| 154 |
+
print("raw data = ", result)
|
| 155 |
return result
|
| 156 |
else:
|
| 157 |
print("No data available")
|
| 158 |
+
# show a sample data record
|
| 159 |
+
response = collection.get(
|
| 160 |
+
limit=2,
|
| 161 |
+
# offset=index, # pagination via offset
|
| 162 |
+
include=["metadatas", "documents"],
|
| 163 |
+
)
|
| 164 |
+
print("sample data : ",response)
|
| 165 |
|
| 166 |
+
return {"error": "No data available."}
|
| 167 |
|
| 168 |
def search_semantic(
|
| 169 |
self,
|