Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

App Files Community

vikramvasudevan committed on 10 days ago

Commit

fac55e5

verified ·

1 Parent(s): 695466f

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

data/azhwars.json +5 -0
db.py +17 -2
modules/config/divya_prabandham_taniyans.py +16 -12
nalayiram_helper.py +36 -2
tests/test_divya_prabandham_verse_fix.py +9 -3

data/azhwars.json CHANGED Viewed

@@ -128,5 +128,10 @@
       "RNA",
       "Thiruvarangathu Amutanar",
       "Iramanusa Nootranthathi"
     ]
   ]

       "RNA",
       "Thiruvarangathu Amutanar",
       "Iramanusa Nootranthathi"
+    ],
+    [
+      "taniyan",
+      "NA",
+      "Common"
     ]
   ]

db.py CHANGED Viewed

@@ -208,7 +208,7 @@ class SanatanDatabase:
         # If the conversion returns an empty dict, treat it as None
         if isinstance(where_clause, dict) and not where_clause:
             where_clause = None
         # First, try strict filter
         data = collection.get(include=["metadatas", "documents"], where=where_clause)
@@ -751,9 +751,19 @@ class SanatanDatabase:
         df["_id"] = ids
         df["_doc"] = documents
         # Add sortable columns for each unique field
         for field_name in unique_fields:
-            if field_name.lower() == "chapter" and chapter_order_mapping:
                 # Map chapter names to their defined order
                 df["_sort_" + field_name] = (
                     df[field_name].map(chapter_order_mapping).fillna(np.inf)
@@ -773,6 +783,11 @@ class SanatanDatabase:
                 df["_sort_" + field_name] = df[field_name].apply(parse_val)
         sort_cols = ["_sort_" + f for f in unique_fields]
         df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
         # Assign global index

         # If the conversion returns an empty dict, treat it as None
         if isinstance(where_clause, dict) and not where_clause:
             where_clause = None
         # First, try strict filter
         data = collection.get(include=["metadatas", "documents"], where=where_clause)
         df["_id"] = ids
         df["_doc"] = documents
+        logger.info(
+            "build_global_index_for_all_scriptures:%s:unique_fields: %s",
+            scripture_name,
+            unique_fields,
+        )
         # Add sortable columns for each unique field
         for field_name in unique_fields:
+            if field_name.lower() in ("chapter","prabandham_name") and chapter_order_mapping:
+                logger.info(
+                    "build_global_index_for_all_scriptures:%s:sorting",
+                    scripture_name,
+                )
                 # Map chapter names to their defined order
                 df["_sort_" + field_name] = (
                     df[field_name].map(chapter_order_mapping).fillna(np.inf)
                 df["_sort_" + field_name] = df[field_name].apply(parse_val)
         sort_cols = ["_sort_" + f for f in unique_fields]
+        logger.info(
+                    "build_global_index_for_all_scriptures:%s:sort_cols=%s",
+                    scripture_name,
+                    sort_cols
+                )
         df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
         # Assign global index

modules/config/divya_prabandham_taniyans.py CHANGED Viewed

@@ -14,6 +14,7 @@ divya_prabandham_taniyans_config = {
     "collection_embedding_fn": "openai",
     "unit": "taniyan",
     "unit_field": "verse",
     "field_mapping": {
         "text": "pasuram_ta",
         "title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
@@ -23,7 +24,6 @@ divya_prabandham_taniyans_config = {
         "transliteration": "pasuram_en",
         "reference_link": "html_url",
         "author": "author",
-        # "chapter_name": "prabandham_name",
         "relative_path": lambda doc: "-".join(
             filter(
                 None,
@@ -32,13 +32,6 @@ divya_prabandham_taniyans_config = {
         ),
     },
     "metadata_fields": [
-        {
-            "name": "prabandham_code",
-            "label": "Prabandham Code",
-            "datatype": "str",
-            "description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
-            "is_unique": True,
-        },
         {
             "name": "prabandham_name",
             "label": "Prabandham Name",
@@ -46,10 +39,21 @@ divya_prabandham_taniyans_config = {
             "description": "contains the prabandham name. e.g. `Thiruppallandu`",
             "show_as_filter": True,
             "component": "dropdown",
-            "lov": lambda: [
-                p.prabandham_name
-                for p in nalayiram_helper.get_standardized_prabandham_names()
-            ],
             "is_unique": True,
         },
         {

     "collection_embedding_fn": "openai",
     "unit": "taniyan",
     "unit_field": "verse",
+    "chapter_order": lambda: nalayiram_helper.get_prabandham_chapter_order_mapping(),
     "field_mapping": {
         "text": "pasuram_ta",
         "title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
         "transliteration": "pasuram_en",
         "reference_link": "html_url",
         "author": "author",
         "relative_path": lambda doc: "-".join(
             filter(
                 None,
         ),
     },
     "metadata_fields": [
         {
             "name": "prabandham_name",
             "label": "Prabandham Name",
             "description": "contains the prabandham name. e.g. `Thiruppallandu`",
             "show_as_filter": True,
             "component": "dropdown",
+            "lov": lambda: list(
+                set(
+                    [
+                        p.prabandham_name
+                        for p in nalayiram_helper.get_standardized_prabandham_names()
+                    ]
+                )
+            ),
+            "is_unique": True,
+        },
+        {
+            "name": "prabandham_code",
+            "label": "Prabandham Code",
+            "datatype": "str",
+            "description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
             "is_unique": True,
         },
         {

nalayiram_helper.py CHANGED Viewed

@@ -71,6 +71,7 @@ def get_standardized_divya_desam_names() -> list[str]:
     ]
     return sorted(set([row["title"] for row in data]))
 def reorder_taniyan(collection):
     logger.info("reorder_taniyan: started")
@@ -160,7 +161,7 @@ def reorder_taniyan(collection):
 def delete_taniyan(collection):
-    logger.info("delete_taniyan: started")
     # Fetch all docs (only ids + metadata needed)
     data = collection.get(include=["metadatas"])
@@ -169,7 +170,8 @@ def delete_taniyan(collection):
     # Collect ids where section_type starts with "taniyan"
     taniyan_ids = [
-        ids[i] for i, meta in enumerate(metas)
         if meta.get("section_type", "").startswith("taniyan")
     ]
@@ -183,5 +185,37 @@ def delete_taniyan(collection):
     logger.info("delete_taniyan: finished")
 if __name__ == "__main__":
     logger.info(get_standardized_azhwar_names())

     ]
     return sorted(set([row["title"] for row in data]))
 def reorder_taniyan(collection):
     logger.info("reorder_taniyan: started")
 def delete_taniyan(collection):
+    logger.info("delete_taniyan: started")
     # Fetch all docs (only ids + metadata needed)
     data = collection.get(include=["metadatas"])
     # Collect ids where section_type starts with "taniyan"
     taniyan_ids = [
+        ids[i]
+        for i, meta in enumerate(metas)
         if meta.get("section_type", "").startswith("taniyan")
     ]
     logger.info("delete_taniyan: finished")
+def get_prabandham_chapter_order_mapping():
+    chapter_names = [
+        "Common",
+        "Thiruppallāṇḍu",
+        "Periyazvar Thirumozhi",
+        "Thiruppavai",
+        "Nachiyar Thirumozhi",
+        "Perumal Thirumozhi",
+        "Thiruchandavirutham",
+        "Thirumalai",
+        "Thirupalliezhuchi",
+        "Amalanadipiran",
+        "Kanninunchiruthambu",
+        "Periya Thirumozhi",
+        "Thirukurunthandakam",
+        "Thirunedumthandakam",
+        "Muthal Thiruvanthathi",
+        "Irandam Thiruvanthathi",
+        "Moonram Thiruvanthathi",
+        "Nanmukan Thiruvanthathi",
+        "Thiruvirutham",
+        "Thiruvasiriyam",
+        "Periya Thiruvanthathi",
+        "Thiruvezhukootrarikkai",
+        "Siriya Thirumadal",
+        "Periya Thirumadal",
+        "Thiruvaimozhi",
+        "Iramanusa Nootranthathi",
+    ]
+    section_dict = {name: i + 1 for i, name in enumerate(chapter_names)}
+    return section_dict
 if __name__ == "__main__":
     logger.info(get_standardized_azhwar_names())

tests/test_divya_prabandham_verse_fix.py CHANGED Viewed

@@ -1,13 +1,19 @@
 import json
 import logging
 from db import SanatanDatabase
 from metadata import MetadataFilter, MetadataWhereClause
 if __name__ == "__main__":
     logging.basicConfig()
-    collection_name = "divya_prabandham"
     database = SanatanDatabase()
-    database.delete_taniyans_in_divya_prabandham()
-    database.fix_taniyans_in_divya_prabandham()

 import json
 import logging
+from config import SanatanConfig
 from db import SanatanDatabase
 from metadata import MetadataFilter, MetadataWhereClause
 if __name__ == "__main__":
     logging.basicConfig()
+    collection_name = "divya_prabandham_taniyans"
     database = SanatanDatabase()
+    # database.delete_taniyans_in_divya_prabandham()
+    # database.fix_taniyans_in_divya_prabandham()
+    config = SanatanConfig()
+    c = config.get_scripture_by_name("divya_prabandham_taniyans")
+    database.build_global_index_for_scripture(c)
+    results = database.get(collection_name,None,n_results=2)
+    print(results)