import json
from dataclasses import dataclass
from collections import defaultdict
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@dataclass
class Pasuram:
    prabandham_code: str
    azhwar_name: str
    prabandham_name: str

def get_standardized_prabandham_names() -> list[Pasuram]:
    """
    Get a list of prabandham names along with the azhwars who authored them in divya_prabandham,
    sorted by the prabandham name (3rd field, index 2).
    """
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        azhwars = json.load(f)
    header = azhwars[0]
    rows = azhwars[1:]
    # Sort by the 3rd field (index 2), i.e. the prabandham name
    rows.sort(key=lambda row: row[2])
    final_azhwars = [Pasuram(**dict(zip(header, row))) for row in rows]
    return final_azhwars
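
# Hypothetical sketch of the ./data/azhwars.json layout the loaders in this module assume:
# a header row followed by data rows, so that zip(header, row) yields exactly the Pasuram
# fields above. The values are illustrative placeholders, not taken from the real data file:
#
#   [
#     ["prabandham_code", "azhwar_name", "prabandham_name"],
#     ["<code-1>", "<azhwar-1>", "<prabandham-1>"],
#     ["<code-2>", "<azhwar-2>", "<prabandham-2>"]
#   ]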

def get_standardized_azhwar_names() -> list[str]:
    """
    Get a sorted, de-duplicated list of the azhwar names appearing in divya_prabandham.
    """
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        azhwars = json.load(f)
    header = azhwars[0]
    rows = azhwars[1:]
    final_azhwars = [row[1] for row in rows]  # 2nd field (index 1) is the azhwar name
    return sorted(set(final_azhwars))

def get_standardized_divya_desam_names() -> list[str]:
    """
    Get a list of divya desam names in divya_prabandham
    """
    with open("./data/divya_desams.json", "r", encoding="utf-8") as f:
        divya_desams = json.load(f)
    selected_fields = [
        "title",
        "other_names",
        "name_ta",
        "alwars",
        "area",
        "state",
        "thirukolam",
        "direction",
        "sampradayam",
        "divya_desam",
    ]
    data = [
        {key: row[key] for key in selected_fields if key in row}
        for row in divya_desams["pageProps"]["hits"]
    ]
    return sorted(set(row["title"] for row in data))
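
# Hypothetical sketch of the ./data/divya_desams.json layout the function above assumes:
# a JSON object whose temple records sit under pageProps.hits, each record carrying some
# subset of the selected_fields keys. Keys shown are from the code; values are placeholders:
#
#   {
#     "pageProps": {
#       "hits": [
#         {"title": "<divya desam name>", "area": "<area>", "state": "<state>", "alwars": "<azhwars>"}
#       ]
#     }
#   }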

def reorder_taniyan(collection):
    logger.info("reorder_taniyan: started")
    # Fetch all docs with ids + metadatas
    data = collection.get(include=["metadatas"])
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])
    if not ids or not metas:
        logger.warning("reorder_taniyan: no data found in collection")
        return

    # Sort globally by current _global_index
    records = sorted(
        [(i, m) for i, m in enumerate(metas)],
        key=lambda x: x[1].get("_global_index", float("inf")),
    )

    # Group by prabandham_code
    grouped = defaultdict(list)
    for i, meta in records:
        prabandham = meta.get("prabandham_code")
        if prabandham:
            grouped[prabandham].append((i, meta))

    updates = []
    global_counter = 1  # running _global_index across the collection
    for prabandham, items in grouped.items():
        taniyan_items = [
            (i, m) for i, m in items if m.get("section_type", "").startswith("taniyan")
        ]
        non_taniyan_items = [
            (i, m)
            for i, m in items
            if not m.get("section_type", "").startswith("taniyan")
        ]
        if not taniyan_items and not non_taniyan_items:
            continue

        # Sort both groups by original _global_index
        taniyan_items.sort(key=lambda x: x[1]["_global_index"])
        non_taniyan_items.sort(key=lambda x: x[1]["_global_index"])

        # --- taniyans first (verse starts from 1) ---
        for verse_no, (i, meta) in enumerate(taniyan_items, start=1):
            updates.append(
                {
                    "id": ids[i],
                    "metadata": {
                        **meta,
                        "_global_index": global_counter,
                        "verse": verse_no,
                    },
                }
            )
            global_counter += 1

        # --- non-taniyans continue from their base verse ---
        if non_taniyan_items:
            base_verse = min(m["verse"] for _, m in non_taniyan_items)
            for offset, (i, meta) in enumerate(non_taniyan_items):
                updates.append(
                    {
                        "id": ids[i],
                        "metadata": {
                            **meta,
                            "_global_index": global_counter,
                            "verse": base_verse + offset,
                        },
                    }
                )
                global_counter += 1

    if updates:
        logger.info("reorder_taniyan: updating %d records...", len(updates))
        collection.update(
            ids=[u["id"] for u in updates],
            metadatas=[u["metadata"] for u in updates],
        )
        logger.info("reorder_taniyan: update complete.")
    else:
        logger.info("reorder_taniyan: nothing to update")
    logger.info("reorder_taniyan: finished")
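
# Minimal usage sketch for reorder_taniyan. It assumes a Chroma-style vector-store
# collection exposing get()/update()/delete() with the signatures used in this module;
# the client path and collection name below are hypothetical, not taken from this project.
#
#   import chromadb
#   client = chromadb.PersistentClient(path="./chroma")               # hypothetical path
#   collection = client.get_or_create_collection("divya_prabandham")  # hypothetical name
#   reorder_taniyan(collection)  # renumbers taniyans first, then the remaining verses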

def delete_taniyan(collection):
    logger.info("delete_taniyan: started")
    # Fetch all docs (only ids + metadata needed)
    data = collection.get(include=["metadatas"])
    ids = data["ids"]
    metas = data["metadatas"]
    # Collect ids where section_type starts with "taniyan"
    taniyan_ids = [
        ids[i]
        for i, meta in enumerate(metas)
        if meta.get("section_type", "").startswith("taniyan")
    ]
    if taniyan_ids:
        logger.info("delete_taniyan: Deleting %d taniyan records...", len(taniyan_ids))
        collection.delete(ids=taniyan_ids)
        logger.info("delete_taniyan: Deleted %d taniyan records", len(taniyan_ids))
    else:
        logger.info("delete_taniyan: No taniyan records found")
    logger.info("delete_taniyan: finished")
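
# Usage sketch, under the same Chroma-style collection assumption as the sketch above:
#
#   delete_taniyan(collection)  # removes every record whose section_type starts with "taniyan"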

def get_prabandham_chapter_order_mapping():
    chapter_names = [
        "Common",
        "Thiruppallāṇḍu",
        "Periyazvar Thirumozhi",
        "Thiruppavai",
        "Nachiyar Thirumozhi",
        "Perumal Thirumozhi",
        "Thiruchandavirutham",
        "Thirumalai",
        "Thirupalliezhuchi",
        "Amalanadipiran",
        "Kanninunchiruthambu",
        "Periya Thirumozhi",
        "Thirukurunthandakam",
        "Thirunedumthandakam",
        "Muthal Thiruvanthathi",
        "Irandam Thiruvanthathi",
        "Moonram Thiruvanthathi",
        "Nanmukan Thiruvanthathi",
        "Thiruvirutham",
        "Thiruvasiriyam",
        "Periya Thiruvanthathi",
        "Thiruvezhukootrarikkai",
        "Siriya Thirumadal",
        "Periya Thirumadal",
        "Thiruvaimozhi",
        "Iramanusa Nootranthathi",
    ]
    # Map each chapter name to its 1-based position in the order above
    section_dict = {name: i + 1 for i, name in enumerate(chapter_names)}
    return section_dict
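
# Usage sketch: look up a chapter's 1-based position in the canonical order. A name
# outside the list above would raise a KeyError, so callers may prefer .get().
#
#   order = get_prabandham_chapter_order_mapping()
#   order["Thiruppavai"]              # -> 4 (fourth entry in chapter_names)
#   order.get("Unknown name", None)   # -> None for a name not in the list (illustrative)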

if __name__ == "__main__":
    logger.info(get_standardized_azhwar_names())