vikramvasudevan committed on
Commit
fac55e5
·
verified ·
1 Parent(s): 695466f

Upload folder using huggingface_hub

Browse files
data/azhwars.json CHANGED
@@ -128,5 +128,10 @@
128
  "RNA",
129
  "Thiruvarangathu Amutanar",
130
  "Iramanusa Nootranthathi"
 
 
 
 
 
131
  ]
132
  ]
 
128
  "RNA",
129
  "Thiruvarangathu Amutanar",
130
  "Iramanusa Nootranthathi"
131
+ ],
132
+ [
133
+ "taniyan",
134
+ "NA",
135
+ "Common"
136
  ]
137
  ]
db.py CHANGED
@@ -208,7 +208,7 @@ class SanatanDatabase:
208
  # If the conversion returns an empty dict, treat it as None
209
  if isinstance(where_clause, dict) and not where_clause:
210
  where_clause = None
211
-
212
  # First, try strict filter
213
  data = collection.get(include=["metadatas", "documents"], where=where_clause)
214
 
@@ -751,9 +751,19 @@ class SanatanDatabase:
751
  df["_id"] = ids
752
  df["_doc"] = documents
753
 
 
 
 
 
 
 
754
  # Add sortable columns for each unique field
755
  for field_name in unique_fields:
756
- if field_name.lower() == "chapter" and chapter_order_mapping:
 
 
 
 
757
  # Map chapter names to their defined order
758
  df["_sort_" + field_name] = (
759
  df[field_name].map(chapter_order_mapping).fillna(np.inf)
@@ -773,6 +783,11 @@ class SanatanDatabase:
773
  df["_sort_" + field_name] = df[field_name].apply(parse_val)
774
 
775
  sort_cols = ["_sort_" + f for f in unique_fields]
 
 
 
 
 
776
  df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
777
 
778
  # Assign global index
 
208
  # If the conversion returns an empty dict, treat it as None
209
  if isinstance(where_clause, dict) and not where_clause:
210
  where_clause = None
211
+
212
  # First, try strict filter
213
  data = collection.get(include=["metadatas", "documents"], where=where_clause)
214
 
 
751
  df["_id"] = ids
752
  df["_doc"] = documents
753
 
754
+ logger.info(
755
+ "build_global_index_for_all_scriptures:%s:unique_fields: %s",
756
+ scripture_name,
757
+ unique_fields,
758
+ )
759
+
760
  # Add sortable columns for each unique field
761
  for field_name in unique_fields:
762
+ if field_name.lower() in ("chapter","prabandham_name") and chapter_order_mapping:
763
+ logger.info(
764
+ "build_global_index_for_all_scriptures:%s:sorting",
765
+ scripture_name,
766
+ )
767
  # Map chapter names to their defined order
768
  df["_sort_" + field_name] = (
769
  df[field_name].map(chapter_order_mapping).fillna(np.inf)
 
783
  df["_sort_" + field_name] = df[field_name].apply(parse_val)
784
 
785
  sort_cols = ["_sort_" + f for f in unique_fields]
786
+ logger.info(
787
+ "build_global_index_for_all_scriptures:%s:sort_cols=%s",
788
+ scripture_name,
789
+ sort_cols
790
+ )
791
  df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
792
 
793
  # Assign global index
modules/config/divya_prabandham_taniyans.py CHANGED
@@ -14,6 +14,7 @@ divya_prabandham_taniyans_config = {
14
  "collection_embedding_fn": "openai",
15
  "unit": "taniyan",
16
  "unit_field": "verse",
 
17
  "field_mapping": {
18
  "text": "pasuram_ta",
19
  "title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
@@ -23,7 +24,6 @@ divya_prabandham_taniyans_config = {
23
  "transliteration": "pasuram_en",
24
  "reference_link": "html_url",
25
  "author": "author",
26
- # "chapter_name": "prabandham_name",
27
  "relative_path": lambda doc: "-".join(
28
  filter(
29
  None,
@@ -32,13 +32,6 @@ divya_prabandham_taniyans_config = {
32
  ),
33
  },
34
  "metadata_fields": [
35
- {
36
- "name": "prabandham_code",
37
- "label": "Prabandham Code",
38
- "datatype": "str",
39
- "description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
40
- "is_unique": True,
41
- },
42
  {
43
  "name": "prabandham_name",
44
  "label": "Prabandham Name",
@@ -46,10 +39,21 @@ divya_prabandham_taniyans_config = {
46
  "description": "contains the prabandham name. e.g. `Thiruppallandu`",
47
  "show_as_filter": True,
48
  "component": "dropdown",
49
- "lov": lambda: [
50
- p.prabandham_name
51
- for p in nalayiram_helper.get_standardized_prabandham_names()
52
- ],
 
 
 
 
 
 
 
 
 
 
 
53
  "is_unique": True,
54
  },
55
  {
 
14
  "collection_embedding_fn": "openai",
15
  "unit": "taniyan",
16
  "unit_field": "verse",
17
+ "chapter_order": lambda: nalayiram_helper.get_prabandham_chapter_order_mapping(),
18
  "field_mapping": {
19
  "text": "pasuram_ta",
20
  "title": lambda doc: f"{doc.get('prabandham_name','')} Taniyan",
 
24
  "transliteration": "pasuram_en",
25
  "reference_link": "html_url",
26
  "author": "author",
 
27
  "relative_path": lambda doc: "-".join(
28
  filter(
29
  None,
 
32
  ),
33
  },
34
  "metadata_fields": [
 
 
 
 
 
 
 
35
  {
36
  "name": "prabandham_name",
37
  "label": "Prabandham Name",
 
39
  "description": "contains the prabandham name. e.g. `Thiruppallandu`",
40
  "show_as_filter": True,
41
  "component": "dropdown",
42
+ "lov": lambda: list(
43
+ set(
44
+ [
45
+ p.prabandham_name
46
+ for p in nalayiram_helper.get_standardized_prabandham_names()
47
+ ]
48
+ )
49
+ ),
50
+ "is_unique": True,
51
+ },
52
+ {
53
+ "name": "prabandham_code",
54
+ "label": "Prabandham Code",
55
+ "datatype": "str",
56
+ "description": "contains the short prabandham_code. e.g. `TPL` for `Thiruppallandu`",
57
  "is_unique": True,
58
  },
59
  {
nalayiram_helper.py CHANGED
@@ -71,6 +71,7 @@ def get_standardized_divya_desam_names() -> list[str]:
71
  ]
72
  return sorted(set([row["title"] for row in data]))
73
 
 
74
  def reorder_taniyan(collection):
75
  logger.info("reorder_taniyan: started")
76
 
@@ -160,7 +161,7 @@ def reorder_taniyan(collection):
160
 
161
 
162
  def delete_taniyan(collection):
163
- logger.info("delete_taniyan: started")
164
 
165
  # Fetch all docs (only ids + metadata needed)
166
  data = collection.get(include=["metadatas"])
@@ -169,7 +170,8 @@ def delete_taniyan(collection):
169
 
170
  # Collect ids where section_type starts with "taniyan"
171
  taniyan_ids = [
172
- ids[i] for i, meta in enumerate(metas)
 
173
  if meta.get("section_type", "").startswith("taniyan")
174
  ]
175
 
@@ -183,5 +185,37 @@ def delete_taniyan(collection):
183
  logger.info("delete_taniyan: finished")
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  if __name__ == "__main__":
187
  logger.info(get_standardized_azhwar_names())
 
71
  ]
72
  return sorted(set([row["title"] for row in data]))
73
 
74
+
75
  def reorder_taniyan(collection):
76
  logger.info("reorder_taniyan: started")
77
 
 
161
 
162
 
163
  def delete_taniyan(collection):
164
+ logger.info("delete_taniyan: started")
165
 
166
  # Fetch all docs (only ids + metadata needed)
167
  data = collection.get(include=["metadatas"])
 
170
 
171
  # Collect ids where section_type starts with "taniyan"
172
  taniyan_ids = [
173
+ ids[i]
174
+ for i, meta in enumerate(metas)
175
  if meta.get("section_type", "").startswith("taniyan")
176
  ]
177
 
 
185
  logger.info("delete_taniyan: finished")
186
 
187
 
188
+ def get_prabandham_chapter_order_mapping():
189
+ chapter_names = [
190
+ "Common",
191
+ "Thiruppallāṇḍu",
192
+ "Periyazvar Thirumozhi",
193
+ "Thiruppavai",
194
+ "Nachiyar Thirumozhi",
195
+ "Perumal Thirumozhi",
196
+ "Thiruchandavirutham",
197
+ "Thirumalai",
198
+ "Thirupalliezhuchi",
199
+ "Amalanadipiran",
200
+ "Kanninunchiruthambu",
201
+ "Periya Thirumozhi",
202
+ "Thirukurunthandakam",
203
+ "Thirunedumthandakam",
204
+ "Muthal Thiruvanthathi",
205
+ "Irandam Thiruvanthathi",
206
+ "Moonram Thiruvanthathi",
207
+ "Nanmukan Thiruvanthathi",
208
+ "Thiruvirutham",
209
+ "Thiruvasiriyam",
210
+ "Periya Thiruvanthathi",
211
+ "Thiruvezhukootrarikkai",
212
+ "Siriya Thirumadal",
213
+ "Periya Thirumadal",
214
+ "Thiruvaimozhi",
215
+ "Iramanusa Nootranthathi",
216
+ ]
217
+ section_dict = {name: i + 1 for i, name in enumerate(chapter_names)}
218
+ return section_dict
219
+
220
  if __name__ == "__main__":
221
  logger.info(get_standardized_azhwar_names())
tests/test_divya_prabandham_verse_fix.py CHANGED
@@ -1,13 +1,19 @@
1
  import json
2
  import logging
3
 
 
4
  from db import SanatanDatabase
5
  from metadata import MetadataFilter, MetadataWhereClause
6
 
7
 
8
  if __name__ == "__main__":
9
  logging.basicConfig()
10
- collection_name = "divya_prabandham"
11
  database = SanatanDatabase()
12
- database.delete_taniyans_in_divya_prabandham()
13
- database.fix_taniyans_in_divya_prabandham()
 
 
 
 
 
 
1
  import json
2
  import logging
3
 
4
+ from config import SanatanConfig
5
  from db import SanatanDatabase
6
  from metadata import MetadataFilter, MetadataWhereClause
7
 
8
 
9
  if __name__ == "__main__":
10
  logging.basicConfig()
11
+ collection_name = "divya_prabandham_taniyans"
12
  database = SanatanDatabase()
13
+ # database.delete_taniyans_in_divya_prabandham()
14
+ # database.fix_taniyans_in_divya_prabandham()
15
+ config = SanatanConfig()
16
+ c = config.get_scripture_by_name("divya_prabandham_taniyans")
17
+ database.build_global_index_for_scripture(c)
18
+ results = database.get(collection_name,None,n_results=2)
19
+ print(results)