File size: 6,519 Bytes
d434239
63d1774
d434239
3c0fb3e
90dc9aa
b11b469
75a5b18
fd1b271
 
3c0fb3e
b11b469
d023803
 
 
 
 
 
 
73a6587
 
 
 
 
 
 
d434239
 
 
d023803
a1180f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d434239
74c37c0
 
 
 
 
 
c7480d6
63d1774
90a01cc
0412bab
90a01cc
 
 
0412bab
 
 
 
 
75a5b18
63d1774
 
 
 
 
 
0412bab
faece1b
 
 
 
 
 
b11b469
faece1b
 
 
7b33394
 
faece1b
 
 
 
 
 
90a01cc
faece1b
b11b469
faece1b
 
 
 
3772fe4
faece1b
 
 
 
 
 
 
 
b11b469
 
 
 
 
 
 
 
 
faece1b
b11b469
 
 
 
 
 
 
faece1b
 
 
 
 
 
 
 
 
 
b11b469
faece1b
 
b11b469
faece1b
 
 
 
 
 
b11b469
 
 
faece1b
730c13e
b11b469
 
b24fcf4
51db0fa
 
 
 
b11b469
 
7b33394
 
b11b469
faece1b
3772fe4
 
 
 
 
 
 
3c0fb3e
b11b469
3c0fb3e
 
b11b469
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
from metadata import MetadataWhereClause
from typing import List, Dict

from modules.config import scripture_configurations
from modules.languages.transliterator import fn_transliterate


class SanatanConfig:
    """Lookup, validation and normalization helpers over the scripture configs.

    Every method reads ``self.scriptures``, a sequence of per-scripture
    configuration dicts. The keys this class reads from each dict are
    ``name``, ``collection_name``, ``metadata_fields``,
    ``collection_embedding_fn``, ``field_mapping``, ``title``, ``source``,
    ``language``, ``unit`` and ``unit_field``.
    """

    # Store location string; presumably the ChromaDB persistence directory —
    # confirm against the code that opens the store.
    dbStorePath: str = "./chromadb-store"
    # Scripture configuration dicts imported from modules.config.
    scriptures = scripture_configurations

    # Keys a scripture's ``field_mapping`` may contribute to a canonical doc;
    # any other mapping key is ignored by ``canonicalize_document``.
    _CANONICAL_KEYS = frozenset(
        {
            "_global_index",
            "id",
            "verse",
            "text",
            "title",
            "unit",
            "unit_index",
            "word_by_word_native",
            "translation",
            "transliteration",
            "transliteration_v2",  # support v2
            "reference_link",
            "author",
            "chapter_name",
            "relative_path",
            "location",
        }
    )

    def _find_scripture(self, key: str, value):
        """Return the first scripture whose ``key`` entry equals ``value``, or None."""
        return next((s for s in self.scriptures if s[key] == value), None)

    def get_scripture_by_collection(self, collection_name: str):
        """Return the configuration of the scripture stored in ``collection_name``.

        Raises:
            IndexError: if no configured scripture uses that collection name.
                The exception type is kept from the earlier list-indexing
                implementation so existing ``except IndexError`` callers work.
        """
        scripture = self._find_scripture("collection_name", collection_name)
        if scripture is None:
            raise IndexError(
                f"No scripture configured for collection [{collection_name}]"
            )
        return scripture

    def get_scripture_by_name(self, scripture_name: str):
        """Return the configuration of the scripture named ``scripture_name``.

        Raises:
            IndexError: if no configured scripture has that name (type kept
                for backward compatibility, see ``get_scripture_by_collection``).
        """
        scripture = self._find_scripture("name", scripture_name)
        if scripture is None:
            raise IndexError(f"No scripture configured with name [{scripture_name}]")
        return scripture

    def is_metadata_field_allowed(
        self, collection_name: str, metadata_where_clause: MetadataWhereClause
    ) -> bool:
        """Check that a where-clause only filters on fields the collection declares.

        The clause is walked recursively: its own ``filters`` are checked
        first, then each entry of ``groups`` in order.

        Returns:
            True when every filter references a declared metadata field.

        Raises:
            IndexError: if ``collection_name`` is not configured.
            Exception: on the first filter whose ``metadata_field`` is not
                declared in the scripture's ``metadata_fields``.
        """
        scripture = self.get_scripture_by_collection(collection_name=collection_name)
        allowed_fields = {field["name"] for field in scripture["metadata_fields"]}

        def validate_clause(clause: MetadataWhereClause):
            # validate direct filters
            for f in clause.filters or ():
                if f.metadata_field not in allowed_fields:
                    raise Exception(
                        f"metadata_field: [{f.metadata_field}] not allowed in collection [{collection_name}]. "
                        f"Here are the allowed fields with their descriptions: {scripture['metadata_fields']}"
                    )
            # recurse into groups
            for g in clause.groups or ():
                validate_clause(g)

        validate_clause(metadata_where_clause)
        return True

    def get_embedding_for_collection(self, collection_name: str):
        """Return the embedding-function identifier for a collection.

        The scripture's ``collection_embedding_fn`` entry is used when present;
        otherwise the default ``"hf"`` (HuggingFace sentence transformers,
        per the original comment) is returned.

        Raises:
            IndexError: if ``collection_name`` is not configured.
        """
        scripture = self.get_scripture_by_collection(collection_name)
        return scripture.get("collection_embedding_fn", "hf")

    def remove_callables(self, obj):
        """Return a copy of ``obj`` with callable values stripped out.

        Dicts and lists are rebuilt recursively, dropping every dict value or
        list item that is callable. Any other object (tuples included) is
        returned unchanged and is not traversed.
        """
        if isinstance(obj, dict):
            return {
                k: self.remove_callables(v) for k, v in obj.items() if not callable(v)
            }
        if isinstance(obj, list):
            return [self.remove_callables(v) for v in obj if not callable(v)]
        return obj

    def filter_scriptures_fields(self, fields_to_keep: List[str]) -> List[Dict]:
        """Return one dict per scripture holding only the ``fields_to_keep`` keys.

        Keys a scripture does not define are skipped, and callable values are
        removed from the result via ``remove_callables``.
        """
        filtered = [
            {k: s[k] for k in fields_to_keep if k in s} for s in self.scriptures
        ]
        return self.remove_callables(filtered)

    def canonicalize_document(
        self, scripture_name: str, document_text: str, metadata_doc: dict
    ):
        """Convert a scripture-specific document to a flattened canonical dict.

        Each entry of the scripture's ``field_mapping`` may be a string (a key
        looked up in ``metadata_doc``), a callable (invoked with
        ``metadata_doc``) or a nested dict of such entries. Only mapping keys
        listed in ``_CANONICAL_KEYS`` are copied into the result.

        After the mapping is applied, ``scripture_name``, ``scripture_title``,
        ``source``, ``language``, ``unit``, ``document``, ``verse``, ``id`` and
        ``_global_index`` are set from the config / ``metadata_doc`` and
        overwrite any mapped value with the same key.

        Args:
            scripture_name: ``name`` of a configured scripture.
            document_text: stored as ``document``; moved to ``text`` when the
                mapped text is missing (``None``) or ``"-"``.
            metadata_doc: metadata dict the mapping is resolved against.

        Returns:
            The canonical document dict.

        Raises:
            ValueError: if ``scripture_name`` is not configured, or if the
                resolved verse value cannot be converted by ``int``.
        """
        config = self._find_scripture("name", scripture_name)
        if config is None:
            raise ValueError(f"Unknown scripture: {scripture_name}")

        # Work on a shallow copy: the transliteration fallback below must not
        # be written back into the shared scripture configuration.
        mapping = dict(config.get("field_mapping", {}))

        # ------------------------------------
        # Inject transliteration_v2 if missing
        # ------------------------------------
        if "transliteration_v2" not in mapping:
            # NOTE(review): when the "text" mapping is a callable rather than
            # a key name, this lookup finds nothing and "" is transliterated.
            text_field = mapping.get("text", "text")  # fallback to "text"
            mapping["transliteration_v2"] = lambda doc: dict(
                fn_transliterate(doc.get(text_field, ""))
            )

        def resolve_field(field):
            """Resolve a field: string key, callable, or nested dict"""
            if isinstance(field, dict):
                # Recursively resolve nested dict values
                return {
                    subkey: resolve_field(subval) for subkey, subval in field.items()
                }
            if callable(field):
                # Best effort by design: a failing mapper yields None instead
                # of aborting the whole document.
                try:
                    return field(metadata_doc)
                except Exception:
                    return None
            if isinstance(field, str):
                return metadata_doc.get(field)
            return None

        canonical_doc = {
            key: resolve_field(field)
            for key, field in mapping.items()
            if key in self._CANONICAL_KEYS
        }

        # Add standard fields from config
        canonical_doc["scripture_name"] = config.get("name")
        canonical_doc["scripture_title"] = config.get("title")
        canonical_doc["source"] = config.get("source")
        canonical_doc["language"] = config.get("language")
        canonical_doc["unit"] = config.get("unit")
        canonical_doc["document"] = document_text

        # Handle text/document swap if text missing
        if canonical_doc.get("text") in (None, "-"):
            canonical_doc["text"] = canonical_doc["document"]
            canonical_doc["document"] = "-"

        # Verse resolution: "-" maps to -1, any other falsy value to 0.
        verse = resolve_field(config.get("unit_field", config.get("unit")))
        if verse == "-":
            canonical_doc["verse"] = -1
        else:
            canonical_doc["verse"] = int(verse) if verse else 0

        # ID and global index
        canonical_doc["id"] = resolve_field("id")
        canonical_doc["_global_index"] = resolve_field("_global_index")

        return canonical_doc

    def get_collection_name(self, scripture_name):
        """Return the ``collection_name`` configured for ``scripture_name``.

        Returns None when the scripture defines no ``collection_name``.

        Raises:
            ValueError: if ``scripture_name`` is not configured.
        """
        config = self._find_scripture("name", scripture_name)
        if config is None:
            raise ValueError(f"Unknown scripture: {scripture_name}")
        return config.get("collection_name")


if __name__ == "__main__":
    # Debug entry point: dump the full scripture configuration, then the
    # collection name of each scripture. The list used to be built and
    # discarded, so it produced no output.
    print(SanatanConfig.scriptures)
    print([scripture["collection_name"] for scripture in SanatanConfig.scriptures])