vikramvasudevan commited on
Commit
a1180f7
·
verified ·
1 Parent(s): 98834ab

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -11,3 +11,4 @@ wheels/
11
  .env
12
  chromadb-store/
13
  chromadb-store.zip
 
 
11
  .env
12
  chromadb-store/
13
  chromadb-store.zip
14
+ outputs/
README.md CHANGED
@@ -5,3 +5,20 @@ sdk: gradio
5
  sdk_version: 5.38.0
6
  python_version: 3.12
7
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  sdk_version: 5.38.0
6
  python_version: 3.12
7
  ---
8
+
9
+ ### Introduction
10
+ This is an Agentic-AI project that integrates all Hindu Sanatan Dharma scriptures into a single searchable platform.
11
+
12
+ ### Supported Channels
13
+ - Web (https://huggingface.co/spaces/vikramvasudevan/sanatan_ai)
14
+ - Android (bhashyam.ai app)
15
+
16
+ ### Start Web Server
17
+ - Run the following command from project root
18
+ > `uv run ./main.py`
19
+
20
+ ### Automated AI Evaluator
21
+ - Tests are defined in tests/test_config.py
22
+ - Run the following command from project root to execute the tests.
23
+ > `uv run -m tests.test_evaluator`
24
+ - Test Logs are generated under `{project-root}/outputs/tests` folder as neatly formatted md files.
app.py CHANGED
@@ -13,6 +13,7 @@ from langchain_core.messages.ai import AIMessageChunk, AIMessage
13
  from langchain_core.messages.system import SystemMessage
14
  from langchain_core.messages.tool import ToolMessage
15
 
 
16
  from config import SanatanConfig
17
  from db import SanatanDatabase
18
  from drive_downloader import ZipDownloader
@@ -63,277 +64,6 @@ def render_message_with_tooltip(content: str, max_chars=200):
63
  return f"<div title='{escape(content)}'>{short}</div>"
64
 
65
 
66
- thinking_verbs = [
67
- "thinking",
68
- "processing",
69
- "crunching data",
70
- "please wait",
71
- "just a few more seconds",
72
- "closing in",
73
- "analyzing",
74
- "reasoning",
75
- "computing",
76
- "synthesizing insight",
77
- "searching through the cosmos",
78
- "decoding ancient knowledge",
79
- "scanning the scriptures",
80
- "accessing divine memory",
81
- "gathering wisdom",
82
- "consulting the rishis",
83
- "listening to the ātmā",
84
- "channeling sacred energy",
85
- "unfolding the divine word",
86
- "meditating on the meaning",
87
- "reciting from memory",
88
- "traversing the Vedas",
89
- "seeking the inner light",
90
- "invoking paramārtha",
91
- "putting it all together",
92
- "digging deeper",
93
- "making sense of it",
94
- "connecting the dots",
95
- "almost there",
96
- "getting closer",
97
- "wrapping it up",
98
- "piecing it together",
99
- "swirling through verses",
100
- "diving into the ocean of knowledge",
101
- "lighting the lamp of understanding",
102
- "walking the path of inquiry",
103
- "aligning stars of context",
104
- ]
105
-
106
-
107
- async def chat_wrapper(
108
- message, history, thread_id, debug, preferred_language="English"
109
- ):
110
- if debug:
111
- async for chunk in chat_streaming(
112
- debug, message, history, thread_id, preferred_language=preferred_language
113
- ):
114
- yield chunk
115
- else:
116
- response = chat(
117
- debug, message, history, thread_id, preferred_language=preferred_language
118
- )
119
- yield response
120
-
121
-
122
- def chat(debug_mode, message, history, thread_id, preferred_language="English"):
123
- config = {"configurable": {"thread_id": thread_id}}
124
- response = graph.invoke(
125
- {
126
- "debug_mode": debug_mode,
127
- "messages": [{"role": "user", "content": message}],
128
- "language": preferred_language,
129
- },
130
- config=config,
131
- )
132
- return response["messages"][-1].content
133
-
134
-
135
- def add_node_to_tree(
136
- node_tree: list[str], node_label: str, tooltip: str = "no arguments to show"
137
- ) -> list[str]:
138
- if tooltip:
139
- tooltip = escape(tooltip).replace("'", "&apos;")
140
- node_with_tooltip = (
141
- f"<span class='node-label' title='{tooltip}'>{node_label}</span>"
142
- )
143
- else:
144
- node_with_tooltip = node_label
145
- node_tree[-1] = node_with_tooltip
146
- node_tree.append("<span class='spinner'>&nbsp;</span>")
147
- return node_tree
148
-
149
-
150
- def end_node_tree(node_tree: list[str]) -> list[str]:
151
- node_tree[-1] = "🏁"
152
- return node_tree
153
-
154
-
155
- def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
156
- return (
157
- tool_calls_buffer[tool_call_id]["args_str"]
158
- if tool_call_id in tool_calls_buffer
159
- and "args_str" in tool_calls_buffer[tool_call_id]
160
- else ""
161
- )
162
-
163
-
164
- async def chat_streaming(
165
- debug_mode: bool, message, history, thread_id, preferred_language="English"
166
- ):
167
- state = {
168
- "debug_mode": debug_mode,
169
- "messages": (history or []) + [{"role": "user", "content": message}],
170
- "language": preferred_language,
171
- }
172
- config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 15}
173
- start_time = time.time()
174
- streamed_response = ""
175
- final_response = ""
176
- # final_node = "validator"
177
-
178
- MAX_CONTENT = 500
179
-
180
- try:
181
- node_tree = ["🚩", "<span class='spinner'>&nbsp;</span>"]
182
-
183
- tool_calls_buffer = {}
184
-
185
- async for msg, metadata in graph.astream(
186
- state, config=config, stream_mode="messages"
187
- ):
188
- node = metadata.get("langgraph_node", "?")
189
- name = getattr(msg, "name", "-")
190
- if not isinstance(msg, ToolMessage):
191
- node_icon = "🧠"
192
- else:
193
- node_icon = "⚙️"
194
- node_label = f"{node}"
195
- tool_label = f"{name or ''}"
196
- if tool_label:
197
- node_label = node_label + f":{tool_label}"
198
- label = f"{node_icon} {node_label}"
199
- tooltip = ""
200
- if isinstance(msg, ToolMessage):
201
- tooltip = get_args_for_toolcall(tool_calls_buffer, msg.tool_call_id)
202
- # logger.info("tooltip = ", tooltip)
203
-
204
- # checking for -2 last but one. since last entry is always a spinner
205
- if node_tree[-2] != label:
206
- add_node_to_tree(node_tree, label, tooltip)
207
- full: str = escape(msg.content)
208
- truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
209
-
210
- def generate_processing_message():
211
- return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
212
-
213
- if (
214
- not isinstance(msg, ToolMessage)
215
- and not isinstance(msg, SystemMessage)
216
- and not isinstance(msg, AIMessageChunk)
217
- ):
218
- logger.info("msg = %s", msg)
219
- if isinstance(msg, ToolMessage):
220
- logger.debug("tool message = %s", msg)
221
-
222
- html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
223
- yield f"### { ' → '.join(node_tree)}\n{html}"
224
- elif isinstance(msg, AIMessageChunk):
225
-
226
- def truncate_middle(text, front=50, back=50):
227
- if not text:
228
- return ""
229
- if len(text) <= front + back:
230
- return text
231
- return f"{text[:front]}…{text[-back:]}".replace(
232
- "\n", ""
233
- ) # remove new lines.
234
-
235
- if not msg.content:
236
- # logger.warning("*** No Message Chunk!")
237
- yield f"### { " → ".join(node_tree)}\n{generate_processing_message()}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
238
- else:
239
- # Stream intermediate messages with transparent style
240
- # if node != final_node:
241
- streamed_response += msg.content
242
- yield f"### { ' → '.join(node_tree) }\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
243
- # else:
244
- # Buffer the final validated response instead of yielding
245
- final_response += msg.content
246
-
247
- if msg.tool_call_chunks:
248
- for tool_call_chunk in msg.tool_call_chunks:
249
- logger.debug("*** tool_call_chunk = ", tool_call_chunk)
250
- if tool_call_chunk["id"] is not None:
251
- tool_call_id = tool_call_chunk["id"]
252
-
253
- if tool_call_id not in tool_calls_buffer:
254
- tool_calls_buffer[tool_call_id] = {
255
- "name": "",
256
- "args_str": "",
257
- "id": tool_call_id,
258
- "type": "tool_call",
259
- }
260
-
261
- # Accumulate tool call name and arguments
262
- if tool_call_chunk["name"] is not None:
263
- tool_calls_buffer[tool_call_id]["name"] += tool_call_chunk[
264
- "name"
265
- ]
266
- if tool_call_chunk["args"] is not None:
267
- tool_calls_buffer[tool_call_id][
268
- "args_str"
269
- ] += tool_call_chunk["args"]
270
- else:
271
- logger.debug("message = ", type(msg), msg.content[:100])
272
- full: str = escape(msg.content)
273
- truncated = (
274
- (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
275
- )
276
- html = (
277
- f"<div class='thinking-bubble'><em>🤔 {random.choice(thinking_verbs)} ...</em></div>"
278
- f"<div style='opacity: 0.1'>"
279
- f"<strong>Telling myself:</strong> {truncated or '...'}"
280
- f"</div>"
281
- )
282
- yield f"### { " → ".join(node_tree)}\n{html}"
283
- if getattr(msg, "tool_calls", []):
284
- logger.info("ELSE::tool_calls = %s", msg.tool_calls)
285
-
286
- node_tree[-1] = "✅"
287
- end_time = time.time()
288
- duration = end_time - start_time
289
- final_response = (
290
- f"\n{final_response}" f"\n\n⏱️ Processed in {duration:.2f} seconds"
291
- )
292
- buffer = f"### {' → '.join(node_tree)}\n"
293
- yield buffer
294
- for c in final_response:
295
- buffer += c
296
- yield buffer
297
- await asyncio.sleep(0.0005)
298
-
299
- logger.debug("************************************")
300
- # Now, you can process the complete tool calls from the buffer
301
- for tool_call_id, accumulated_tool_call in tool_calls_buffer.items():
302
- # Attempt to parse arguments only if the 'args_str' isn't empty
303
- if accumulated_tool_call["args_str"]:
304
- try:
305
- parsed_args = json.loads(accumulated_tool_call["args_str"])
306
- logger.debug(f"Tool Name: {accumulated_tool_call['name']}")
307
- logger.debug(f"Tool Arguments: {parsed_args}")
308
- except json.JSONDecodeError:
309
- logger.debug(
310
- f"Partial arguments for tool {accumulated_tool_call['name']}: {accumulated_tool_call['args_str']}"
311
- )
312
- except asyncio.CancelledError:
313
- logger.warning("⚠️ Request cancelled by user")
314
- node_tree = end_node_tree(node_tree=node_tree)
315
- yield (
316
- f"### {' → '.join(node_tree)}"
317
- "\n⚠️⚠️⚠️ Request cancelled by user"
318
- "\nhere is what I got so far ...\n"
319
- f"\n{streamed_response}"
320
- )
321
- # Important: re-raise if you want upstream to also know
322
- # raise
323
- return
324
- except Exception as e:
325
- logger.error("❌❌❌ Error processing request: %s", e)
326
- traceback.print_exc()
327
- node_tree = end_node_tree(node_tree=node_tree)
328
- yield (
329
- f"### { " → ".join(node_tree)}"
330
- f"\n❌❌❌ Error processing request : {str(e)}"
331
- "\nhere is what I got so far ...\n"
332
- f"\n{streamed_response}"
333
- )
334
- return
335
-
336
-
337
  # UI Elements
338
  thread_id = gr.State(init_session)
339
 
 
13
  from langchain_core.messages.system import SystemMessage
14
  from langchain_core.messages.tool import ToolMessage
15
 
16
+ from chat_utils import chat_wrapper
17
  from config import SanatanConfig
18
  from db import SanatanDatabase
19
  from drive_downloader import ZipDownloader
 
64
  return f"<div title='{escape(content)}'>{short}</div>"
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # UI Elements
68
  thread_id = gr.State(init_session)
69
 
chat_utils.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import asyncio
4
+ import logging
5
+ import time
6
+ import traceback
7
+ from html import escape
8
+ from langchain_core.messages.ai import AIMessageChunk
9
+ from langchain_core.messages.system import SystemMessage
10
+ from langchain_core.messages.tool import ToolMessage
11
+
12
+ from graph_helper import generate_graph
13
+
14
+ # Logging
15
+ logging.basicConfig()
16
+ logger = logging.getLogger()
17
+ logger.setLevel(logging.INFO)
18
+
19
+ thinking_verbs = [
20
+ "thinking",
21
+ "processing",
22
+ "crunching data",
23
+ "please wait",
24
+ "just a few more seconds",
25
+ "closing in",
26
+ "analyzing",
27
+ "reasoning",
28
+ "computing",
29
+ "synthesizing insight",
30
+ "searching through the cosmos",
31
+ "decoding ancient knowledge",
32
+ "scanning the scriptures",
33
+ "accessing divine memory",
34
+ "gathering wisdom",
35
+ "consulting the rishis",
36
+ "listening to the ātmā",
37
+ "channeling sacred energy",
38
+ "unfolding the divine word",
39
+ "meditating on the meaning",
40
+ "reciting from memory",
41
+ "traversing the Vedas",
42
+ "seeking the inner light",
43
+ "invoking paramārtha",
44
+ "putting it all together",
45
+ "digging deeper",
46
+ "making sense of it",
47
+ "connecting the dots",
48
+ "almost there",
49
+ "getting closer",
50
+ "wrapping it up",
51
+ "piecing it together",
52
+ "swirling through verses",
53
+ "diving into the ocean of knowledge",
54
+ "lighting the lamp of understanding",
55
+ "walking the path of inquiry",
56
+ "aligning stars of context",
57
+ ]
58
+
59
+ graph = generate_graph()
60
+
61
+ def add_node_to_tree(
62
+ node_tree: list[str], node_label: str, tooltip: str = "no arguments to show"
63
+ ) -> list[str]:
64
+ if tooltip:
65
+ tooltip = escape(tooltip).replace("'", "&apos;")
66
+ node_with_tooltip = (
67
+ f"<span class='node-label' title='{tooltip}'>{node_label}</span>"
68
+ )
69
+ else:
70
+ node_with_tooltip = node_label
71
+ node_tree[-1] = node_with_tooltip
72
+ node_tree.append("<span class='spinner'>&nbsp;</span>")
73
+ return node_tree
74
+
75
+
76
+ def end_node_tree(node_tree: list[str]) -> list[str]:
77
+ node_tree[-1] = "🏁"
78
+ return node_tree
79
+
80
+
81
+ def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
82
+ return (
83
+ tool_calls_buffer[tool_call_id]["args_str"]
84
+ if tool_call_id in tool_calls_buffer
85
+ and "args_str" in tool_calls_buffer[tool_call_id]
86
+ else ""
87
+ )
88
+
89
+
90
+ async def chat_wrapper(
91
+ message, history, thread_id, debug, preferred_language="English"
92
+ ):
93
+ if debug:
94
+ async for chunk in chat_streaming(
95
+ debug, message, history, thread_id, preferred_language=preferred_language
96
+ ):
97
+ yield chunk
98
+ else:
99
+ response = chat(
100
+ debug, message, history, thread_id, preferred_language=preferred_language
101
+ )
102
+ yield response
103
+
104
+
105
+ def chat(debug_mode, message, history, thread_id, preferred_language="English"):
106
+ config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 30}
107
+ response = graph.invoke(
108
+ {
109
+ "debug_mode": debug_mode,
110
+ "messages": [{"role": "user", "content": message}],
111
+ "language": preferred_language,
112
+ },
113
+ config=config,
114
+ )
115
+ return response["messages"][-1].content
116
+
117
+ async def chat_streaming(
118
+ debug_mode: bool, message, history, thread_id, preferred_language="English"
119
+ ):
120
+ state = {
121
+ "debug_mode": debug_mode,
122
+ "messages": (history or []) + [{"role": "user", "content": message}],
123
+ "language": preferred_language,
124
+ }
125
+ config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 30}
126
+ start_time = time.time()
127
+ streamed_response = ""
128
+ final_response = ""
129
+ # final_node = "validator"
130
+
131
+ MAX_CONTENT = 500
132
+
133
+ try:
134
+ node_tree = ["🚩", "<span class='spinner'>&nbsp;</span>"]
135
+
136
+ tool_calls_buffer = {}
137
+
138
+ async for msg, metadata in graph.astream(
139
+ state, config=config, stream_mode="messages"
140
+ ):
141
+ node = metadata.get("langgraph_node", "?")
142
+ name = getattr(msg, "name", "-")
143
+ if not isinstance(msg, ToolMessage):
144
+ node_icon = "🧠"
145
+ else:
146
+ node_icon = "⚙️"
147
+ node_label = f"{node}"
148
+ tool_label = f"{name or ''}"
149
+ if tool_label:
150
+ node_label = node_label + f":{tool_label}"
151
+ label = f"{node_icon} {node_label}"
152
+ tooltip = ""
153
+ if isinstance(msg, ToolMessage):
154
+ tooltip = get_args_for_toolcall(tool_calls_buffer, msg.tool_call_id)
155
+ # logger.info("tooltip = ", tooltip)
156
+
157
+ # checking for -2 last but one. since last entry is always a spinner
158
+ if node_tree[-2] != label:
159
+ add_node_to_tree(node_tree, label, tooltip)
160
+ full: str = escape(msg.content)
161
+ truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
162
+
163
+ def generate_processing_message():
164
+ return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
165
+
166
+ if (
167
+ not isinstance(msg, ToolMessage)
168
+ and not isinstance(msg, SystemMessage)
169
+ and not isinstance(msg, AIMessageChunk)
170
+ ):
171
+ logger.info("msg = %s", msg)
172
+ if isinstance(msg, ToolMessage):
173
+ logger.debug("tool message = %s", msg)
174
+
175
+ html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
176
+ yield f"### { ' → '.join(node_tree)}\n{html}"
177
+ elif isinstance(msg, AIMessageChunk):
178
+
179
+ def truncate_middle(text, front=50, back=50):
180
+ if not text:
181
+ return ""
182
+ if len(text) <= front + back:
183
+ return text
184
+ return f"{text[:front]}…{text[-back:]}".replace(
185
+ "\n", ""
186
+ ) # remove new lines.
187
+
188
+ if not msg.content:
189
+ # logger.warning("*** No Message Chunk!")
190
+ yield f"### { " → ".join(node_tree)}\n{generate_processing_message()}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
191
+ else:
192
+ # Stream intermediate messages with transparent style
193
+ # if node != final_node:
194
+ streamed_response += msg.content
195
+ yield f"### { ' → '.join(node_tree) }\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
196
+ # else:
197
+ # Buffer the final validated response instead of yielding
198
+ final_response += msg.content
199
+
200
+ if msg.tool_call_chunks:
201
+ for tool_call_chunk in msg.tool_call_chunks:
202
+ logger.debug("*** tool_call_chunk = ", tool_call_chunk)
203
+ if tool_call_chunk["id"] is not None:
204
+ tool_call_id = tool_call_chunk["id"]
205
+
206
+ if tool_call_id not in tool_calls_buffer:
207
+ tool_calls_buffer[tool_call_id] = {
208
+ "name": "",
209
+ "args_str": "",
210
+ "id": tool_call_id,
211
+ "type": "tool_call",
212
+ }
213
+
214
+ # Accumulate tool call name and arguments
215
+ if tool_call_chunk["name"] is not None:
216
+ tool_calls_buffer[tool_call_id]["name"] += tool_call_chunk[
217
+ "name"
218
+ ]
219
+ if tool_call_chunk["args"] is not None:
220
+ tool_calls_buffer[tool_call_id][
221
+ "args_str"
222
+ ] += tool_call_chunk["args"]
223
+ else:
224
+ logger.debug("message = ", type(msg), msg.content[:100])
225
+ full: str = escape(msg.content)
226
+ truncated = (
227
+ (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
228
+ )
229
+ html = (
230
+ f"<div class='thinking-bubble'><em>🤔 {random.choice(thinking_verbs)} ...</em></div>"
231
+ f"<div style='opacity: 0.1'>"
232
+ f"<strong>Telling myself:</strong> {truncated or '...'}"
233
+ f"</div>"
234
+ )
235
+ yield f"### { " → ".join(node_tree)}\n{html}"
236
+ if getattr(msg, "tool_calls", []):
237
+ logger.info("ELSE::tool_calls = %s", msg.tool_calls)
238
+
239
+ node_tree[-1] = "✅"
240
+ end_time = time.time()
241
+ duration = end_time - start_time
242
+ final_response = (
243
+ f"\n{final_response}" f"\n\n⏱️ Processed in {duration:.2f} seconds"
244
+ )
245
+ buffer = f"### {' → '.join(node_tree)}\n"
246
+ yield buffer
247
+ for c in final_response:
248
+ buffer += c
249
+ yield buffer
250
+ await asyncio.sleep(0.0005)
251
+
252
+ logger.debug("************************************")
253
+ # Now, you can process the complete tool calls from the buffer
254
+ for tool_call_id, accumulated_tool_call in tool_calls_buffer.items():
255
+ # Attempt to parse arguments only if the 'args_str' isn't empty
256
+ if accumulated_tool_call["args_str"]:
257
+ try:
258
+ parsed_args = json.loads(accumulated_tool_call["args_str"])
259
+ logger.debug(f"Tool Name: {accumulated_tool_call['name']}")
260
+ logger.debug(f"Tool Arguments: {parsed_args}")
261
+ except json.JSONDecodeError:
262
+ logger.debug(
263
+ f"Partial arguments for tool {accumulated_tool_call['name']}: {accumulated_tool_call['args_str']}"
264
+ )
265
+ except asyncio.CancelledError:
266
+ logger.warning("⚠️ Request cancelled by user")
267
+ node_tree = end_node_tree(node_tree=node_tree)
268
+ yield (
269
+ f"### {' → '.join(node_tree)}"
270
+ "\n⚠️⚠️⚠️ Request cancelled by user"
271
+ "\nhere is what I got so far ...\n"
272
+ f"\n{streamed_response}"
273
+ )
274
+ # Important: re-raise if you want upstream to also know
275
+ # raise
276
+ return
277
+ except Exception as e:
278
+ logger.error("❌❌❌ Error processing request: %s", e)
279
+ traceback.print_exc()
280
+ node_tree = end_node_tree(node_tree=node_tree)
281
+ yield (
282
+ f"### { " → ".join(node_tree)}"
283
+ f"\n❌❌❌ Error processing request : {str(e)}"
284
+ "\nhere is what I got so far ...\n"
285
+ f"\n{streamed_response}"
286
+ )
287
+ return
config.py CHANGED
@@ -246,9 +246,10 @@ class SanatanConfig:
246
  {
247
  "name": "verse",
248
  "datatype": "int",
 
249
  "description": (
250
- "Absolute verse number or pasuram number."
251
- "Use it only when a specific prabandham name is NOT mentioned in the user query."
252
  "For e.g. 'Give me pasuram 1176'"
253
  ),
254
  },
@@ -539,13 +540,23 @@ class SanatanConfig:
539
  self, collection_name: str, metadata_where_clause: MetadataWhereClause
540
  ):
541
  scripture = self.get_scripture_by_collection(collection_name=collection_name)
542
- for filter in metadata_where_clause.filters:
543
- if filter.metadata_field not in [
544
- field["name"] for field in scripture["metadata_fields"]
545
- ]:
546
- raise Exception(
547
- f"metadata_field: [{filter.metadata_field}] not allowed in collection [{collection_name}]. Here are the allowed fields with their descriptions: {scripture["metadata_fields"]}"
548
- )
 
 
 
 
 
 
 
 
 
 
549
  return True
550
 
551
  def get_embedding_for_collection(self, collection_name: str):
 
246
  {
247
  "name": "verse",
248
  "datatype": "int",
249
+ "is_unique" : True,
250
  "description": (
251
+ "Absolute verse number or pasuram number. Each verse has a unique number."
252
+ # "Use it only when a specific prabandham name is NOT mentioned in the user query."
253
  "For e.g. 'Give me pasuram 1176'"
254
  ),
255
  },
 
540
  self, collection_name: str, metadata_where_clause: MetadataWhereClause
541
  ):
542
  scripture = self.get_scripture_by_collection(collection_name=collection_name)
543
+ allowed_fields = [field["name"] for field in scripture["metadata_fields"]]
544
+
545
+ def validate_clause(clause: MetadataWhereClause):
546
+ # validate direct filters
547
+ if clause.filters:
548
+ for f in clause.filters:
549
+ if f.metadata_field not in allowed_fields:
550
+ raise Exception(
551
+ f"metadata_field: [{f.metadata_field}] not allowed in collection [{collection_name}]. "
552
+ f"Here are the allowed fields with their descriptions: {scripture['metadata_fields']}"
553
+ )
554
+ # recurse into groups
555
+ if clause.groups:
556
+ for g in clause.groups:
557
+ validate_clause(g)
558
+
559
+ validate_clause(metadata_where_clause)
560
  return True
561
 
562
  def get_embedding_for_collection(self, collection_name: str):
db.py CHANGED
@@ -1,4 +1,6 @@
1
  import json
 
 
2
  import chromadb
3
  import re, unicodedata
4
  from config import SanatanConfig
@@ -33,18 +35,120 @@ class SanatanDatabase:
33
  metadatas=metadatas,
34
  )
35
 
36
- def search(self, collection_name: str, query: str, n_results=2):
37
- logger.info("Vector Semantic Search for [%s] in [%s]", query, collection_name)
 
 
 
 
 
 
 
 
 
 
 
38
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
39
- try:
40
- response = collection.query(
41
- query_embeddings=get_embedding(
42
- [query], SanatanConfig().get_embedding_for_collection(collection_name)
43
- ),
44
- # query_texts=[query],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  n_results=n_results,
46
- include=["metadatas","documents","distances"],
47
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  except Exception as e:
49
  logger.error("Error in search: %s", e)
50
  return chromadb.QueryResult(
@@ -53,42 +157,71 @@ class SanatanDatabase:
53
  metadatas=[],
54
  distances=[],
55
  )
56
-
57
  validated_response = validate_relevance_queryresult(query, response)
58
 
59
- return validated_response["result"]
 
 
 
 
 
 
60
 
61
  def search_for_literal(
62
- self, collection_name: str, literal_to_search_for: str, n_results=2
 
 
 
 
63
  ):
64
  logger.info(
65
- "Searching literally for [%s] in [%s]",
66
  literal_to_search_for,
67
  collection_name,
 
68
  )
 
 
 
 
 
 
69
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
70
 
71
  def normalize(text):
72
  return unicodedata.normalize("NFKC", text).lower()
73
 
74
  # 1. Try native contains
75
- response = collection.query(
76
- query_embeddings=get_embedding(
77
- [literal_to_search_for], SanatanConfig().get_embedding_for_collection(collection_name)
 
 
78
  ),
79
  where_document={"$contains": literal_to_search_for},
80
- n_results=n_results,
81
  )
82
 
83
  if response["documents"] and any(response["documents"]):
84
- return response
 
 
 
 
85
 
86
  # 2. Regex fallback (normalized)
87
  logger.info("⚠ No luck. Falling back to regex for %s", literal_to_search_for)
88
  regex = re.compile(re.escape(normalize(literal_to_search_for)))
89
  logger.info("regex = %s", regex)
90
 
91
- all_docs = collection.get()
 
 
 
 
 
 
92
  matched_docs = []
93
 
94
  for doc_list, metadata_list, doc_id_list in zip(
@@ -135,36 +268,13 @@ class SanatanDatabase:
135
  if len(matched_docs) >= n_results:
136
  break
137
 
138
- return {
139
- "documents": [[d["document"] for d in matched_docs]],
140
- "ids": [[d["id"] for d in matched_docs]],
141
- "metadatas": [[d["metadata"] for d in matched_docs]],
142
- }
143
-
144
- def search_by_metadata(
145
- self,
146
- collection_name: str,
147
- query: str,
148
- metadata_where_clause: MetadataWhereClause,
149
- n_results=2,
150
- ):
151
- """Search by a metadata field inside a specific collection using a specific operator. For instance {"azhwar_name": {"$in": "Thirumangai Azhwar"}}"""
152
- logger.info(
153
- "Searching by metadata for [%s] in [%s] with metadata_filters=%s",
154
- query,
155
- collection_name,
156
- metadata_where_clause,
157
  )
158
- collection = self.chroma_client.get_or_create_collection(name=collection_name)
159
- response = collection.query(
160
- query_embeddings=get_embedding(
161
- [query], SanatanConfig().get_embedding_for_collection(collection_name)
162
- ),
163
- where=metadata_where_clause.to_chroma_where(),
164
- # query_texts=[query],
165
- n_results=n_results,
166
- )
167
- return response
168
 
169
  def count(self, collection_name: str):
170
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
@@ -177,12 +287,12 @@ class SanatanDatabase:
177
  count = self.count(collection_name=scripture["collection_name"])
178
  if count == 0:
179
  raise Exception(f"No data in collection {scripture["collection_name"]}")
180
-
181
  def reembed_collection_openai(self, collection_name: str, batch_size: int = 50):
182
  """
183
  Deletes and recreates a Chroma collection with OpenAI text-embedding-3-large embeddings.
184
  All existing documents are re-embedded and inserted into the new collection.
185
-
186
  Args:
187
  collection_name: The name of the collection to delete/recreate.
188
  batch_size: Number of documents to process per batch.
@@ -195,7 +305,7 @@ class SanatanDatabase:
195
  metadatas = old_data["metadatas"]
196
  ids = old_data["ids"]
197
  print(f"Fetched {len(documents)} documents from old collection.")
198
-
199
  # Step 2: Delete old collection
200
  # self.chroma_client.delete_collection(collection_name)
201
  # print(f"Deleted old collection '{collection_name}'.")
@@ -208,13 +318,17 @@ class SanatanDatabase:
208
  name=f"{collection_name}_openai",
209
  embedding_function=None, # embeddings will be provided manually
210
  )
211
- print(f"Created new collection '{collection_name}_openai' with embedding_dim=3072.")
 
 
212
 
213
  # Step 4: Re-embed and insert documents in batches
214
- for i in tqdm(range(0, len(documents), batch_size), desc="Re-embedding batches"):
215
- batch_docs = documents[i:i+batch_size]
216
- batch_metadatas = metadatas[i:i+batch_size]
217
- batch_ids = ids[i:i+batch_size]
 
 
218
 
219
  embeddings = get_embedding(batch_docs, backend="openai")
220
 
@@ -222,6 +336,6 @@ class SanatanDatabase:
222
  ids=batch_ids,
223
  documents=batch_docs,
224
  metadatas=batch_metadatas,
225
- embeddings=embeddings
226
  )
227
- print("All documents re-embedded and added to new collection successfully!")
 
1
  import json
2
+ import random
3
+ from typing import Literal
4
  import chromadb
5
  import re, unicodedata
6
  from config import SanatanConfig
 
35
  metadatas=metadatas,
36
  )
37
 
38
+ def fetch_random_data(
39
+ self,
40
+ collection_name: str,
41
+ metadata_where_clause: MetadataWhereClause = None,
42
+ n_results=1,
43
+ ):
44
+ # fetch all documents once
45
+ logger.info(
46
+ "getting %d random verses from [%s] | metadata_where_clause = %s",
47
+ n_results,
48
+ collection_name,
49
+ metadata_where_clause,
50
+ )
51
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
52
+ data = collection.get(
53
+ where=(
54
+ metadata_where_clause.to_chroma_where()
55
+ if metadata_where_clause is not None
56
+ else None
57
+ )
58
+ )
59
+ docs = data["documents"] # list of all verse texts
60
+ ids = data["ids"]
61
+ metas = data["metadatas"]
62
+
63
+ if not docs:
64
+ logger.warning("No data found! - data=%s", data)
65
+ return chromadb.QueryResult(ids=[], documents=[], metadatas=[])
66
+
67
+ # pick k random indices
68
+ indices = random.sample(range(len(docs)), k=min(n_results, len(docs)))
69
+
70
+ return chromadb.QueryResult(
71
+ ids=[ids[i] for i in indices],
72
+ documents=[docs[i] for i in indices],
73
+ metadatas=[metas[i] for i in indices],
74
+ )
75
+
76
+ def search(
77
+ self,
78
+ collection_name: str,
79
+ query: str = None,
80
+ metadata_where_clause: MetadataWhereClause = None,
81
+ n_results=2,
82
+ search_type: Literal["semantic", "literal", "random"] = "semantic",
83
+ ):
84
+ logger.info(
85
+ "Search for [%s] in [%s]| metadata_where_clause=%s | search_type=%s | n_results=%d",
86
+ query,
87
+ collection_name,
88
+ metadata_where_clause,
89
+ search_type,
90
+ n_results,
91
+ )
92
+ if search_type == "semantic":
93
+ return self.search_semantic(
94
+ collection_name=collection_name,
95
+ query=query,
96
+ metadata_where_clause=metadata_where_clause,
97
+ n_results=n_results,
98
+ )
99
+ elif search_type == "literal":
100
+ return self.search_for_literal(
101
+ collection_name=collection_name,
102
+ literal_to_search_for=query,
103
+ metadata_where_clause=metadata_where_clause,
104
  n_results=n_results,
 
105
  )
106
+ else:
107
+ # random
108
+ return self.fetch_random_data(
109
+ collection_name=collection_name,
110
+ metadata_where_clause=metadata_where_clause,
111
+ n_results=n_results,
112
+ )
113
+
114
+ def search_semantic(
115
+ self,
116
+ collection_name: str,
117
+ query: str | None = None,
118
+ metadata_where_clause: MetadataWhereClause | None = None,
119
+ n_results=2,
120
+ ):
121
+ logger.info(
122
+ "Vector Semantic Search for [%s] in [%s] | metadata_where_clause = %s",
123
+ query,
124
+ collection_name,
125
+ metadata_where_clause,
126
+ )
127
+ collection = self.chroma_client.get_or_create_collection(name=collection_name)
128
+ try:
129
+ q = query.strip() if query is not None else ""
130
+ if not q:
131
+ # fallback: fetch random verse
132
+ return self.fetch_random_data(
133
+ collection_name=collection_name,
134
+ metadata_where_clause=metadata_where_clause,
135
+ n_results=n_results,
136
+ )
137
+ else:
138
+ response = collection.query(
139
+ query_embeddings=get_embedding(
140
+ [query],
141
+ SanatanConfig().get_embedding_for_collection(collection_name),
142
+ ),
143
+ # query_texts=[query],
144
+ n_results=n_results,
145
+ where=(
146
+ metadata_where_clause.to_chroma_where()
147
+ if metadata_where_clause is not None
148
+ else None
149
+ ),
150
+ include=["metadatas", "documents", "distances"],
151
+ )
152
  except Exception as e:
153
  logger.error("Error in search: %s", e)
154
  return chromadb.QueryResult(
 
157
  metadatas=[],
158
  distances=[],
159
  )
160
+
161
  validated_response = validate_relevance_queryresult(query, response)
162
 
163
+ logger.info(
164
+ "status = %s | reason= %s",
165
+ validated_response.status,
166
+ validated_response.reason,
167
+ )
168
+
169
+ return validated_response.result
170
 
171
  def search_for_literal(
172
+ self,
173
+ collection_name: str,
174
+ literal_to_search_for: str | None = None,
175
+ metadata_where_clause: MetadataWhereClause | None = None,
176
+ n_results=2,
177
  ):
178
  logger.info(
179
+ "Searching literally for [%s] in [%s] | metadata_where_clause = %s",
180
  literal_to_search_for,
181
  collection_name,
182
+ metadata_where_clause,
183
  )
184
+ if literal_to_search_for is None or literal_to_search_for.strip() == "":
185
+ logger.warning("Nothing to search literally.")
186
+ # raise Exception("literal_to_search_for cannot be None or empty for a literal search!")
187
+ return self.fetch_random_data(
188
+ collection_name=collection_name,
189
+ )
190
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
191
 
192
  def normalize(text):
193
  return unicodedata.normalize("NFKC", text).lower()
194
 
195
  # 1. Try native contains
196
+ response = collection.get(
197
+ where=(
198
+ metadata_where_clause.to_chroma_where()
199
+ if metadata_where_clause is not None
200
+ else None
201
  ),
202
  where_document={"$contains": literal_to_search_for},
203
+ limit=n_results,
204
  )
205
 
206
  if response["documents"] and any(response["documents"]):
207
+ return chromadb.QueryResult(
208
+ ids=response["ids"],
209
+ documents=response["documents"],
210
+ metadatas=response["metadatas"],
211
+ )
212
 
213
  # 2. Regex fallback (normalized)
214
  logger.info("⚠ No luck. Falling back to regex for %s", literal_to_search_for)
215
  regex = re.compile(re.escape(normalize(literal_to_search_for)))
216
  logger.info("regex = %s", regex)
217
 
218
+ all_docs = collection.get(
219
+ where=(
220
+ metadata_where_clause.to_chroma_where()
221
+ if metadata_where_clause is not None
222
+ else None
223
+ ),
224
+ )
225
  matched_docs = []
226
 
227
  for doc_list, metadata_list, doc_id_list in zip(
 
268
  if len(matched_docs) >= n_results:
269
  break
270
 
271
+ return chromadb.QueryResult(
272
+ {
273
+ "documents": [[d["document"] for d in matched_docs]],
274
+ "ids": [[d["id"] for d in matched_docs]],
275
+ "metadatas": [[d["metadata"] for d in matched_docs]],
276
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  )
 
 
 
 
 
 
 
 
 
 
278
 
279
  def count(self, collection_name: str):
280
  collection = self.chroma_client.get_or_create_collection(name=collection_name)
 
287
  count = self.count(collection_name=scripture["collection_name"])
288
  if count == 0:
289
  raise Exception(f"No data in collection {scripture["collection_name"]}")
290
+
291
  def reembed_collection_openai(self, collection_name: str, batch_size: int = 50):
292
  """
293
  Deletes and recreates a Chroma collection with OpenAI text-embedding-3-large embeddings.
294
  All existing documents are re-embedded and inserted into the new collection.
295
+
296
  Args:
297
  collection_name: The name of the collection to delete/recreate.
298
  batch_size: Number of documents to process per batch.
 
305
  metadatas = old_data["metadatas"]
306
  ids = old_data["ids"]
307
  print(f"Fetched {len(documents)} documents from old collection.")
308
+
309
  # Step 2: Delete old collection
310
  # self.chroma_client.delete_collection(collection_name)
311
  # print(f"Deleted old collection '{collection_name}'.")
 
318
  name=f"{collection_name}_openai",
319
  embedding_function=None, # embeddings will be provided manually
320
  )
321
+ print(
322
+ f"Created new collection '{collection_name}_openai' with embedding_dim=3072."
323
+ )
324
 
325
  # Step 4: Re-embed and insert documents in batches
326
+ for i in tqdm(
327
+ range(0, len(documents), batch_size), desc="Re-embedding batches"
328
+ ):
329
+ batch_docs = documents[i : i + batch_size]
330
+ batch_metadatas = metadatas[i : i + batch_size]
331
+ batch_ids = ids[i : i + batch_size]
332
 
333
  embeddings = get_embedding(batch_docs, backend="openai")
334
 
 
336
  ids=batch_ids,
337
  documents=batch_docs,
338
  metadatas=batch_metadatas,
339
+ embeddings=embeddings,
340
  )
341
+ print("All documents re-embedded and added to new collection successfully!")
metadata.py CHANGED
@@ -1,5 +1,5 @@
1
  from pydantic import BaseModel
2
- from typing import Literal, Union, List, Dict
3
 
4
 
5
  AllowedOps = Literal["$in", "$eq", "$gt", "$gte", "$lt", "$lte", "$ne", "$nin"]
@@ -11,19 +11,47 @@ class MetadataFilter(BaseModel):
11
  metadata_value: Union[str, int, float, List[Union[str, int, float]]]
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  class MetadataWhereClause(BaseModel):
15
- filters: List[MetadataFilter]
 
 
16
 
17
  def to_chroma_where(self) -> Dict:
18
- """Convert list of MetadataFilter into Chroma-compatible where clause with AND logic."""
19
- if not self.filters:
 
 
 
 
 
 
 
 
 
 
 
20
  return {}
21
- if len(self.filters) == 1:
22
- f = self.filters[0]
23
- return {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
24
- return {
25
- "$and": [
26
- {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
27
- for f in self.filters
28
- ]
29
- }
 
1
  from pydantic import BaseModel
2
+ from typing import Literal, Optional, Union, List, Dict
3
 
4
 
5
  AllowedOps = Literal["$in", "$eq", "$gt", "$gte", "$lt", "$lte", "$ne", "$nin"]
 
11
  metadata_value: Union[str, int, float, List[Union[str, int, float]]]
12
 
13
 
14
+ # class MetadataWhereClause(BaseModel):
15
+ # filters: List[MetadataFilter]
16
+ # conditional_operator: Literal["$and", "$or"] = "$and"
17
+
18
+ # def to_chroma_where(self) -> Dict:
19
+ # """Convert list of MetadataFilter into Chroma-compatible where clause with AND logic."""
20
+ # if not self.filters:
21
+ # return {}
22
+ # if len(self.filters) == 1:
23
+ # f = self.filters[0]
24
+ # return {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
25
+ # return {
26
+ # self.conditional_operator: [
27
+ # {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
28
+ # for f in self.filters
29
+ # ]
30
+ # }
31
+
32
class MetadataWhereClause(BaseModel):
    """Recursive, Chroma-compatible metadata filter.

    A clause holds direct ``filters`` (field/operator/value triples) and/or
    nested ``groups`` (sub-clauses), combined with ``conditional_operator``
    ("$and" or "$or"). Nesting allows arbitrary boolean combinations.
    """

    filters: Optional[List["MetadataFilter"]] = None
    groups: Optional[List["MetadataWhereClause"]] = None
    conditional_operator: Literal["$and", "$or"] = "$and"

    def to_chroma_where(self) -> Dict:
        """Convert this clause into a Chroma ``where`` dict.

        Returns:
            ``{}`` when the clause is empty, the single condition dict when
            there is exactly one part, otherwise
            ``{conditional_operator: [part, ...]}``.
        """
        parts: List[Dict] = []

        # Direct field filters.
        if self.filters:
            for f in self.filters:
                parts.append(
                    {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
                )

        # Nested groups, serialized recursively.
        if self.groups:
            for g in self.groups:
                sub = g.to_chroma_where()
                # Bug fix: an empty nested group used to contribute a bare {}
                # part, producing invalid clauses like {"$and": [..., {}]}.
                # Skip empty sub-clauses instead.
                if sub:
                    parts.append(sub)

        if not parts:
            return {}

        if len(parts) == 1:
            return parts[0]

        # More than one part → wrap with the conditional operator.
        return {self.conditional_operator: parts}
 
 
 
modules/db/relevance.py CHANGED
@@ -1,4 +1,11 @@
1
  from chromadb.api.types import QueryResult
 
 
 
 
 
 
 
2
 
3
  def validate_relevance_queryresult(query: str, result: QueryResult, max_distance: float = 0.35):
4
  """
@@ -20,24 +27,24 @@ def validate_relevance_queryresult(query: str, result: QueryResult, max_distance
20
  distances = result.get("distances", [])
21
 
22
  if not documents:
23
- return {
24
  "status": "not_found",
25
  "reason": "No results",
26
  "result": result
27
- }
28
 
29
  # distances can be List[List[float]]; get the first distance of the first result
30
  best_distance = distances[0][0] if distances and isinstance(distances[0], list) else (distances[0] if distances else float('inf'))
31
 
32
  if best_distance > max_distance:
33
- return {
34
  "status": "not_relevant",
35
  "reason": f"Best distance {best_distance:.4f} > {max_distance}",
36
  "result": result
37
- }
38
 
39
- return {
40
  "status": "ok",
41
  "reason": "Relevant",
42
  "result": result
43
- }
 
1
  from chromadb.api.types import QueryResult
2
+ from dataclasses import dataclass
3
+
4
@dataclass
class ValidationOutcome:
    """Outcome of a relevance check on a Chroma query result."""

    status: str  # "ok", "not_found", or "not_relevant"
    reason: str  # human-readable explanation of the status
    result: QueryResult  # the query result that was validated
9
 
10
  def validate_relevance_queryresult(query: str, result: QueryResult, max_distance: float = 0.35):
11
  """
 
27
  distances = result.get("distances", [])
28
 
29
  if not documents:
30
+ return ValidationOutcome(**{
31
  "status": "not_found",
32
  "reason": "No results",
33
  "result": result
34
+ })
35
 
36
  # distances can be List[List[float]]; get the first distance of the first result
37
  best_distance = distances[0][0] if distances and isinstance(distances[0], list) else (distances[0] if distances else float('inf'))
38
 
39
  if best_distance > max_distance:
40
+ return ValidationOutcome(**{
41
  "status": "not_relevant",
42
  "reason": f"Best distance {best_distance:.4f} > {max_distance}",
43
  "result": result
44
+ })
45
 
46
+ return ValidationOutcome(**{
47
  "status": "ok",
48
  "reason": "Relevant",
49
  "result": result
50
+ })
modules/nodes/chat.py CHANGED
@@ -8,15 +8,11 @@ from tools import (
8
  tool_search_web,
9
  tool_push,
10
  tool_get_standardized_azhwar_names,
11
- tool_search_db_by_metadata,
12
  tool_get_standardized_divya_desam_names,
13
- tool_search_db_for_literal,
14
  )
15
 
16
  tools = [
17
- tool_search_db_by_metadata,
18
  tool_search_db,
19
- tool_search_db_for_literal,
20
  tool_get_standardized_azhwar_names,
21
  tool_get_standardized_prabandham_names,
22
  tool_get_standardized_divya_desam_names,
 
8
  tool_search_web,
9
  tool_push,
10
  tool_get_standardized_azhwar_names,
 
11
  tool_get_standardized_divya_desam_names,
 
12
  )
13
 
14
  tools = [
 
15
  tool_search_db,
 
16
  tool_get_standardized_azhwar_names,
17
  tool_get_standardized_prabandham_names,
18
  tool_get_standardized_divya_desam_names,
modules/nodes/init.py CHANGED
@@ -27,26 +27,7 @@ def init_system_prompt_node(state: ChatState) -> ChatState:
27
  content=f"Here is the list of all scriptures along with their metadata configurations:\n{json.dumps(scriptures, separators=(',', ':'))}\n"
28
  ),
29
  SystemMessage(
30
- content="""
31
- You have access to three scripture search tools. You MUST follow these rules when choosing a tool:
32
-
33
- 1. **tool_search_db_by_metadata** – Use this when the user explicitly provides metadata criteria such as:
34
- - Specific azhwar name
35
- - Prabandham or prabandham code
36
- - Verse number or decade number
37
- - Divya desam name
38
- ⚠️ Always call the corresponding standardization tool first.
39
- - "If the user asks for a specific azhwar, use `tool_get_standardized_azhwar_names` first."
40
- - "If the user asks for a specific prabandham, use `tool_get_standardized_prabandham_names` first."
41
- - "If the user mentions a divya desam, use `tool_get_standardized_divya_desam_names` first."
42
-
43
-
44
- 2. **tool_semantic_vector_search** – Use this when the user asks about themes, ideas, emotions, or meanings without explicit verse numbers or metadata.
45
-
46
- 3. **tool_search_db_by_literal_text** – Use this only if the user explicitly requests an exact phrase match.
47
-
48
- Never call a tool repeatedly with the same arguments. Stop if results don’t change meaningfully.
49
- """
50
  ),
51
  SystemMessage(
52
  content=f"""
@@ -56,28 +37,51 @@ You are a knowledgeable assistant for *{{collection_name}}*.
56
  Languages: Sanskrit, Tamil, and {state['language']}.
57
  Use **only** the verses and notes retrieved from the context. Never fabricate or import external knowledge.
58
 
 
 
 
 
 
 
 
59
  ---
60
 
61
- ### ✅ Default Response Format (always include)
62
 
63
- ### Title : {{scripture}} | {{chapter_title_if_available}} | {{`title`}} | {{author_name_if_available}} | Reference Link {{html_url}} if available
 
64
 
65
- ### 📜 Original Verse(s)
66
- - Show exact native-script verses from the context.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  - Do not translate, transliterate, or explain.
68
  - Preserve line breaks and spacing exactly.
69
 
70
  ### 📜 Sanitized Verse(s)
71
- - Only include this section if sanitization changes anything.
72
  - Sanitize by:
73
- 1. Fixing garbled Unicode characters.
74
  2. Correcting broken diacritics, pulli markers, vowel signs, and punctuation.
75
  3. Preserving original spacing and line order.
76
- - If no change → skip this section entirely.
77
 
78
  ### 📜 {state['language']} – Simple Meaning
79
  - Give a **short, natural summary/meaning** in {state['language']}.
80
- - Keep it concise and error-free.
81
 
82
  ### 🔮 Next Steps
83
  End with a short list of follow-up prompts:
@@ -97,15 +101,19 @@ End with a short list of follow-up prompts:
97
  #### 📜 Transliteration
98
  - Provide verse transliteration in {state['language']} if requested.
99
 
100
- #### 📜 Word-by-Word Meaning
101
  - Provide WBW meaning in English or {state['language']} if requested.
102
 
 
 
 
103
  #### 📜 Detailed Notes / Purport
104
  - Summarize and translate explanatory notes/purports if present in context.
105
 
106
  ---
107
 
108
  ⚠️ Rules:
 
109
  - Do not duplicate content across sections.
110
  - Do not invent verses, meanings, or purports.
111
  - If no context found → reply in {state['language']}:
@@ -127,7 +135,7 @@ End with a short list of follow-up prompts:
127
  )
128
  )
129
  )
130
- state["initialized"] = True
131
 
132
  state["tool_calls"] = 0
133
  state["seen_tool_calls"] = set()
 
27
  content=f"Here is the list of all scriptures along with their metadata configurations:\n{json.dumps(scriptures, separators=(',', ':'))}\n"
28
  ),
29
  SystemMessage(
30
+ content="""The tools are deterministic. Calling the same tool multiple times with the same arguments is not going to yield different results. So NEVER call a tool twice or more with the same arguments. Stop if results don’t change meaningfully."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ),
32
  SystemMessage(
33
  content=f"""
 
37
  Languages: Sanskrit, Tamil, and {state['language']}.
38
  Use **only** the verses and notes retrieved from the context. Never fabricate or import external knowledge.
39
 
40
+ In the context, if there is a variable called `html_url`, then use that directly for `reference_link`. If not, look for `video_id` and use that to construct the youtube url using https://www.youtube.com/watch?v={{video_id}} and store it under `reference_link`
41
+ RULE:
42
+ - If the user asks for "one verse", "any verse", "show me a verse", or similar, always return exactly ONE verse.
43
+ - Do not return multiple verses.
44
+ - Only return multiple verses if the user explicitly asks for more than one.
45
+ In the header at the end for the field `verse_or_page`, show the `verse` or `page` whichever is available in the context and mention Verse `verse` or Page `page` as the case may be.
46
+
47
  ---
48
 
49
+ ### ✅ Default Response Format (always include unless it is a followup question and/or user requests specific details only)
50
 
51
+ ### 🕉️ Scripture
52
+ - Show `collection` if available else skip the entire section including header.
53
 
54
+ ### 📜 Divya Desam
55
+ - Show `divya_desams` if available else skip the entire section including header.
56
+
57
+ ### 📜 Author
58
+ - Show `author` if available else skip the entire section including header.
59
+
60
+ ### 📜 Verse Number
61
+ - Show verse number if available else skip the entire section including header. If available show this as a hyperlink with `html_url` if html_url is available.
62
+
63
+ ### 📜 Title
64
+ - Show `title` if available else skip the entire section including header.
65
+
66
+ ### 📜 Page
67
+ - Show `page` if available else skip the entire section including header.
68
+
69
+ ### 📜 Original Verse
70
+ - Show exact original native-script verses from the context in a separate markdown block.
71
  - Do not translate, transliterate, or explain.
72
  - Preserve line breaks and spacing exactly.
73
 
74
  ### 📜 Sanitized Verse(s)
75
+ - Only include this section if sanitization changes anything otherwise don't even output the section heading .
76
  - Sanitize by:
77
+ 1. Fixing garbled Unicode characters in the original verse section.
78
  2. Correcting broken diacritics, pulli markers, vowel signs, and punctuation.
79
  3. Preserving original spacing and line order.
80
+ - If no change → skip this section entirely including the heading.
81
 
82
  ### 📜 {state['language']} – Simple Meaning
83
  - Give a **short, natural summary/meaning** in {state['language']}.
84
+ - Keep it concise and error-free. Do not give word-by-word meanings here even if available.
85
 
86
  ### 🔮 Next Steps
87
  End with a short list of follow-up prompts:
 
101
  #### 📜 Transliteration
102
  - Provide verse transliteration in {state['language']} if requested.
103
 
104
+ #### 📜 Word-by-Word Meaning (English)
105
  - Provide WBW meaning in English or {state['language']} if requested.
106
 
107
+ #### 📜 Word-by-Word Meaning ({state['language']})
108
+ - Provide WBW meaning {state['language']} if requested.
109
+
110
  #### 📜 Detailed Notes / Purport
111
  - Summarize and translate explanatory notes/purports if present in context.
112
 
113
  ---
114
 
115
  ⚠️ Rules:
116
+ - For a follow-up question, if the user does not specify a context in the question, assume it is for the verse returned by the previous response.For e.g. "word by word meaning" implies that the user wants to know "the word by word meaning for the above pasuram".
117
  - Do not duplicate content across sections.
118
  - Do not invent verses, meanings, or purports.
119
  - If no context found → reply in {state['language']}:
 
135
  )
136
  )
137
  )
138
+ state["initialized"] = True
139
 
140
  state["tool_calls"] = 0
141
  state["seen_tool_calls"] = set()
nalayiram_helper.py CHANGED
@@ -20,7 +20,7 @@ def get_standardized_prabandham_names() -> list[Pasuram]:
20
 
21
  return final_azhwars
22
 
23
- def get_standardized_azhwar_names() -> list[Pasuram]:
24
  """
25
  Get a list of azhwar names along with the pasurams they have authored in divya_prabandham
26
  """
@@ -28,12 +28,12 @@ def get_standardized_azhwar_names() -> list[Pasuram]:
28
  azhwars = json.load(f)
29
  header = azhwars[0]
30
  rows = azhwars[1:]
31
- final_azhwars = [Pasuram(**dict(zip(header, row))) for row in rows]
32
 
33
- return final_azhwars
34
 
35
 
36
- def get_standardized_divya_desam_names() -> list[dict]:
37
  """
38
  Get a list of divya desam names in divya_prabandham
39
  """
@@ -52,7 +52,8 @@ def get_standardized_divya_desam_names() -> list[dict]:
52
  "sampradayam",
53
  "divya_desam",
54
  ]
55
- return [{key : row[key] for key in selected_fields if key in row} for row in divya_desams["pageProps"]["hits"]]
 
56
 
57
 
58
  if __name__ == "__main__":
 
20
 
21
  return final_azhwars
22
 
23
+ def get_standardized_azhwar_names() -> list[str]:
24
  """
25
  Get a list of azhwar names along with the pasurams they have authored in divya_prabandham
26
  """
 
28
  azhwars = json.load(f)
29
  header = azhwars[0]
30
  rows = azhwars[1:]
31
+ final_azhwars = [row[1] for row in rows] ## 2nd field is the azhwar name
32
 
33
+ return sorted(set(final_azhwars))
34
 
35
 
36
+ def get_standardized_divya_desam_names() -> list[str]:
37
  """
38
  Get a list of divya desam names in divya_prabandham
39
  """
 
52
  "sampradayam",
53
  "divya_desam",
54
  ]
55
+ data = [{key : row[key] for key in selected_fields if key in row} for row in divya_desams["pageProps"]["hits"]]
56
+ return sorted(set([row["title"] for row in data]))
57
 
58
 
59
  if __name__ == "__main__":
sanatan_assistant.py CHANGED
@@ -104,96 +104,52 @@ Respond in **Markdown** format only. Ensure Sanskrit/Tamil verses are always cle
104
  return prompt
105
 
106
 
107
- def query(collection_name: allowedCollections, query: str, n_results=3):
108
- """
109
- Search a scripture collection.
110
-
111
- Parameters:
112
- - collection_name (str): The name of the scripture collection to search. ...
113
- - query (str): The search query.
114
- - n_results (int): Number of results to return. Default is 3.
115
-
116
- Returns:
117
- - A list of matching results.
118
- """
119
- logger.info("Semantic Search: Searching collection [%s] for [%s]", collection_name, query)
120
- response = sanatanDatabase.search(
121
- collection_name=collection_name, query=query, n_results=n_results
122
- )
123
-
124
- return "\n\n".join(
125
- f"Document: {doc}\nMetadata: {meta}\nID: {id_}"
126
- for doc, meta, id_ in zip(
127
- response["documents"], response["metadatas"], response["ids"]
128
- )
129
- )
130
-
131
- def query_by_metadata_field(
132
  collection_name: allowedCollections,
133
- query: str,
134
- metadata_where_clause : MetadataWhereClause,
135
  n_results=3,
 
136
  ):
137
  """
138
- Search a scripture collection by metadata. Do NOT use this for semantic search. Only use when a specific metadata field is provided.
139
 
140
  Parameters:
141
- - collection_name (str): The name of the scripture collection to search. ...
142
- - query (str): The search query.
143
- - metadata_where_clause: the filter which is an array of the following type
144
- - metadata_field (str) : The name of the metadata field. e.g. azhwar_name
145
- - metadata_search_operator (str) : The search operator e.g. $eq or $in. DO NOT use $regex.
146
- - metadata_value : Value to search for can be any primitive datatype like str or int (or a list[str] if metadata_search_operator = '$in'). for e.g. Thirumangai Azhwar or '2233' or 2233
147
  - n_results (int): Number of results to return. Default is 3.
 
148
 
149
  Returns:
150
  - A list of matching results.
151
  """
152
- logger.info("Searching collection [%s] for [%s]", collection_name, query)
153
-
 
 
 
 
 
 
 
 
 
154
  try:
155
- sanatanConfig.is_metadata_field_allowed(collection_name=collection_name, metadata_where_clause=metadata_where_clause)
 
 
 
 
156
  except:
157
  raise
158
 
159
- response = sanatanDatabase.search_by_metadata(
160
  collection_name=collection_name,
161
  query=query,
162
  metadata_where_clause=metadata_where_clause,
163
  n_results=n_results,
164
- )
165
-
166
- return "\n\n".join(
167
- f"Document: {doc}\nMetadata: {meta}\nID: {id_}"
168
- for doc, meta, id_ in zip(
169
- response["documents"], response["metadatas"], response["ids"]
170
- )
171
- )
172
-
173
-
174
- def query_by_literal_text(
175
- collection_name: allowedCollections,
176
- literal_to_search_for: str,
177
- n_results=3,
178
- ):
179
- """
180
- Search a scripture collection by a literal. Do NOT use this for semantic search. Only use when the user specifically asks for literal search.
181
-
182
- Parameters:
183
- - collection_name (str): The name of the scripture collection to search. ...
184
- - literal_to_search_for (str): The search query.
185
- - n_results (int): Number of results to return. Default is 3.
186
-
187
- Returns:
188
- - A list of matching results.
189
- """
190
- logger.info("Performing literal search in collection [%s] for [%s]", collection_name, literal_to_search_for)
191
-
192
-
193
- response = sanatanDatabase.search_for_literal(
194
- collection_name=collection_name,
195
- literal_to_search_for=literal_to_search_for,
196
- n_results=n_results,
197
  )
198
 
199
  return "\n\n".join(
 
104
  return prompt
105
 
106
 
107
+ def query(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  collection_name: allowedCollections,
109
+ query: str | None = None,
110
+ metadata_where_clause: MetadataWhereClause | None = None,
111
  n_results=3,
112
+ search_type: Literal["semantic", "literal", "random"] = "semantic",
113
  ):
114
  """
115
+ Search a scripture collection.
116
 
117
  Parameters:
118
+ - collection_name (str): The name of the scripture collection to search (use the exact name from the metadata configuration. ...
119
+ - query (str): The search query - this is the semantic or literal query you want to search for. if you want to perform a random search or just want to search by metadata only, can be passed as None ..
120
+ - metadata_where_clause: MetadataWhereClause - Set to None if no metadata filters are requested. Always set when user mentions a specific prabandham, azhwar, or any other known field from the configuration. Example: {\"prabandham_name\": \"Thiruvaimozhi\"}. use the `conditional_operator` to filter based on $and or $or conditions. use `groups` to combine multiple queries into one.
 
 
 
121
  - n_results (int): Number of results to return. Default is 3.
122
+ - search_type: can be one of semantic, literal or random.
123
 
124
  Returns:
125
  - A list of matching results.
126
  """
127
+ logger.info(
128
+ "%s Search: collection [%s] for [%s] | metadata_where_clause=%s",
129
+ search_type,
130
+ collection_name,
131
+ query,
132
+ metadata_where_clause,
133
+ )
134
+ if search_type != "random" and metadata_where_clause is None and query is None:
135
+ raise Exception(
136
+ "Invalid input: when search type is not random, either metadata_where_clause or query should be provided"
137
+ )
138
  try:
139
+ if metadata_where_clause is not None:
140
+ sanatanConfig.is_metadata_field_allowed(
141
+ collection_name=collection_name,
142
+ metadata_where_clause=metadata_where_clause,
143
+ )
144
  except:
145
  raise
146
 
147
+ response = sanatanDatabase.search(
148
  collection_name=collection_name,
149
  query=query,
150
  metadata_where_clause=metadata_where_clause,
151
  n_results=n_results,
152
+ search_type=search_type,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  )
154
 
155
  return "\n\n".join(
server.py CHANGED
@@ -6,7 +6,7 @@ from fastapi import APIRouter, Request
6
  from fastapi.responses import JSONResponse
7
  import pycountry
8
  from pydantic import BaseModel
9
- from app import chat
10
  from config import SanatanConfig
11
  from db import SanatanDatabase
12
 
 
6
  from fastapi.responses import JSONResponse
7
  import pycountry
8
  from pydantic import BaseModel
9
+ from chat_utils import chat
10
  from config import SanatanConfig
11
  from db import SanatanDatabase
12
 
tests/test_config.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Example test questions consumed by tests/test_evaluator.py.
# Each entry describes one evaluation case:
#   q                       - natural-language question sent to the bot
#   type                    - test category ("composite", "semantic", ...)
#   difficulty              - rough difficulty label
#   expected_answer_summary - what a correct answer should contain
#   expected_sources        - sources/places the answer should reference
#   expected_azhwar         - expected author(s)
#   expected_topics/expected_keywords - optional content checks (some entries)
#   n_results               - number of verses the answer should return
TEST_QUESTIONS = [
    {
        "q": "one pasuram on thirukudandai and another from srirangam both written by thirumangai azhwar",
        "type": "composite",
        "difficulty": "complex",
        "expected_answer_summary": "Should return one pasuram from Thirukudanthai and another from Srirangam, both authored by Thirumangai Azhwar.",
        "expected_sources": ["Thirukudanthai", "Srirangam"],
        "expected_azhwar": ["Thirumangai Azhwar"],
        "n_results": 2,
    },
    {
        "q": "give me 2 pasurams, one written by thirumazhisai alwar and the other by thirumangai azhwar, both written on divya desam Srirangam",
        "type": "composite",
        "difficulty": "complex",
        "expected_answer_summary": "Should return two pasurams on Srirangam: one by Thirumazhisai Azhwar and the other by Thirumangai Azhwar.",
        "expected_sources": ["Srirangam"],
        "expected_azhwar": ["Thirumazhisai Azhwar", "Thirumangai Azhwar"],
        "n_results": 2,
    },
    {
        "q": "a pasuram from nanmugan thiruvandhadhi that talks about Krishna playing flute",
        "type": "semantic",
        "difficulty": "medium",
        "expected_answer_summary": "Should return 1 pasuram from Nanmukan Thiruvanthathi.",
        "expected_sources": ["Nanmukan Thiruvanthathi"],
        "expected_azhwar": ["Thirumazhisai Azhwar"],
        "expected_topics": ["Krishna", "Flute"],
        "n_results": 1,
    },
    {
        "q": "varaha avatar in nanmugan thiruvandhadhi",
        "type": "semantic",
        "difficulty": "medium",
        "expected_answer_summary": "Should return 1 pasuram from Nanmukan Thiruvanthathi.",
        "expected_sources": ["Nanmukan Thiruvanthathi"],
        "expected_azhwar": ["Thirumazhisai Azhwar"],
        "expected_keywords": ["boar"],
        "n_results": 1,
    },
    {
        "q": "varaha avatar in nanmugan thiruvandadhi and perumal thirumozhi",
        "type": "semantic+composite",
        "difficulty": "medium",
        "expected_answer_summary": "Should return 2 pasurams. One from Nanmukan Thiruvanthathi and another from perumal thirumozhi.",
        "expected_sources": ["Nanmukan Thiruvanthathi", "perumal thirumozhi"],
        "expected_azhwar": ["Thirumazhisai Azhwar", "Kulasekhara Azhwar"],
        "expected_keywords": ["boar"],
        "n_results": 2,
    },
]
tests/test_evaluator.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ import openai
4
+ import json
5
+
6
+ from chat_utils import chat
7
+ from tests.test_config import TEST_QUESTIONS
8
+
9
def _parse_validation(content):
    """Parse the validator LLM's reply into a ``{"valid", "feedback"}`` dict.

    Chat models frequently wrap JSON answers in markdown code fences
    (```` ```json ... ``` ````), which ``json.loads`` rejects; strip a
    surrounding fence before parsing.

    Args:
        content: raw message text returned by the validator model (may be None).

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: if *content* is None/empty.
        json.JSONDecodeError: if the payload is not valid JSON.
    """
    if not content:
        raise ValueError("validator returned empty content")
    text = content.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        text = text.split("\n", 1)[1] if "\n" in text else ""
        # ... and the closing fence, if present.
        stripped = text.rstrip()
        if stripped.endswith("```"):
            text = stripped[:-3]
    return json.loads(text)


def validate_with_ai(test_entry, bot_response):
    """
    Validate a narrative bot response against a test's expected attributes.

    The bot does not need to output JSON: a validator LLM reads the narrative
    response and returns a JSON verdict, which is parsed here.

    Args:
        test_entry: one entry from TEST_QUESTIONS (expected_* fields, n_results).
        bot_response: the bot's free-form answer text.

    Returns:
        dict with keys ``valid`` (bool) and ``feedback`` (str). On any parsing
        failure a ``valid=False`` dict with the error in ``feedback`` is
        returned instead of raising.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
"valid": true/false, // True if bot response matches the expected attributes
"feedback": "short explanation why it passed or failed"
}}

Do **not** ask the bot to output the JSON itself. You should parse the narrative internally and return JSON.
"""
    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        content = resp.choices[0].message.content
        return _parse_validation(content)
    except Exception as e:
        # Best-effort: a malformed validator reply fails the test with context
        # rather than aborting the whole run.
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}
46
def run_tests(debug_mode=False):
    """Run every configured test question through the chat pipeline and log results.

    For each entry in TEST_QUESTIONS the bot is queried, the narrative answer
    is graded by ``validate_with_ai``, and everything is appended to a
    timestamped markdown report under ``outputs/tests/``. Progress is also
    printed to stdout.

    Args:
        debug_mode: forwarded to ``chat()``; enables verbose bot-side logging.
    """
    # NOTE(review): a single history list is shared across all tests, so each
    # question sees the previous answers as conversation context — confirm
    # this cross-test contamination is intentional.
    history = []
    thread_id = "test_thread"

    # Ensure the report directory exists.
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # One markdown report per run, named by timestamp.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []

    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")
        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']} \n")
            f.write(f"**Difficulty:** {test['difficulty']} \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)
            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            # Defensive access: treat a malformed validator dict as a failure
            # instead of crashing the whole run on a KeyError.
            is_valid = validation.get("valid", False)
            feedback = validation.get("feedback", "")
            f.write(f"### Validation\n- **Valid:** {is_valid}\n- **Feedback:** {feedback}\n\n")
            print(f"Valid: {is_valid}\nFeedback: {feedback}")

            results_summary.append({"question": test["q"], "valid": is_valid})
            if is_valid:
                passed_tests += 1

        _write_run_summary(f, total_tests, passed_tests, results_summary)

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")


def _write_run_summary(f, total_tests, passed_tests, results_summary):
    """Append the run summary and per-test results table to the open report file."""
    failed_tests = total_tests - passed_tests
    pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
    f.write("# Run Summary\n\n")
    f.write(f"- **Total Tests:** {total_tests}\n")
    f.write(f"- **Passed:** {passed_tests}\n")
    f.write(f"- **Failed:** {failed_tests}\n")
    f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")
    # Table of all test results.
    f.write("## Test Results Table\n\n")
    f.write("| Test | Question | Valid |\n")
    f.write("|------|----------|-------|\n")
    for i, res in enumerate(results_summary, start=1):
        valid_str = "✅" if res["valid"] else "❌"
        f.write(f"| {i} | {res['question']} | {valid_str} |\n")


if __name__ == "__main__":
    run_tests(debug_mode=True)
tools.py CHANGED
@@ -13,8 +13,6 @@ from serperdev_helper import search as search_web
13
  from sanatan_assistant import (
14
  format_scripture_answer,
15
  query,
16
- query_by_metadata_field,
17
- query_by_literal_text,
18
  )
19
 
20
  tool_push = Tool(
@@ -25,52 +23,37 @@ allowed_collections = [s["collection_name"] for s in SanatanConfig.scriptures]
25
 
26
  tool_search_db = StructuredTool.from_function(
27
  query,
28
- name="tool_semantic_vector_search",
29
  description=(
30
- "🚫 Never use this tool if the user asks for a verse number, pasuram number, or any explicit metadata field "
31
- "(like azhwar name, prabandham, divya desam, decade, or chapter). "
32
- "✅ Only use this tool when the query is vague or thematic, e.g. "
33
- "'Which pasurams talk about Krishna's childhood?' or 'Show me verses about compassion'. "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  f"The collection_name must be one of: {', '.join(allowed_collections)}."
35
  ),
36
  )
37
 
38
 
39
- tool_search_db_for_literal = StructuredTool.from_function(
40
- query_by_literal_text,
41
- name="tool_search_db_by_literal_text",
42
- description=(
43
- "🚫 Never use this tool by default."
44
- " ✅ Only use this tool if the user explicitly requests a 'literal match', 'exact phrase search', or uses words like 'match exactly', 'find the exact string', 'verbatim', or 'literal text'."
45
- " If the user simply asks for a verse number (e.g., verse 34, pasuram 2.3.5, sahasranamam verse 20), you must NOT use this tool — instead you must use `tool_search_db_by_metadata`."
46
- " Do not fall back to this tool if semantic or metadata search seems difficult or fails — it is reserved strictly for explicit literal match requests."
47
- f" The collection_name must be one of: {', '.join(allowed_collections)}."
48
- ),
49
- )
50
-
51
-
52
-
53
- tool_search_db_by_metadata = StructuredTool.from_function(
54
- query_by_metadata_field,
55
- name="tool_search_db_by_metadata",
56
- description=(
57
- "Use this tool **only when the user provides explicit metadata criteria**, such as: azhwar name, pasuram number, verse number, decade, prabandham name, or divya desam name."
58
- " This is not meant for general queries."
59
- f" The collection_name must be one of: {', '.join(allowed_collections)}."
60
- "You *MUST* ALWAYS call one of the standardization tools available to get the correct entity name before using this tool."
61
- "If the user asks for a specific azhwar, use `tool_get_standardized_azhwar_names` first."
62
- "If the user asks for a specific prabandham, use `tool_get_standardized_prabandham_names` first."
63
- "If the user mentions a divya desam, use `tool_get_standardized_divya_desam_names` first."
64
- "If you set metadata_search_operator to $in, then metadata_value must always be a list — even if it contains only a single item."
65
- """🔒 Important:
66
- When using the tool_get_standardized_azhwar_names, tool_get_standardized_divya_desam_names, or any similar standardization tool, you must use the standardized name exactly as returned by the tool — without modifying, reformatting, translating, or simplifying it in any way.
67
- For example, if the tool returns Thirumālirum Solai, you must pass that exact string to tool_search_db_by_metadata. Do not change it to Thirumalirum Solai, Tirumalirumsolai, or anything else.
68
- 🔍 This is critical for the search to return results correctly.
69
- 🚫 Any deviation will cause the search to fail or miss results."""
70
- ),
71
- )
72
-
73
-
74
  tool_search_web = Tool(
75
  name="search_web", description="Search the web for information", func=search_web
76
  )
 
13
  from sanatan_assistant import (
14
  format_scripture_answer,
15
  query,
 
 
16
  )
17
 
18
  tool_push = Tool(
 
23
 
24
# Primary DB search tool. Its description doubles as the prompt the agent
# reads to decide when/how to call it, so wording defects directly mislead
# the model. Fixes: contradictory 🚫 emoji on a positive instruction (→ ✅),
# "angument" typo, and a malformed list literal in the `$in` example.
tool_search_db = StructuredTool.from_function(
    query,
    name="tool_search_db",
    description=(
        "✅ use this tool to fetch any data from the database."
        "rules for metadata_where_clause:"
        """
- ⚠️ Every time you include a metadata_where_clause argument, you must first call the appropriate standardization tool (tool_get_standardized_divya_desam_names,tool_get_standardized_prabandham_names,tool_get_standardized_azhwar_names). Never insert raw values directly. Even if the input already looks correct, you must still call the tool. If you fail to do this, the query will be invalid.
> Standardization Step 1: Call the standardization tool to get the canonical Divya Desam name.
|--Example:
|----standardized_divya_desams = tool_get_standardized_divya_desam_names()
|----standardized_divya_desam = look for closest match to "Thirukkudandai" in standardized_divya_desams
> Standardization Step 2: Use the standardized name in your DB search argument for metadata_where_clause for the field divya_desams.
- When choosing collection_name argument for the tool_search_db, make sure you choose the exact collection_name from the metadata configuration above
- Always prefer a single tool call with composite filters rather than multiple calls.
- For MetadataWhereClause.filters.$.metadata_search_operator do not use $regex as argument. use semantic search option by using query argument instead.
- If user posts a thematic question, do not ignore the theme when you pass `query` arguments.
- Use `MetadataWhereClause` recursively with `filters` and `groups` to build nested conditions.
"""
        "- Always set metadata filters when user mentions a specific divya desam, prabandham, azhwar, or any other known field from the configuration. Example: {\"prabandham_name\": \"Thiruvaimozhi\"}."
        "- Multiple metadata filters can be passed at the same time."
        "- If passing '$in' as metadata_search_operator, the metadata_value should always be of type array. for instance {'metadata_field': 'divya_desams', 'metadata_search_operator': '$in', 'metadata_value': ['Srirangam']}"
        "- Set metadata filters as None if no metadata filter is requested.\n"
        "rules for search_type:"
        "- use `random` if user does not provide a thematic/semantic search request. For e.g. 'any pasuram' or 'any pasuram from thiruvaimozhi'"
        "- use `semantic` if user provides thematic/semantic search request"
        "- use `literal` ONLY if user specifically requests for a literal search."
        "\n"
        f"The collection_name must be one of: {', '.join(allowed_collections)}."
    ),
)
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
# Generic web-search fallback; delegates straight to the serper.dev helper.
tool_search_web = Tool(
    func=search_web,
    name="search_web",
    description="Search the web for information",
)