Upload folder using huggingface_hub
- .gitignore +1 -0
- README.md +17 -0
- app.py +1 -271
- chat_utils.py +287 -0
- config.py +20 -9
- db.py +172 -58
- metadata.py +41 -13
- modules/db/relevance.py +13 -6
- modules/nodes/chat.py +0 -4
- modules/nodes/init.py +38 -30
- nalayiram_helper.py +6 -5
- sanatan_assistant.py +27 -71
- server.py +1 -1
- tests/test_config.py +51 -0
- tests/test_evaluator.py +108 -0
- tools.py +25 -42
.gitignore CHANGED
@@ -11,3 +11,4 @@ wheels/
 .env
 chromadb-store/
 chromadb-store.zip
+outputs/
README.md CHANGED
@@ -5,3 +5,20 @@ sdk: gradio
 sdk_version: 5.38.0
 python_version: 3.12
 ---
+
+### Introduction
+This is an Agentic-AI project that integrates all Hindu Sanatan Dharma scriptures into a single searchable platform.
+
+### Supported Channels
+- Web (https://huggingface.co/spaces/vikramvasudevan/sanatan_ai)
+- Android (bhashyam.ai app)
+
+### Start Web Server
+- Run the following command from the project root:
+> `uv run ./main.py`
+
+### Automated AI Evaluator
+- Tests are defined in tests/test_config.py
+- Run the following command from the project root to execute the tests:
+> `uv run -m tests.test_evaluator`
+- Test logs are generated under the `{project-root}/outputs/tests` folder as neatly formatted Markdown files.
app.py CHANGED
@@ -13,6 +13,7 @@ from langchain_core.messages.ai import AIMessageChunk, AIMessage
 from langchain_core.messages.system import SystemMessage
 from langchain_core.messages.tool import ToolMessage
 
+from chat_utils import chat_wrapper
 from config import SanatanConfig
 from db import SanatanDatabase
 from drive_downloader import ZipDownloader
@@ -63,277 +64,6 @@ def render_message_with_tooltip(content: str, max_chars=200):
     return f"<div title='{escape(content)}'>{short}</div>"
 
 
-thinking_verbs = [
-    "thinking",
-    "processing",
-    "crunching data",
-    "please wait",
-    "just a few more seconds",
-    "closing in",
-    "analyzing",
-    "reasoning",
-    "computing",
-    "synthesizing insight",
-    "searching through the cosmos",
-    "decoding ancient knowledge",
-    "scanning the scriptures",
-    "accessing divine memory",
-    "gathering wisdom",
-    "consulting the rishis",
-    "listening to the ātmā",
-    "channeling sacred energy",
-    "unfolding the divine word",
-    "meditating on the meaning",
-    "reciting from memory",
-    "traversing the Vedas",
-    "seeking the inner light",
-    "invoking paramārtha",
-    "putting it all together",
-    "digging deeper",
-    "making sense of it",
-    "connecting the dots",
-    "almost there",
-    "getting closer",
-    "wrapping it up",
-    "piecing it together",
-    "swirling through verses",
-    "diving into the ocean of knowledge",
-    "lighting the lamp of understanding",
-    "walking the path of inquiry",
-    "aligning stars of context",
-]
-
-
-async def chat_wrapper(
-    message, history, thread_id, debug, preferred_language="English"
-):
-    if debug:
-        async for chunk in chat_streaming(
-            debug, message, history, thread_id, preferred_language=preferred_language
-        ):
-            yield chunk
-    else:
-        response = chat(
-            debug, message, history, thread_id, preferred_language=preferred_language
-        )
-        yield response
-
-
-def chat(debug_mode, message, history, thread_id, preferred_language="English"):
-    config = {"configurable": {"thread_id": thread_id}}
-    response = graph.invoke(
-        {
-            "debug_mode": debug_mode,
-            "messages": [{"role": "user", "content": message}],
-            "language": preferred_language,
-        },
-        config=config,
-    )
-    return response["messages"][-1].content
-
-
-def add_node_to_tree(
-    node_tree: list[str], node_label: str, tooltip: str = "no arguments to show"
-) -> list[str]:
-    if tooltip:
-        tooltip = escape(tooltip).replace("'", "'")
-        node_with_tooltip = (
-            f"<span class='node-label' title='{tooltip}'>{node_label}</span>"
-        )
-    else:
-        node_with_tooltip = node_label
-    node_tree[-1] = node_with_tooltip
-    node_tree.append("<span class='spinner'> </span>")
-    return node_tree
-
-
-def end_node_tree(node_tree: list[str]) -> list[str]:
-    node_tree[-1] = "🏁"
-    return node_tree
-
-
-def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
-    return (
-        tool_calls_buffer[tool_call_id]["args_str"]
-        if tool_call_id in tool_calls_buffer
-        and "args_str" in tool_calls_buffer[tool_call_id]
-        else ""
-    )
-
-
-async def chat_streaming(
-    debug_mode: bool, message, history, thread_id, preferred_language="English"
-):
-    state = {
-        "debug_mode": debug_mode,
-        "messages": (history or []) + [{"role": "user", "content": message}],
-        "language": preferred_language,
-    }
-    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 15}
-    start_time = time.time()
-    streamed_response = ""
-    final_response = ""
-    # final_node = "validator"
-
-    MAX_CONTENT = 500
-
-    try:
-        node_tree = ["🚩", "<span class='spinner'> </span>"]
-
-        tool_calls_buffer = {}
-
-        async for msg, metadata in graph.astream(
-            state, config=config, stream_mode="messages"
-        ):
-            node = metadata.get("langgraph_node", "?")
-            name = getattr(msg, "name", "-")
-            if not isinstance(msg, ToolMessage):
-                node_icon = "🧠"
-            else:
-                node_icon = "⚙️"
-            node_label = f"{node}"
-            tool_label = f"{name or ''}"
-            if tool_label:
-                node_label = node_label + f":{tool_label}"
-            label = f"{node_icon} {node_label}"
-            tooltip = ""
-            if isinstance(msg, ToolMessage):
-                tooltip = get_args_for_toolcall(tool_calls_buffer, msg.tool_call_id)
-                # logger.info("tooltip = ", tooltip)
-
-            # checking for -2, last but one, since the last entry is always a spinner
-            if node_tree[-2] != label:
-                add_node_to_tree(node_tree, label, tooltip)
-            full: str = escape(msg.content)
-            truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
-
-            def generate_processing_message():
-                return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
-
-            if (
-                not isinstance(msg, ToolMessage)
-                and not isinstance(msg, SystemMessage)
-                and not isinstance(msg, AIMessageChunk)
-            ):
-                logger.info("msg = %s", msg)
-            if isinstance(msg, ToolMessage):
-                logger.debug("tool message = %s", msg)
-
-                html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
-                yield f"### {' → '.join(node_tree)}\n{html}"
-            elif isinstance(msg, AIMessageChunk):
-
-                def truncate_middle(text, front=50, back=50):
-                    if not text:
-                        return ""
-                    if len(text) <= front + back:
-                        return text
-                    return f"{text[:front]}…{text[-back:]}".replace(
-                        "\n", ""
-                    )  # remove new lines.
-
-                if not msg.content:
-                    # logger.warning("*** No Message Chunk!")
-                    yield f"### {' → '.join(node_tree)}\n{generate_processing_message()}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
-                else:
-                    # Stream intermediate messages with transparent style
-                    # if node != final_node:
-                    streamed_response += msg.content
-                    yield f"### {' → '.join(node_tree)}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
-                    # else:
-                    # Buffer the final validated response instead of yielding
-                    final_response += msg.content
-
-                if msg.tool_call_chunks:
-                    for tool_call_chunk in msg.tool_call_chunks:
-                        logger.debug("*** tool_call_chunk = %s", tool_call_chunk)
-                        if tool_call_chunk["id"] is not None:
-                            tool_call_id = tool_call_chunk["id"]
-
-                            if tool_call_id not in tool_calls_buffer:
-                                tool_calls_buffer[tool_call_id] = {
-                                    "name": "",
-                                    "args_str": "",
-                                    "id": tool_call_id,
-                                    "type": "tool_call",
-                                }
-
-                        # Accumulate tool call name and arguments
-                        if tool_call_chunk["name"] is not None:
-                            tool_calls_buffer[tool_call_id]["name"] += tool_call_chunk[
-                                "name"
-                            ]
-                        if tool_call_chunk["args"] is not None:
-                            tool_calls_buffer[tool_call_id][
-                                "args_str"
-                            ] += tool_call_chunk["args"]
-            else:
-                logger.debug("message = %s %s", type(msg), msg.content[:100])
-                full: str = escape(msg.content)
-                truncated = (
-                    (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
-                )
-                html = (
-                    f"<div class='thinking-bubble'><em>🤔 {random.choice(thinking_verbs)} ...</em></div>"
-                    f"<div style='opacity: 0.1'>"
-                    f"<strong>Telling myself:</strong> {truncated or '...'}"
-                    f"</div>"
-                )
-                yield f"### {' → '.join(node_tree)}\n{html}"
-                if getattr(msg, "tool_calls", []):
-                    logger.info("ELSE::tool_calls = %s", msg.tool_calls)
-
-        node_tree[-1] = "✅"
-        end_time = time.time()
-        duration = end_time - start_time
-        final_response = (
-            f"\n{final_response}" f"\n\n⏱️ Processed in {duration:.2f} seconds"
-        )
-        buffer = f"### {' → '.join(node_tree)}\n"
-        yield buffer
-        for c in final_response:
-            buffer += c
-            yield buffer
-            await asyncio.sleep(0.0005)
-
-        logger.debug("************************************")
-        # Now, you can process the complete tool calls from the buffer
-        for tool_call_id, accumulated_tool_call in tool_calls_buffer.items():
-            # Attempt to parse arguments only if the 'args_str' isn't empty
-            if accumulated_tool_call["args_str"]:
-                try:
-                    parsed_args = json.loads(accumulated_tool_call["args_str"])
-                    logger.debug(f"Tool Name: {accumulated_tool_call['name']}")
-                    logger.debug(f"Tool Arguments: {parsed_args}")
-                except json.JSONDecodeError:
-                    logger.debug(
-                        f"Partial arguments for tool {accumulated_tool_call['name']}: {accumulated_tool_call['args_str']}"
-                    )
-    except asyncio.CancelledError:
-        logger.warning("⚠️ Request cancelled by user")
-        node_tree = end_node_tree(node_tree=node_tree)
-        yield (
-            f"### {' → '.join(node_tree)}"
-            "\n⚠️⚠️⚠️ Request cancelled by user"
-            "\nhere is what I got so far ...\n"
-            f"\n{streamed_response}"
-        )
-        # Important: re-raise if you want upstream to also know
-        # raise
-        return
-    except Exception as e:
-        logger.error("❌❌❌ Error processing request: %s", e)
-        traceback.print_exc()
-        node_tree = end_node_tree(node_tree=node_tree)
-        yield (
-            f"### {' → '.join(node_tree)}"
-            f"\n❌❌❌ Error processing request : {str(e)}"
-            "\nhere is what I got so far ...\n"
-            f"\n{streamed_response}"
-        )
-        return
-
-
 # UI Elements
 thread_id = gr.State(init_session)
 
chat_utils.py ADDED
@@ -0,0 +1,287 @@
+import json
+import random
+import asyncio
+import logging
+import time
+import traceback
+from html import escape
+from langchain_core.messages.ai import AIMessageChunk
+from langchain_core.messages.system import SystemMessage
+from langchain_core.messages.tool import ToolMessage
+
+from graph_helper import generate_graph
+
+# Logging
+logging.basicConfig()
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+thinking_verbs = [
+    "thinking",
+    "processing",
+    "crunching data",
+    "please wait",
+    "just a few more seconds",
+    "closing in",
+    "analyzing",
+    "reasoning",
+    "computing",
+    "synthesizing insight",
+    "searching through the cosmos",
+    "decoding ancient knowledge",
+    "scanning the scriptures",
+    "accessing divine memory",
+    "gathering wisdom",
+    "consulting the rishis",
+    "listening to the ātmā",
+    "channeling sacred energy",
+    "unfolding the divine word",
+    "meditating on the meaning",
+    "reciting from memory",
+    "traversing the Vedas",
+    "seeking the inner light",
+    "invoking paramārtha",
+    "putting it all together",
+    "digging deeper",
+    "making sense of it",
+    "connecting the dots",
+    "almost there",
+    "getting closer",
+    "wrapping it up",
+    "piecing it together",
+    "swirling through verses",
+    "diving into the ocean of knowledge",
+    "lighting the lamp of understanding",
+    "walking the path of inquiry",
+    "aligning stars of context",
+]
+
+graph = generate_graph()
+
+def add_node_to_tree(
+    node_tree: list[str], node_label: str, tooltip: str = "no arguments to show"
+) -> list[str]:
+    if tooltip:
+        tooltip = escape(tooltip).replace("'", "'")
+        node_with_tooltip = (
+            f"<span class='node-label' title='{tooltip}'>{node_label}</span>"
+        )
+    else:
+        node_with_tooltip = node_label
+    node_tree[-1] = node_with_tooltip
+    node_tree.append("<span class='spinner'> </span>")
+    return node_tree
+
+
+def end_node_tree(node_tree: list[str]) -> list[str]:
+    node_tree[-1] = "🏁"
+    return node_tree
+
+
+def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
+    return (
+        tool_calls_buffer[tool_call_id]["args_str"]
+        if tool_call_id in tool_calls_buffer
+        and "args_str" in tool_calls_buffer[tool_call_id]
+        else ""
+    )
+
+
+async def chat_wrapper(
+    message, history, thread_id, debug, preferred_language="English"
+):
+    if debug:
+        async for chunk in chat_streaming(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        ):
+            yield chunk
+    else:
+        response = chat(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        )
+        yield response
+
+
+def chat(debug_mode, message, history, thread_id, preferred_language="English"):
+    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 30}
+    response = graph.invoke(
+        {
+            "debug_mode": debug_mode,
+            "messages": [{"role": "user", "content": message}],
+            "language": preferred_language,
+        },
+        config=config,
+    )
+    return response["messages"][-1].content
+
+async def chat_streaming(
+    debug_mode: bool, message, history, thread_id, preferred_language="English"
+):
+    state = {
+        "debug_mode": debug_mode,
+        "messages": (history or []) + [{"role": "user", "content": message}],
+        "language": preferred_language,
+    }
+    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 30}
+    start_time = time.time()
+    streamed_response = ""
+    final_response = ""
+    # final_node = "validator"
+
+    MAX_CONTENT = 500
+
+    try:
+        node_tree = ["🚩", "<span class='spinner'> </span>"]
+
+        tool_calls_buffer = {}
+
+        async for msg, metadata in graph.astream(
+            state, config=config, stream_mode="messages"
+        ):
+            node = metadata.get("langgraph_node", "?")
+            name = getattr(msg, "name", "-")
+            if not isinstance(msg, ToolMessage):
+                node_icon = "🧠"
+            else:
+                node_icon = "⚙️"
+            node_label = f"{node}"
+            tool_label = f"{name or ''}"
+            if tool_label:
+                node_label = node_label + f":{tool_label}"
+            label = f"{node_icon} {node_label}"
+            tooltip = ""
+            if isinstance(msg, ToolMessage):
+                tooltip = get_args_for_toolcall(tool_calls_buffer, msg.tool_call_id)
+                # logger.info("tooltip = ", tooltip)
+
+            # checking for -2, last but one, since the last entry is always a spinner
+            if node_tree[-2] != label:
+                add_node_to_tree(node_tree, label, tooltip)
+            full: str = escape(msg.content)
+            truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
+
+            def generate_processing_message():
+                return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
+
+            if (
+                not isinstance(msg, ToolMessage)
+                and not isinstance(msg, SystemMessage)
+                and not isinstance(msg, AIMessageChunk)
+            ):
+                logger.info("msg = %s", msg)
+            if isinstance(msg, ToolMessage):
+                logger.debug("tool message = %s", msg)
+
+                html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
+                yield f"### {' → '.join(node_tree)}\n{html}"
+            elif isinstance(msg, AIMessageChunk):
+
+                def truncate_middle(text, front=50, back=50):
+                    if not text:
+                        return ""
+                    if len(text) <= front + back:
+                        return text
+                    return f"{text[:front]}…{text[-back:]}".replace(
+                        "\n", ""
+                    )  # remove new lines.
+
+                if not msg.content:
+                    # logger.warning("*** No Message Chunk!")
+                    yield f"### {' → '.join(node_tree)}\n{generate_processing_message()}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
+                else:
+                    # Stream intermediate messages with transparent style
+                    # if node != final_node:
+                    streamed_response += msg.content
+                    yield f"### {' → '.join(node_tree)}\n<div class='intermediate-output'>{escape(truncate_middle(streamed_response))}</div>"
+                    # else:
+                    # Buffer the final validated response instead of yielding
+                    final_response += msg.content
+
+                if msg.tool_call_chunks:
+                    for tool_call_chunk in msg.tool_call_chunks:
+                        logger.debug("*** tool_call_chunk = %s", tool_call_chunk)
+                        if tool_call_chunk["id"] is not None:
+                            tool_call_id = tool_call_chunk["id"]
+
+                            if tool_call_id not in tool_calls_buffer:
+                                tool_calls_buffer[tool_call_id] = {
+                                    "name": "",
+                                    "args_str": "",
+                                    "id": tool_call_id,
+                                    "type": "tool_call",
+                                }
+
+                        # Accumulate tool call name and arguments
+                        if tool_call_chunk["name"] is not None:
+                            tool_calls_buffer[tool_call_id]["name"] += tool_call_chunk[
+                                "name"
+                            ]
+                        if tool_call_chunk["args"] is not None:
+                            tool_calls_buffer[tool_call_id][
+                                "args_str"
+                            ] += tool_call_chunk["args"]
+            else:
+                logger.debug("message = %s %s", type(msg), msg.content[:100])
+                full: str = escape(msg.content)
+                truncated = (
+                    (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
+                )
+                html = (
+                    f"<div class='thinking-bubble'><em>🤔 {random.choice(thinking_verbs)} ...</em></div>"
+                    f"<div style='opacity: 0.1'>"
+                    f"<strong>Telling myself:</strong> {truncated or '...'}"
+                    f"</div>"
+                )
+                yield f"### {' → '.join(node_tree)}\n{html}"
+                if getattr(msg, "tool_calls", []):
+                    logger.info("ELSE::tool_calls = %s", msg.tool_calls)
+
+        node_tree[-1] = "✅"
+        end_time = time.time()
+        duration = end_time - start_time
+        final_response = (
+            f"\n{final_response}" f"\n\n⏱️ Processed in {duration:.2f} seconds"
+        )
+        buffer = f"### {' → '.join(node_tree)}\n"
+        yield buffer
+        for c in final_response:
+            buffer += c
+            yield buffer
+            await asyncio.sleep(0.0005)
+
+        logger.debug("************************************")
+        # Now, you can process the complete tool calls from the buffer
+        for tool_call_id, accumulated_tool_call in tool_calls_buffer.items():
+            # Attempt to parse arguments only if the 'args_str' isn't empty
+            if accumulated_tool_call["args_str"]:
+                try:
+                    parsed_args = json.loads(accumulated_tool_call["args_str"])
+                    logger.debug(f"Tool Name: {accumulated_tool_call['name']}")
+                    logger.debug(f"Tool Arguments: {parsed_args}")
+                except json.JSONDecodeError:
+                    logger.debug(
+                        f"Partial arguments for tool {accumulated_tool_call['name']}: {accumulated_tool_call['args_str']}"
+                    )
+    except asyncio.CancelledError:
+        logger.warning("⚠️ Request cancelled by user")
+        node_tree = end_node_tree(node_tree=node_tree)
+        yield (
+            f"### {' → '.join(node_tree)}"
+            "\n⚠️⚠️⚠️ Request cancelled by user"
+            "\nhere is what I got so far ...\n"
+            f"\n{streamed_response}"
+        )
+        # Important: re-raise if you want upstream to also know
+        # raise
+        return
+    except Exception as e:
+        logger.error("❌❌❌ Error processing request: %s", e)
+        traceback.print_exc()
+        node_tree = end_node_tree(node_tree=node_tree)
+        yield (
+            f"### {' → '.join(node_tree)}"
+            f"\n❌❌❌ Error processing request : {str(e)}"
+            "\nhere is what I got so far ...\n"
+            f"\n{streamed_response}"
+        )
+        return
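
For orientation, `chat_wrapper` is an async generator: with `debug=True` it re-yields the incremental markdown frames from `chat_streaming`, otherwise it yields one final `chat()` response. Note that importing chat_utils builds the LangGraph graph at module load via `generate_graph()`. Below is a minimal consumption sketch, assuming the Space's environment (Chroma store, model credentials) is configured; the message and thread id are illustrative, not values from the repo.

import asyncio

from chat_utils import chat_wrapper


async def main():
    last_frame = ""
    # debug=True streams intermediate node-tree frames; each yielded value
    # is the full markdown rendered so far, not just the delta.
    async for frame in chat_wrapper(
        message="show me a pasuram about surrender",  # illustrative query
        history=[],
        thread_id="demo-thread",  # hypothetical session id
        debug=True,
    ):
        last_frame = frame
    print(last_frame)  # the final rendered response


asyncio.run(main())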
config.py CHANGED
@@ -246,9 +246,10 @@ class SanatanConfig:
         {
             "name": "verse",
             "datatype": "int",
+            "is_unique": True,
             "description": (
-                "Absolute verse number or pasuram number."
-                "Use it only when a specific prabandham name is NOT mentioned in the user query."
+                "Absolute verse number or pasuram number. Each verse has a unique number."
+                # "Use it only when a specific prabandham name is NOT mentioned in the user query."
                 "For e.g. 'Give me pasuram 1176'"
             ),
         },
@@ -539,13 +540,23 @@ class SanatanConfig:
         self, collection_name: str, metadata_where_clause: MetadataWhereClause
     ):
         scripture = self.get_scripture_by_collection(collection_name=collection_name)
-        for
-
-
-
-
-
+        allowed_fields = [field["name"] for field in scripture["metadata_fields"]]
+
+        def validate_clause(clause: MetadataWhereClause):
+            # validate direct filters
+            if clause.filters:
+                for f in clause.filters:
+                    if f.metadata_field not in allowed_fields:
+                        raise Exception(
+                            f"metadata_field: [{f.metadata_field}] not allowed in collection [{collection_name}]. "
+                            f"Here are the allowed fields with their descriptions: {scripture['metadata_fields']}"
+                        )
+            # recurse into groups
+            if clause.groups:
+                for g in clause.groups:
+                    validate_clause(g)
+
+        validate_clause(metadata_where_clause)
         return True
 
     def get_embedding_for_collection(self, collection_name: str):
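
The new validation walks the clause tree recursively and rejects any filter whose field is not declared in the scripture's `metadata_fields`. A minimal sketch of the same logic outside the class, with a hypothetical allowed-field list standing in for `scripture["metadata_fields"]`:

from metadata import MetadataFilter, MetadataWhereClause

allowed_fields = ["azhwar_name", "verse"]  # hypothetical scripture config


def validate_clause(clause: MetadataWhereClause):
    # reject direct filters on undeclared fields
    if clause.filters:
        for f in clause.filters:
            if f.metadata_field not in allowed_fields:
                raise Exception(f"metadata_field: [{f.metadata_field}] not allowed")
    # recurse into nested groups
    if clause.groups:
        for g in clause.groups:
            validate_clause(g)


clause = MetadataWhereClause(
    filters=[
        MetadataFilter(
            metadata_field="pasuram_no",  # not in allowed_fields
            metadata_search_operator="$eq",
            metadata_value=1176,
        )
    ]
)
validate_clause(clause)  # raises: pasuram_no is not an allowed field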
db.py CHANGED
@@ -1,4 +1,6 @@
 import json
+import random
+from typing import Literal
 import chromadb
 import re, unicodedata
 from config import SanatanConfig
@@ -33,18 +35,120 @@ class SanatanDatabase:
             metadatas=metadatas,
         )
 
-    def
+    def fetch_random_data(
+        self,
+        collection_name: str,
+        metadata_where_clause: MetadataWhereClause = None,
+        n_results=1,
+    ):
+        # fetch all documents once
+        logger.info(
+            "getting %d random verses from [%s] | metadata_where_clause = %s",
+            n_results,
+            collection_name,
+            metadata_where_clause,
+        )
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
+        data = collection.get(
+            where=(
+                metadata_where_clause.to_chroma_where()
+                if metadata_where_clause is not None
+                else None
+            )
+        )
+        docs = data["documents"]  # list of all verse texts
+        ids = data["ids"]
+        metas = data["metadatas"]
+
+        if not docs:
+            logger.warning("No data found! - data=%s", data)
+            return chromadb.QueryResult(ids=[], documents=[], metadatas=[])
+
+        # pick k random indices
+        indices = random.sample(range(len(docs)), k=min(n_results, len(docs)))
+
+        return chromadb.QueryResult(
+            ids=[ids[i] for i in indices],
+            documents=[docs[i] for i in indices],
+            metadatas=[metas[i] for i in indices],
+        )
+
+    def search(
+        self,
+        collection_name: str,
+        query: str = None,
+        metadata_where_clause: MetadataWhereClause = None,
+        n_results=2,
+        search_type: Literal["semantic", "literal", "random"] = "semantic",
+    ):
+        logger.info(
+            "Search for [%s] in [%s]| metadata_where_clause=%s | search_type=%s | n_results=%d",
+            query,
+            collection_name,
+            metadata_where_clause,
+            search_type,
+            n_results,
+        )
+        if search_type == "semantic":
+            return self.search_semantic(
+                collection_name=collection_name,
+                query=query,
+                metadata_where_clause=metadata_where_clause,
+                n_results=n_results,
+            )
+        elif search_type == "literal":
+            return self.search_for_literal(
+                collection_name=collection_name,
+                literal_to_search_for=query,
+                metadata_where_clause=metadata_where_clause,
                 n_results=n_results,
             )
+        else:
+            # random
+            return self.fetch_random_data(
+                collection_name=collection_name,
+                metadata_where_clause=metadata_where_clause,
+                n_results=n_results,
+            )
+
+    def search_semantic(
+        self,
+        collection_name: str,
+        query: str | None = None,
+        metadata_where_clause: MetadataWhereClause | None = None,
+        n_results=2,
+    ):
+        logger.info(
+            "Vector Semantic Search for [%s] in [%s] | metadata_where_clause = %s",
+            query,
+            collection_name,
+            metadata_where_clause,
+        )
+        collection = self.chroma_client.get_or_create_collection(name=collection_name)
+        try:
+            q = query.strip() if query is not None else ""
+            if not q:
+                # fallback: fetch random verse
+                return self.fetch_random_data(
+                    collection_name=collection_name,
+                    metadata_where_clause=metadata_where_clause,
+                    n_results=n_results,
+                )
+            else:
+                response = collection.query(
+                    query_embeddings=get_embedding(
+                        [query],
+                        SanatanConfig().get_embedding_for_collection(collection_name),
+                    ),
+                    # query_texts=[query],
+                    n_results=n_results,
+                    where=(
+                        metadata_where_clause.to_chroma_where()
+                        if metadata_where_clause is not None
+                        else None
+                    ),
-            include=["metadatas","documents","distances"],
+                    include=["metadatas", "documents", "distances"],
+                )
         except Exception as e:
             logger.error("Error in search: %s", e)
             return chromadb.QueryResult(
@@ -53,42 +157,71 @@ class SanatanDatabase:
                 metadatas=[],
                 distances=[],
             )
+
         validated_response = validate_relevance_queryresult(query, response)
 
+        logger.info(
+            "status = %s | reason= %s",
+            validated_response.status,
+            validated_response.reason,
+        )
+
+        return validated_response.result
 
     def search_for_literal(
         self,
+        collection_name: str,
+        literal_to_search_for: str | None = None,
+        metadata_where_clause: MetadataWhereClause | None = None,
+        n_results=2,
     ):
         logger.info(
-            "Searching literally for [%s] in [%s]",
+            "Searching literally for [%s] in [%s] | metadata_where_clause = %s",
             literal_to_search_for,
             collection_name,
+            metadata_where_clause,
         )
+        if literal_to_search_for is None or literal_to_search_for.strip() == "":
+            logger.warning("Nothing to search literally.")
+            # raise Exception("literal_to_search_for cannot be None or empty for a literal search!")
+            return self.fetch_random_data(
+                collection_name=collection_name,
+            )
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
 
         def normalize(text):
             return unicodedata.normalize("NFKC", text).lower()
 
         # 1. Try native contains
-        response = collection.
+        response = collection.get(
+            where=(
+                metadata_where_clause.to_chroma_where()
+                if metadata_where_clause is not None
+                else None
             ),
             where_document={"$contains": literal_to_search_for},
+            limit=n_results,
         )
 
         if response["documents"] and any(response["documents"]):
-            return
+            return chromadb.QueryResult(
+                ids=response["ids"],
+                documents=response["documents"],
+                metadatas=response["metadatas"],
+            )
 
         # 2. Regex fallback (normalized)
         logger.info("⚠ No luck. Falling back to regex for %s", literal_to_search_for)
         regex = re.compile(re.escape(normalize(literal_to_search_for)))
         logger.info("regex = %s", regex)
 
-        all_docs = collection.get(
+        all_docs = collection.get(
+            where=(
+                metadata_where_clause.to_chroma_where()
+                if metadata_where_clause is not None
+                else None
+            ),
+        )
         matched_docs = []
 
         for doc_list, metadata_list, doc_id_list in zip(
@@ -135,36 +268,13 @@ class SanatanDatabase:
             if len(matched_docs) >= n_results:
                 break
 
-        return
+        return chromadb.QueryResult(
+            {
+                "documents": [[d["document"] for d in matched_docs]],
+                "ids": [[d["id"] for d in matched_docs]],
+                "metadatas": [[d["metadata"] for d in matched_docs]],
+            }
+        )
-
-    def search_by_metadata(
-        self,
-        collection_name: str,
-        query: str,
-        metadata_where_clause: MetadataWhereClause,
-        n_results=2,
-    ):
-        """Search by a metadata field inside a specific collection using a specific operator. For instance {"azhwar_name": {"$in": "Thirumangai Azhwar"}}"""
-        logger.info(
-            "Searching by metadata for [%s] in [%s] with metadata_filters=%s",
-            query,
-            collection_name,
-            metadata_where_clause,
-        )
-        collection = self.chroma_client.get_or_create_collection(name=collection_name)
-        response = collection.query(
-            query_embeddings=get_embedding(
-                [query], SanatanConfig().get_embedding_for_collection(collection_name)
-            ),
-            where=metadata_where_clause.to_chroma_where(),
-            # query_texts=[query],
-            n_results=n_results,
-        )
-        return response
 
     def count(self, collection_name: str):
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
@@ -177,12 +287,12 @@ class SanatanDatabase:
         count = self.count(collection_name=scripture["collection_name"])
         if count == 0:
            raise Exception(f"No data in collection {scripture["collection_name"]}")
-
+
     def reembed_collection_openai(self, collection_name: str, batch_size: int = 50):
        """
        Deletes and recreates a Chroma collection with OpenAI text-embedding-3-large embeddings.
        All existing documents are re-embedded and inserted into the new collection.
-
+
        Args:
            collection_name: The name of the collection to delete/recreate.
            batch_size: Number of documents to process per batch.
@@ -195,7 +305,7 @@ class SanatanDatabase:
         metadatas = old_data["metadatas"]
         ids = old_data["ids"]
         print(f"Fetched {len(documents)} documents from old collection.")
-
+
         # Step 2: Delete old collection
         # self.chroma_client.delete_collection(collection_name)
         # print(f"Deleted old collection '{collection_name}'.")
@@ -208,13 +318,17 @@ class SanatanDatabase:
             name=f"{collection_name}_openai",
             embedding_function=None,  # embeddings will be provided manually
         )
-        print(
+        print(
+            f"Created new collection '{collection_name}_openai' with embedding_dim=3072."
+        )
 
         # Step 4: Re-embed and insert documents in batches
-        for i in tqdm(
-
-
-
+        for i in tqdm(
+            range(0, len(documents), batch_size), desc="Re-embedding batches"
+        ):
+            batch_docs = documents[i : i + batch_size]
+            batch_metadatas = metadatas[i : i + batch_size]
+            batch_ids = ids[i : i + batch_size]
 
             embeddings = get_embedding(batch_docs, backend="openai")
 
@@ -222,6 +336,6 @@ class SanatanDatabase:
                 ids=batch_ids,
                 documents=batch_docs,
                 metadatas=batch_metadatas,
-                embeddings=embeddings
+                embeddings=embeddings,
             )
-        print("All documents re-embedded and added to new collection successfully!")
+        print("All documents re-embedded and added to new collection successfully!")
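
The old semantic/metadata/literal trio is now a single `search()` dispatcher keyed by `search_type`, with metadata filtering available on every path and a random fallback when the query is empty. A hedged usage sketch; the collection name and query are illustrative, and `SanatanDatabase` is assumed here to take no constructor arguments:

from db import SanatanDatabase
from metadata import MetadataFilter, MetadataWhereClause

db = SanatanDatabase()

# Semantic search constrained to one verse number (the "verse" field is
# declared in config.py for this kind of collection).
where = MetadataWhereClause(
    filters=[
        MetadataFilter(
            metadata_field="verse",
            metadata_search_operator="$eq",
            metadata_value=1176,
        )
    ]
)
hits = db.search(
    collection_name="divya_prabandham",  # hypothetical collection name
    query="pasuram 1176",
    metadata_where_clause=where,
    n_results=1,
    search_type="semantic",
)

# With no usable query, search_semantic falls back to fetch_random_data;
# "random" can also be requested explicitly.
random_verse = db.search(collection_name="divya_prabandham", search_type="random")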
metadata.py CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import Literal, Union, List, Dict
+from typing import Literal, Optional, Union, List, Dict
 
 
 AllowedOps = Literal["$in", "$eq", "$gt", "$gte", "$lt", "$lte", "$ne", "$nin"]
@@ -11,19 +11,47 @@ class MetadataFilter(BaseModel):
     metadata_value: Union[str, int, float, List[Union[str, int, float]]]
 
 
+# class MetadataWhereClause(BaseModel):
+#     filters: List[MetadataFilter]
+#     conditional_operator: Literal["$and", "$or"] = "$and"
+
+#     def to_chroma_where(self) -> Dict:
+#         """Convert list of MetadataFilter into Chroma-compatible where clause with AND logic."""
+#         if not self.filters:
+#             return {}
+#         if len(self.filters) == 1:
+#             f = self.filters[0]
+#             return {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
+#         return {
+#             self.conditional_operator: [
+#                 {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
+#                 for f in self.filters
+#             ]
+#         }
+
 class MetadataWhereClause(BaseModel):
-    filters: List[MetadataFilter]
-    conditional_operator: Literal["$and", "$or"] = "$and"
+    filters: Optional[List["MetadataFilter"]] = None
+    groups: Optional[List["MetadataWhereClause"]] = None
+    conditional_operator: Literal["$and", "$or"] = "$and"
 
     def to_chroma_where(self) -> Dict:
-        """Convert list of MetadataFilter into Chroma-compatible where clause with AND logic."""
-        if not self.filters:
-            return {}
-        if len(self.filters) == 1:
-            f = self.filters[0]
-            return {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
-        return {
-            self.conditional_operator: [
-                {f.metadata_field: {f.metadata_search_operator: f.metadata_value}}
-                for f in self.filters
-            ]
-        }
+        parts = []
+
+        # Handle direct filters
+        if self.filters:
+            for f in self.filters:
+                parts.append({f.metadata_field: {f.metadata_search_operator: f.metadata_value}})
+
+        # Handle nested groups
+        if self.groups:
+            for g in self.groups:
+                parts.append(g.to_chroma_where())
+
+        if not parts:
+            return {}
+
+        if len(parts) == 1:
+            return parts[0]
+
+        # More than one part → wrap with conditional operator
+        return {self.conditional_operator: parts}
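
With `groups`, a clause tree now flattens into Chroma's nested `$and`/`$or` syntax. A small sketch (field names are illustrative):

from metadata import MetadataFilter, MetadataWhereClause

clause = MetadataWhereClause(
    conditional_operator="$and",
    filters=[
        MetadataFilter(
            metadata_field="azhwar_name",
            metadata_search_operator="$eq",
            metadata_value="Thirumangai Azhwar",
        )
    ],
    groups=[
        MetadataWhereClause(
            conditional_operator="$or",
            filters=[
                MetadataFilter(
                    metadata_field="verse",
                    metadata_search_operator="$lt",
                    metadata_value=100,
                ),
                MetadataFilter(
                    metadata_field="verse",
                    metadata_search_operator="$gt",
                    metadata_value=1000,
                ),
            ],
        )
    ],
)

print(clause.to_chroma_where())
# {'$and': [{'azhwar_name': {'$eq': 'Thirumangai Azhwar'}},
#           {'$or': [{'verse': {'$lt': 100}}, {'verse': {'$gt': 1000}}]}]}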
modules/db/relevance.py CHANGED
@@ -1,4 +1,11 @@
 from chromadb.api.types import QueryResult
+from dataclasses import dataclass
+
+@dataclass
+class ValidationOutcome:
+    status: str
+    reason: str
+    result: QueryResult
 
 def validate_relevance_queryresult(query: str, result: QueryResult, max_distance: float = 0.35):
     """
@@ -20,24 +27,24 @@ def validate_relevance_queryresult(query: str, result: QueryResult, max_distance
     distances = result.get("distances", [])
 
     if not documents:
-        return {
+        return ValidationOutcome(**{
             "status": "not_found",
             "reason": "No results",
             "result": result
-        }
+        })
 
     # distances can be List[List[float]]; get the first distance of the first result
     best_distance = distances[0][0] if distances and isinstance(distances[0], list) else (distances[0] if distances else float('inf'))
 
     if best_distance > max_distance:
-        return {
+        return ValidationOutcome(**{
             "status": "not_relevant",
             "reason": f"Best distance {best_distance:.4f} > {max_distance}",
             "result": result
-        }
+        })
 
-    return {
+    return ValidationOutcome(**{
         "status": "ok",
         "reason": "Relevant",
        "result": result
-    }
+    })
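
Callers that previously indexed into a dict now read attributes, which is what the new logging in `db.search_semantic` does with `.status`, `.reason`, and `.result`. A brief consumer sketch; the wrapper function here is hypothetical:

from chromadb.api.types import QueryResult

from modules.db.relevance import validate_relevance_queryresult


def relevant_or_empty(query: str, response: QueryResult) -> QueryResult:
    outcome = validate_relevance_queryresult(query, response)
    # outcome.status is "ok", "not_relevant", or "not_found"
    if outcome.status != "ok":
        print(f"validator says: {outcome.reason}")
    return outcome.result  # the underlying QueryResult in every case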
modules/nodes/chat.py CHANGED
@@ -8,15 +8,11 @@ from tools import (
     tool_search_web,
     tool_push,
     tool_get_standardized_azhwar_names,
-    tool_search_db_by_metadata,
     tool_get_standardized_divya_desam_names,
-    tool_search_db_for_literal,
 )
 
 tools = [
-    tool_search_db_by_metadata,
     tool_search_db,
-    tool_search_db_for_literal,
     tool_get_standardized_azhwar_names,
     tool_get_standardized_prabandham_names,
     tool_get_standardized_divya_desam_names,
modules/nodes/init.py CHANGED
@@ -27,26 +27,7 @@ def init_system_prompt_node(state: ChatState) -> ChatState:
             content=f"Here is the list of all scriptures along with their metadata configurations:\n{json.dumps(scriptures, separators=(',', ':'))}\n"
         ),
         SystemMessage(
-            content="""
-You have access to three scripture search tools. You MUST follow these rules when choosing a tool:
-
-1. **tool_search_db_by_metadata** – Use this when the user explicitly provides metadata criteria such as:
-   - Specific azhwar name
-   - Prabandham or prabandham code
-   - Verse number or decade number
-   - Divya desam name
-   ⚠️ Always call the corresponding standardization tool first.
-   - "If the user asks for a specific azhwar, use `tool_get_standardized_azhwar_names` first."
-   - "If the user asks for a specific prabandham, use `tool_get_standardized_prabandham_names` first."
-   - "If the user mentions a divya desam, use `tool_get_standardized_divya_desam_names` first."
-
-
-2. **tool_semantic_vector_search** – Use this when the user asks about themes, ideas, emotions, or meanings without explicit verse numbers or metadata.
-
-3. **tool_search_db_by_literal_text** – Use this only if the user explicitly requests an exact phrase match.
-
-Never call a tool repeatedly with the same arguments. Stop if results don’t change meaningfully.
-"""
+            content="""The tools are deterministic. Calling the same tool multiple times with the same arguments is not going to yield different results, so NEVER call a tool twice or more with the same arguments. Stop if results don’t change meaningfully."""
         ),
         SystemMessage(
             content=f"""
@@ -56,28 +37,51 @@ You are a knowledgeable assistant for *{{collection_name}}*.
 Languages: Sanskrit, Tamil, and {state['language']}.
 Use **only** the verses and notes retrieved from the context. Never fabricate or import external knowledge.
 
+In the context, if there is a variable called `html_url`, then use that directly for `reference_link`. If not, look for `video_id` and use that to construct the YouTube URL using https://www.youtube.com/watch?v={{video_id}} and store it under `reference_link`.
+RULE:
+- If the user asks for "one verse", "any verse", "show me a verse", or similar, always return exactly ONE verse.
+- Do not return multiple verses.
+- Only return multiple verses if the user explicitly asks for more than one.
+In the header at the end, for the field `verse_or_page`, show the `verse` or `page`, whichever is available in the context, and mention Verse `verse` or Page `page` as the case may be.
+
 ---
 
-### ✅ Default Response Format (always include)
+### ✅ Default Response Format (always include unless it is a follow-up question and/or the user requests specific details only)
 
-###
+### 🕉️ Scripture
+- Show `collection` if available; else skip the entire section including the header.
 
-### 📜
-- Show
+### 📜 Divya Desam
+- Show `divya_desams` if available; else skip the entire section including the header.
+
+### 📜 Author
+- Show `author` if available; else skip the entire section including the header.
+
+### 📜 Verse Number
+- Show the verse number if available; else skip the entire section including the header. If available, show this as a hyperlink with `html_url` if html_url is available.
+
+### 📜 Title
+- Show `title` if available; else skip the entire section including the header.
+
+### 📜 Page
+- Show `page` if available; else skip the entire section including the header.
+
+### 📜 Original Verse
+- Show the exact original native-script verses from the context in a separate markdown block.
 - Do not translate, transliterate, or explain.
 - Preserve line breaks and spacing exactly.
 
 ### 📜 Sanitized Verse(s)
-- Only include this section if sanitization changes anything.
+- Only include this section if sanitization changes anything; otherwise don't even output the section heading.
 - Sanitize by:
-  1. Fixing garbled Unicode characters.
+  1. Fixing garbled Unicode characters in the original verse section.
   2. Correcting broken diacritics, pulli markers, vowel signs, and punctuation.
   3. Preserving original spacing and line order.
-- If no change → skip this section entirely.
+- If no change → skip this section entirely, including the heading.
 
 ### 📜 {state['language']} – Simple Meaning
 - Give a **short, natural summary/meaning** in {state['language']}.
-- Keep it concise and error-free.
+- Keep it concise and error-free. Do not give word-by-word meanings here even if available.
 
 ### 🔮 Next Steps
 End with a short list of follow-up prompts:
@@ -97,15 +101,19 @@ End with a short list of follow-up prompts:
 #### 📜 Transliteration
 - Provide verse transliteration in {state['language']} if requested.
 
-#### 📜 Word-by-Word Meaning
+#### 📜 Word-by-Word Meaning (English)
 - Provide WBW meaning in English or {state['language']} if requested.
 
+#### 📜 Word-by-Word Meaning ({state['language']})
+- Provide WBW meaning in {state['language']} if requested.
+
 #### 📜 Detailed Notes / Purport
 - Summarize and translate explanatory notes/purports if present in context.
 
 ---
 
 ⚠️ Rules:
+- For a follow-up question, if the user does not specify a context in the question, assume it is for the verse returned by the previous response. For e.g., "word by word meaning" implies that the user wants to know "the word by word meaning for the above pasuram".
 - Do not duplicate content across sections.
 - Do not invent verses, meanings, or purports.
 - If no context found → reply in {state['language']}:
@@ -127,7 +135,7 @@ End with a short list of follow-up prompts:
             )
         )
     )
-    state["initialized"] = True
 
     state["tool_calls"] = 0
     state["seen_tool_calls"] = set()
|
| 117 |
- Do not duplicate content across sections.
|
| 118 |
- Do not invent verses, meanings, or purports.
|
| 119 |
- If no context found → reply in {state['language']}:
|
|
|
|
| 135 |
)
|
| 136 |
)
|
| 137 |
)
|
| 138 |
+
state["initialized"] = True
|
| 139 |
|
| 140 |
state["tool_calls"] = 0
|
| 141 |
state["seen_tool_calls"] = set()
|
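The `reference_link` rule added to this prompt is easier to check as code. A minimal sketch, assuming the retrieved context behaves like a plain dict (this helper is illustrative only and not part of the commit):

```python
def build_reference_link(ctx: dict) -> str | None:
    """Mirror the prompt rule: prefer html_url, else derive a YouTube URL from video_id."""
    if ctx.get("html_url"):
        return ctx["html_url"]
    if ctx.get("video_id"):
        return f"https://www.youtube.com/watch?v={ctx['video_id']}"
    return None
```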
nalayiram_helper.py
CHANGED

@@ -20,7 +20,7 @@ def get_standardized_prabandham_names() -> list[Pasuram]:

     return final_azhwars

-def get_standardized_azhwar_names() -> list[Pasuram]:
+def get_standardized_azhwar_names() -> list[str]:
     """
    Get a list of azhwar names along with the pasurams they have authored in divya_prabandham
    """

@@ -28,12 +28,12 @@ def get_standardized_azhwar_names() -> list[Pasuram]:
    azhwars = json.load(f)
    header = azhwars[0]
    rows = azhwars[1:]
-    final_azhwars = [
+    final_azhwars = [row[1] for row in rows]  ## 2nd field is the azhwar name

-    return final_azhwars
+    return sorted(set(final_azhwars))


-def get_standardized_divya_desam_names() -> list[dict]:
+def get_standardized_divya_desam_names() -> list[str]:
    """
    Get a list of divya desam names in divya_prabandham
    """

@@ -52,7 +52,8 @@ def get_standardized_divya_desam_names() -> list[dict]:
        "sampradayam",
        "divya_desam",
    ]
-
+    data = [{key: row[key] for key in selected_fields if key in row} for row in divya_desams["pageProps"]["hits"]]
+    return sorted(set([row["title"] for row in data]))


 if __name__ == "__main__":
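Both helpers now return sorted, de-duplicated plain-string lists, which is the shape the standardization tools hand to the agent. A quick usage sketch (illustrative; assumes the script is run from the project root so the underlying JSON data files resolve):

```python
from nalayiram_helper import (
    get_standardized_azhwar_names,
    get_standardized_divya_desam_names,
)

# Each call returns a sorted list of unique names suitable for exact matching.
azhwars = get_standardized_azhwar_names()
divya_desams = get_standardized_divya_desam_names()
print(len(azhwars), azhwars[:3])
print(len(divya_desams), divya_desams[:3])
```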
sanatan_assistant.py
CHANGED

@@ -104,96 +104,52 @@ Respond in **Markdown** format only. Ensure Sanskrit/Tamil verses are always cle
     return prompt


-def query(
-    """
-    Search a scripture collection.
-
-    Parameters:
-    - collection_name (str): The name of the scripture collection to search. ...
-    - query (str): The search query.
-    - n_results (int): Number of results to return. Default is 3.
-
-    Returns:
-    - A list of matching results.
-    """
-    logger.info("Semantic Search: Searching collection [%s] for [%s]", collection_name, query)
-    response = sanatanDatabase.search(
-        collection_name=collection_name, query=query, n_results=n_results
-    )
-
-    return "\n\n".join(
-        f"Document: {doc}\nMetadata: {meta}\nID: {id_}"
-        for doc, meta, id_ in zip(
-            response["documents"], response["metadatas"], response["ids"]
-        )
-    )
-
-def query_by_metadata_field(
+def query(
     collection_name: allowedCollections,
-    query: str,
-    metadata_where_clause
+    query: str | None = None,
+    metadata_where_clause: MetadataWhereClause | None = None,
     n_results=3,
+    search_type: Literal["semantic", "literal", "random"] = "semantic",
 ):
     """
-    Search a scripture collection
+    Search a scripture collection.

     Parameters:
-    - collection_name (str): The name of the scripture collection to search. ...
-    - query (str): The search query.
-    - metadata_where_clause: the filter
-    - metadata_field (str) : The name of the metadata field. e.g. azhwar_name
-    - metadata_search_operator (str) : The search operator e.g. $eq or $in. DO NOT use $regex.
-    - metadata_value : Value to search for can be any primitive datatype like str or int (or a list[str] if metadata_search_operator = '$in'). for e.g. Thirumangai Azhwar or '2233' or 2233
+    - collection_name (str): The name of the scripture collection to search (use the exact name from the metadata configuration. ...
+    - query (str): The search query - the semantic or literal text you want to search for. If you want to perform a random search or just want to search by metadata only, it can be passed as None.
+    - metadata_where_clause: MetadataWhereClause - Set to None if no metadata filters are requested. Always set when the user mentions a specific prabandham, azhwar, or any other known field from the configuration. Example: {\"prabandham_name\": \"Thiruvaimozhi\"}. Use the `conditional_operator` to filter based on $and or $or conditions. Use `groups` to combine multiple queries into one.
     - n_results (int): Number of results to return. Default is 3.
+    - search_type: can be one of semantic, literal or random.

     Returns:
     - A list of matching results.
     """
-    logger.info(
-
+    logger.info(
+        "%s Search: collection [%s] for [%s] | metadata_where_clause=%s",
+        search_type,
+        collection_name,
+        query,
+        metadata_where_clause,
+    )
+    if search_type != "random" and metadata_where_clause is None and query is None:
+        raise Exception(
+            "Invalid input: when search type is not random, either metadata_where_clause or query should be provided"
+        )
     try:
-
+        if metadata_where_clause is not None:
+            sanatanConfig.is_metadata_field_allowed(
+                collection_name=collection_name,
+                metadata_where_clause=metadata_where_clause,
+            )
     except:
         raise

-    response = sanatanDatabase.
+    response = sanatanDatabase.search(
         collection_name=collection_name,
         query=query,
         metadata_where_clause=metadata_where_clause,
         n_results=n_results,
-
-
-    return "\n\n".join(
-        f"Document: {doc}\nMetadata: {meta}\nID: {id_}"
-        for doc, meta, id_ in zip(
-            response["documents"], response["metadatas"], response["ids"]
-        )
-    )
-
-
-def query_by_literal_text(
-    collection_name: allowedCollections,
-    literal_to_search_for: str,
-    n_results=3,
-):
-    """
-    Search a scripture collection by a literal. Do NOT use this for semantic search. Only use when the user specifically asks for literal search.
-
-    Parameters:
-    - collection_name (str): The name of the scripture collection to search. ...
-    - literal_to_search_for (str): The search query.
-    - n_results (int): Number of results to return. Default is 3.
-
-    Returns:
-    - A list of matching results.
-    """
-    logger.info("Performing literal search in collection [%s] for [%s]", collection_name, literal_to_search_for)
-
-
-    response = sanatanDatabase.search_for_literal(
-        collection_name=collection_name,
-        literal_to_search_for=literal_to_search_for,
-        n_results=n_results,
+        search_type=search_type,
     )

     return "\n\n".join(
server.py
CHANGED

@@ -6,7 +6,7 @@ from fastapi import APIRouter, Request
 from fastapi.responses import JSONResponse
 import pycountry
 from pydantic import BaseModel
-from
+from chat_utils import chat
 from config import SanatanConfig
 from db import SanatanDatabase
tests/test_config.py
ADDED

@@ -0,0 +1,51 @@
+# Example test questions
+TEST_QUESTIONS = [
+    {
+        "q": "one pasuram on thirukudandai and another from srirangam both written by thirumangai azhwar",
+        "type": "composite",
+        "difficulty": "complex",
+        "expected_answer_summary": "Should return one pasuram from Thirukudanthai and another from Srirangam, both authored by Thirumangai Azhwar.",
+        "expected_sources": ["Thirukudanthai", "Srirangam"],
+        "expected_azhwar": ["Thirumangai Azhwar"],
+        "n_results": 2,
+    },
+    {
+        "q": "give me 2 pasurams, one written by thirumazhisai alwar and the other by thirumangai azhwar, both written on divya desam Srirangam",
+        "type": "composite",
+        "difficulty": "complex",
+        "expected_answer_summary": "Should return two pasurams on Srirangam: one by Thirumazhisai Azhwar and the other by Thirumangai Azhwar.",
+        "expected_sources": ["Srirangam"],
+        "expected_azhwar": ["Thirumazhisai Azhwar", "Thirumangai Azhwar"],
+        "n_results": 2,
+    },
+    {
+        "q": "a pasuram from nanmugan thiruvandhadhi that talks about Krishna playing flute",
+        "type": "semantic",
+        "difficulty": "medium",
+        "expected_answer_summary": "Should return 1 pasuram from Nanmukan Thiruvanthathi.",
+        "expected_sources": ["Nanmukan Thiruvanthathi"],
+        "expected_azhwar": ["Thirumazhisai Azhwar"],
+        "expected_topics": ["Krishna", "Flute"],
+        "n_results": 1,
+    },
+    {
+        "q": "varaha avatar in nanmugan thiruvandhadhi",
+        "type": "semantic",
+        "difficulty": "medium",
+        "expected_answer_summary": "Should return 1 pasuram from Nanmukan Thiruvanthathi.",
+        "expected_sources": ["Nanmukan Thiruvanthathi"],
+        "expected_azhwar": ["Thirumazhisai Azhwar"],
+        "expected_keywords": ["boar"],
+        "n_results": 1,
+    },
+    {
+        "q": "varaha avatar in nanmugan thiruvandadhi and perumal thirumozhi",
+        "type": "semantic+composite",
+        "difficulty": "medium",
+        "expected_answer_summary": "Should return 2 pasurams. One from Nanmukan Thiruvanthathi and another from perumal thirumozhi.",
+        "expected_sources": ["Nanmukan Thiruvanthathi", "perumal thirumozhi"],
+        "expected_azhwar": ["Thirumazhisai Azhwar", "Kulasekhara Azhwar"],
+        "expected_keywords": ["boar"],
+        "n_results": 2,
+    },
+]
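Each case is a plain dict: the runner reads `q`, `type`, and `difficulty` directly, while every `expected_*` attribute is optional because the validator fetches them with `.get()` defaults. A hypothetical additional entry (not part of this commit) would look like:

```python
# Hypothetical extra case showing the minimal useful shape.
TEST_QUESTIONS.append(
    {
        "q": "any pasuram on divya desam Thiruvallikkeni",
        "type": "metadata",
        "difficulty": "simple",
        "expected_answer_summary": "Should return one pasuram on Thiruvallikkeni.",
        "expected_sources": ["Thiruvallikkeni"],
        "n_results": 1,
    }
)
```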
tests/test_evaluator.py
ADDED

@@ -0,0 +1,108 @@
+import os
+from datetime import datetime
+import openai
+import json
+
+from chat_utils import chat
+from tests.test_config import TEST_QUESTIONS
+
+def validate_with_ai(test_entry, bot_response):
+    """
+    Validator works with narrative bot responses.
+    The bot does not need to output JSON.
+    The LLM analyzes the bot response and returns a JSON validation.
+    """
+    prompt = f"""
+You are a validator AI. The user provided the following bot response:
+
+Bot Response:
+\"\"\"{bot_response}\"\"\"
+
+Expected attributes:
+- Sources: {test_entry.get('expected_sources', [])}
+- Azhwar: {test_entry.get('expected_azhwar', [])}
+- Topics: {test_entry.get('expected_topics', [])}
+- Keywords: {test_entry.get('expected_keywords', [])}
+- Number of results: {test_entry.get('n_results', 1)}
+
+Check the bot response and answer **only** in JSON with two fields:
+{{
+  "valid": true/false, // True if bot response matches the expected attributes
+  "feedback": "short explanation why it passed or failed"
+}}
+
+Do **not** ask the bot to output the JSON itself. You should parse the narrative internally and return JSON.
+"""
+    resp = openai.chat.completions.create(
+        model="gpt-5-nano",
+        messages=[{"role": "user", "content": prompt}],
+    )
+    try:
+        content = resp.choices[0].message.content
+        return json.loads(content)
+    except Exception as e:
+        return {"valid": False, "feedback": f"Validator parsing error: {e}"}
+
+def run_tests(debug_mode=False):
+    history = []
+    thread_id = "test_thread"
+
+    # Create log directory if it doesn't exist
+    log_dir = "outputs/tests"
+    os.makedirs(log_dir, exist_ok=True)
+
+    # Markdown log file with timestamp
+    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_file_path = os.path.join(log_dir, f"{run_id}.md")
+
+    # Keep track of summary
+    total_tests = len(TEST_QUESTIONS)
+    passed_tests = 0
+    results_summary = []
+
+    with open(log_file_path, "w", encoding="utf-8") as f:
+        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")
+        for idx, test in enumerate(TEST_QUESTIONS, start=1):
+            f.write(f"## Test {idx}: {test['q']}\n\n")
+            f.write(f"**Type:** {test['type']} \n")
+            f.write(f"**Difficulty:** {test['difficulty']} \n")
+            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")
+
+            print(f"\n=== Testing Question ===\n{test['q']}")
+            bot_response = chat(debug_mode, test["q"], history, thread_id)
+            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")
+
+            validation = validate_with_ai(test, bot_response)
+            f.write(f"### Validation\n- **Valid:** {validation['valid']}\n- **Feedback:** {validation['feedback']}\n\n")
+
+            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")
+
+            # Track results for summary
+            results_summary.append({
+                "question": test['q'],
+                "valid": validation['valid']
+            })
+            if validation['valid']:
+                passed_tests += 1
+
+        # Write run summary
+        failed_tests = total_tests - passed_tests
+        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
+        f.write(f"# Run Summary\n\n")
+        f.write(f"- **Total Tests:** {total_tests}\n")
+        f.write(f"- **Passed:** {passed_tests}\n")
+        f.write(f"- **Failed:** {failed_tests}\n")
+        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")
+
+        # Optional: Table of all test results
+        f.write("## Test Results Table\n\n")
+        f.write("| Test | Question | Valid |\n")
+        f.write("|------|----------|-------|\n")
+        for i, res in enumerate(results_summary, start=1):
+            valid_str = "✅" if res['valid'] else "❌"
+            f.write(f"| {i} | {res['question']} | {valid_str} |\n")
+
+    print(f"\nTest run complete. Markdown log saved to {log_file_path}")
+
+if __name__ == "__main__":
+    run_tests(debug_mode=True)
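`validate_with_ai` assumes the model replies with bare JSON; if the reply arrives wrapped in a Markdown code fence, `json.loads` raises and the case is scored as a parsing failure. A defensive parsing sketch (an assumption about model behavior, not something this commit implements):

```python
import json
import re


def parse_validator_json(content: str) -> dict:
    """Strip an optional ```json ... ``` fence before parsing the validator reply."""
    match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", content.strip(), re.DOTALL)
    if match:
        content = match.group(1)
    return json.loads(content)
```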
tools.py
CHANGED

@@ -13,8 +13,6 @@ from serperdev_helper import search as search_web
 from sanatan_assistant import (
     format_scripture_answer,
     query,
-    query_by_metadata_field,
-    query_by_literal_text,
 )

 tool_push = Tool(

@@ -25,52 +23,37 @@ allowed_collections = [s["collection_name"] for s in SanatanConfig.scriptures]

 tool_search_db = StructuredTool.from_function(
     query,
-    name="
+    name="tool_search_db",
     description=(
-        "🚫
-        "
-        "
-
+        "✅ Use this tool to fetch any data from the database."
+        "rules for metadata_where_clause:"
+        """
+        - ⚠️ Every time you include a metadata_where_clause argument, you must first call the appropriate standardization tool (tool_get_standardized_divya_desam_names, tool_get_standardized_prabandham_names, tool_get_standardized_azhwar_names). Never insert raw values directly. Even if the input already looks correct, you must still call the tool. If you fail to do this, the query will be invalid.
+        > Standardization Step 1: Call the standardization tool to get the canonical Divya Desam name.
+        |--Example:
+        |----standardized_divya_desams = tool_get_standardized_divya_desam_names()
+        |----standardized_divya_desam = look for closest match to "Thirukkudandai" in standardized_divya_desams
+        > Standardization Step 2: Use the standardized name in your DB search argument for metadata_where_clause for the field divya_desams.
+        - When choosing the collection_name argument for tool_search_db, make sure you choose the exact collection_name from the metadata configuration above.
+        - Always prefer a single tool call with composite filters rather than multiple calls.
+        - For MetadataWhereClause.filters.$.metadata_search_operator, do not use $regex as an argument. Use the semantic search option via the query argument instead.
+        - If the user posts a thematic question, do not ignore the theme when you pass `query` arguments.
+        - Use `MetadataWhereClause` recursively with `filters` and `groups` to build nested conditions.
+        """
+        "- Always set metadata filters when the user mentions a specific divya desam, prabandham, azhwar, or any other known field from the configuration. Example: {\"prabandham_name\": \"Thiruvaimozhi\"}."
+        "- Multiple metadata filters can be passed at the same time."
+        "- If passing '$in' as metadata_search_operator, the metadata_value should always be of type array. For instance {'metadata_field': 'divya_desams', 'metadata_search_operator': '$in', 'metadata_value': ['Srirangam']}"
+        "- Set metadata filters as None if no metadata filter is requested.\n"
+        "rules for search_type:"
+        "- use `random` if the user does not provide a thematic/semantic search request. For e.g. 'any pasuram' or 'any pasuram from thiruvaimozhi'"
+        "- use `semantic` if the user provides a thematic/semantic search request"
+        "- use `literal` ONLY if the user specifically requests a literal search."
+        "\n"
         f"The collection_name must be one of: {', '.join(allowed_collections)}."
     ),
 )


-tool_search_db_for_literal = StructuredTool.from_function(
-    query_by_literal_text,
-    name="tool_search_db_by_literal_text",
-    description=(
-        "🚫 Never use this tool by default."
-        " ✅ Only use this tool if the user explicitly requests a 'literal match', 'exact phrase search', or uses words like 'match exactly', 'find the exact string', 'verbatim', or 'literal text'."
-        " If the user simply asks for a verse number (e.g., verse 34, pasuram 2.3.5, sahasranamam verse 20), you must NOT use this tool — instead you must use `tool_search_db_by_metadata`."
-        " Do not fall back to this tool if semantic or metadata search seems difficult or fails — it is reserved strictly for explicit literal match requests."
-        f" The collection_name must be one of: {', '.join(allowed_collections)}."
-    ),
-)
-
-
-tool_search_db_by_metadata = StructuredTool.from_function(
-    query_by_metadata_field,
-    name="tool_search_db_by_metadata",
-    description=(
-        "Use this tool **only when the user provides explicit metadata criteria**, such as: azhwar name, pasuram number, verse number, decade, prabandham name, or divya desam name."
-        " This is not meant for general queries."
-        f" The collection_name must be one of: {', '.join(allowed_collections)}."
-        "You *MUST* ALWAYS call one of the standardization tools available to get the correct entity name before using this tool."
-        "If the user asks for a specific azhwar, use `tool_get_standardized_azhwar_names` first."
-        "If the user asks for a specific prabandham, use `tool_get_standardized_prabandham_names` first."
-        "If the user mentions a divya desam, use `tool_get_standardized_divya_desam_names` first."
-        "If you set metadata_search_operator to $in, then metadata_value must always be a list — even if it contains only a single item."
-        """🔒 Important:
-        When using the tool_get_standardized_azhwar_names, tool_get_standardized_divya_desam_names, or any similar standardization tool, you must use the standardized name exactly as returned by the tool — without modifying, reformatting, translating, or simplifying it in any way.
-        For example, if the tool returns Thirumālirum Solai, you must pass that exact string to tool_search_db_by_metadata. Do not change it to Thirumalirum Solai, Tirumalirumsolai, or anything else.
-        🔍 This is critical for the search to return results correctly.
-        🚫 Any deviation will cause the search to fail or miss results."""
-    ),
-)
-
-
 tool_search_web = Tool(
     name="search_web", description="Search the web for information", func=search_web
 )
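The composite-filter guidance is easier to see with a concrete value. An illustrative shape for the `metadata_where_clause` argument, inferred from the description above (the real `MetadataWhereClause` Pydantic model lives in the repo and may differ):

```python
# Hypothetical composite filter: pasurams by Thirumangai Azhwar on Srirangam.
# Field names follow the tool description; the actual model may differ.
composite_filter = {
    "conditional_operator": "$and",
    "filters": [
        {
            "metadata_field": "azhwar_name",
            "metadata_search_operator": "$eq",
            "metadata_value": "Thirumangai Azhwar",
        },
        {
            "metadata_field": "divya_desams",
            "metadata_search_operator": "$in",
            "metadata_value": ["Srirangam"],  # $in always takes a list
        },
    ],
}
```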