import os
import json
from datetime import datetime

import openai

from chat_utils import chat
from tests.test_config import TEST_QUESTIONS


def validate_with_ai(test_entry, bot_response):
    """
    Validator for narrative bot responses.
    The bot does not need to output JSON; the LLM analyzes the narrative
    response and returns a JSON validation verdict.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
  "valid": true/false,  // True if the bot response matches the expected attributes
  "feedback": "short explanation why it passed or failed"
}}

Do **not** ask the bot to output the JSON itself; parse the narrative internally and return only the JSON object.
"""

    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )

    try:
        content = resp.choices[0].message.content.strip()
        # Some models wrap JSON in Markdown fences; strip them before parsing.
        if content.startswith("```"):
            content = content.strip("`").removeprefix("json").strip()
        return json.loads(content)
    except Exception as e:
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}


def run_tests(debug_mode=False):
    history = []
    thread_id = "test_thread"

    # Create log directory if it doesn't exist
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # Markdown log file with timestamp
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    # Keep track of summary
    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []

    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")

        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']}  \n")
            f.write(f"**Difficulty:** {test['difficulty']}  \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)

            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            f.write(f"### Validation\n- **Valid:** {validation['valid']}\n- **Feedback:** {validation['feedback']}\n\n")
            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")

            # Track results for summary
            results_summary.append({
                "question": test['q'],
                "valid": validation['valid'],
            })
            if validation['valid']:
                passed_tests += 1

        # Write run summary
        failed_tests = total_tests - passed_tests
        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

        f.write("# Run Summary\n\n")
        f.write(f"- **Total Tests:** {total_tests}\n")
        f.write(f"- **Passed:** {passed_tests}\n")
        f.write(f"- **Failed:** {failed_tests}\n")
        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")

        # Optional: Table of all test results
        f.write("## Test Results Table\n\n")
        f.write("| Test | Question | Valid |\n")
        f.write("|------|----------|-------|\n")
        for i, res in enumerate(results_summary, start=1):
            valid_str = "✅" if res['valid'] else "❌"
            f.write(f"| {i} | {res['question']} | {valid_str} |\n")

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")


if __name__ == "__main__":
    run_tests(debug_mode=True)