from pathlib import Path

from loguru import logger
from opik.evaluation import evaluate
from opik.evaluation.metrics import AnswerRelevance, Hallucination, Moderation

from second_brain_online import opik_utils
from second_brain_online.application.agents import agents, extract_tool_responses
from second_brain_online.config import settings

from .summary_density_heuristic import SummaryDensityHeuristic
from .summary_density_judge import SummaryDensityJudge

# Configure the Opik client before any evaluation runs.
opik_utils.configure()
def evaluate_agent(prompts: list[str], retriever_config_path: Path) -> None:
    """Evaluate the agentic app on the given prompts and track the results in Opik."""
    assert settings.COMET_API_KEY, (
        "COMET_API_KEY is not set. We need it to track the experiment with Opik."
    )

    logger.info("Starting evaluation...")
    logger.info(f"Evaluating agent with {len(prompts)} prompts.")
    def evaluation_task(x: dict) -> dict:
        """Run the agent on one dataset item and collect the fields the metrics score."""
        # Build a fresh agent per call so concurrent task threads don't share state.
        agent = agents.get_agent(retriever_config_path=retriever_config_path)
        response = agent.run(x["input"])
        context = extract_tool_responses(agent)

        return {
            "input": x["input"],
            "context": context,
            "output": response,
        }
    # Get or create the evaluation dataset from the given prompts.
    dataset_name = "second_brain_rag_agentic_app_evaluation_dataset"
    dataset = opik_utils.get_or_create_dataset(name=dataset_name, prompts=prompts)
    # Build one agent up front just to capture its settings in the experiment metadata.
    agent = agents.get_agent(retriever_config_path=retriever_config_path)
    experiment_config = {
        "model_id": settings.OPENAI_MODEL_ID,
        # Store the path as a string so the config stays JSON-serializable.
        "retriever_config_path": str(retriever_config_path),
        "agent_config": {
            "max_steps": agent.max_steps,
            "agent_name": agent.agent_name,
        },
    }
    scoring_metrics = [
        Hallucination(),
        AnswerRelevance(),
        Moderation(),
        SummaryDensityHeuristic(),
        SummaryDensityJudge(),
    ]
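
    # Note: Hallucination, AnswerRelevance, and Moderation are Opik's built-in
    # metrics (imported from opik.evaluation.metrics above), while
    # SummaryDensityHeuristic and SummaryDensityJudge are this project's custom
    # metrics, presumably a deterministic heuristic and an LLM judge targeting
    # the same summary-density property.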
    if dataset:
        logger.info("Evaluation details:")
        logger.info(f"Dataset: {dataset_name}")
        logger.info(f"Metrics: {[m.__class__.__name__ for m in scoring_metrics]}")

        evaluate(
            dataset=dataset,
            task=evaluation_task,
            scoring_metrics=scoring_metrics,
            experiment_config=experiment_config,
            task_threads=2,
        )
    else:
        logger.error("Can't run the evaluation because the dataset is empty.")
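

# --- Usage sketch (illustrative; not from the original module) ---
# A minimal way to trigger the evaluation, with a hypothetical prompt list and
# retriever config path. Because this module uses relative imports, run it as a
# module (e.g. `python -m <package>.<this_module>`) rather than as a plain script.
if __name__ == "__main__":
    evaluate_agent(
        prompts=["What is RAG?", "Summarize my notes on LLM evaluation."],
        retriever_config_path=Path("configs/retriever.yaml"),  # hypothetical path
    )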