from pathlib import Path

from loguru import logger
from opik.evaluation import evaluate
from opik.evaluation.metrics import AnswerRelevance, Hallucination, Moderation

from second_brain_online import opik_utils
from second_brain_online.application.agents import agents, extract_tool_responses
from second_brain_online.config import settings

from .summary_density_heuristic import SummaryDensityHeuristic
from .summary_density_judge import SummaryDensityJudge

opik_utils.configure()


def evaluate_agent(prompts: list[str], retriever_config_path: Path) -> None:
    assert settings.COMET_API_KEY, (
        "COMET_API_KEY is not set. We need it to track the experiment with Opik."
    )

    logger.info("Starting evaluation...")
    logger.info(f"Evaluating agent with {len(prompts)} prompts.")

    def evaluation_task(x: dict) -> dict:
        """Call agentic app logic to evaluate."""

        agent = agents.get_agent(retriever_config_path=retriever_config_path)
        response = agent.run(x["input"])
        context = extract_tool_responses(agent)

        return {
            "input": x["input"],
            "context": context,
            "output": response,
        }

    # Get or create dataset
    dataset_name = "second_brain_rag_agentic_app_evaluation_dataset"
    dataset = opik_utils.get_or_create_dataset(name=dataset_name, prompts=prompts)

    # Evaluate
    agent = agents.get_agent(retriever_config_path=retriever_config_path)
    experiment_config = {
        "model_id": settings.OPENAI_MODEL_ID,
        "retriever_config_path": retriever_config_path,
        "agent_config": {
            "max_steps": agent.max_steps,
            "agent_name": agent.agent_name,
        },
    }
    scoring_metrics = [
        Hallucination(),
        AnswerRelevance(),
        Moderation(),
        SummaryDensityHeuristic(),
        SummaryDensityJudge(),
    ]

    if dataset:
        logger.info("Evaluation details:")
        logger.info(f"Dataset: {dataset_name}")
        logger.info(f"Metrics: {[m.__class__.__name__ for m in scoring_metrics]}")

        evaluate(
            dataset=dataset,
            task=evaluation_task,
            scoring_metrics=scoring_metrics,
            experiment_config=experiment_config,
            task_threads=2,
        )
    else:
        logger.error("Can't run the evaluation as the dataset items are empty.")