from pathlib import Path

from loguru import logger
from opik.evaluation import evaluate
from opik.evaluation.metrics import AnswerRelevance, Hallucination, Moderation

from second_brain_online import opik_utils
from second_brain_online.application.agents import agents, extract_tool_responses
from second_brain_online.config import settings

from .summary_density_heuristic import SummaryDensityHeuristic
from .summary_density_judge import SummaryDensityJudge

opik_utils.configure()


def evaluate_agent(prompts: list[str], retriever_config_path: Path) -> None:
    """Evaluate the agentic RAG app on the given prompts, tracking the experiment with Opik."""

    assert settings.COMET_API_KEY, (
        "COMET_API_KEY is not set. We need it to track the experiment with Opik."
    )

    logger.info("Starting evaluation...")
    logger.info(f"Evaluating agent with {len(prompts)} prompts.")

    def evaluation_task(x: dict) -> dict:
        """Call agentic app logic to evaluate."""

        agent = agents.get_agent(retriever_config_path=retriever_config_path)
        response = agent.run(x["input"])
        context = extract_tool_responses(agent)

        return {
            "input": x["input"],
            "context": context,
            "output": response,
        }

    # Get or create the evaluation dataset from the given prompts.
    dataset_name = "second_brain_rag_agentic_app_evaluation_dataset"
    dataset = opik_utils.get_or_create_dataset(name=dataset_name, prompts=prompts)

    # Build the experiment config and scoring metrics, then run the evaluation.
    agent = agents.get_agent(retriever_config_path=retriever_config_path)
    experiment_config = {
        "model_id": settings.OPENAI_MODEL_ID,
        "retriever_config_path": retriever_config_path,
        "agent_config": {
            "max_steps": agent.max_steps,
            "agent_name": agent.agent_name,
        },
    }
    scoring_metrics = [
        Hallucination(),
        AnswerRelevance(),
        Moderation(),
        SummaryDensityHeuristic(),
        SummaryDensityJudge(),
    ]

    if dataset:
        logger.info("Evaluation details:")
        logger.info(f"Dataset: {dataset_name}")
        logger.info(f"Metrics: {[m.__class__.__name__ for m in scoring_metrics]}")

        evaluate(
            dataset=dataset,
            task=evaluation_task,
            scoring_metrics=scoring_metrics,
            experiment_config=experiment_config,
            task_threads=2,
        )
    else:
        logger.error("Can't run the evaluation as the dataset items are empty.")
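

# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of how evaluate_agent could be invoked directly, assuming a
# hypothetical retriever config at configs/retriever.yaml and a small sample
# prompt list; in the real project this function is more likely called from a
# dedicated CLI or evaluation entry point.
if __name__ == "__main__":
    sample_prompts = [
        "What is RAG and why do we evaluate it?",
        "Summarize the architecture of the second brain agentic app.",
    ]
    evaluate_agent(
        prompts=sample_prompts,
        retriever_config_path=Path("configs/retriever.yaml"),  # hypothetical path
    )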