File size: 5,226 Bytes
3a7aaed
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
 
 
 
 
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0888ca
3a7aaed
 
 
 
 
a0888ca
3a7aaed
 
 
a0888ca
 
3a7aaed
 
 
 
 
 
 
 
 
a0888ca
3a7aaed
a0888ca
3a7aaed
 
 
 
a0888ca
3a7aaed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from phoenix.client import Client\n",
    "\n",
    "# Fetch the spans recorded under the \"default\" project as a DataFrame.\n",
    "phoenix_client = Client()\n",
    "spans_df = phoenix_client.spans.get_spans_dataframe(project_name=\"default\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the source-of-truth dataset (JSON Lines: one record per line).\n",
    "metadata_path = \"../data/metadata.jsonl\"\n",
    "dataset_df = pd.read_json(metadata_path, lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the root agent spans that completed successfully:\n",
    "# span kind is AGENT, there is no parent span, and the status code is OK.\n",
    "is_agent_span = spans_df[\"span_kind\"] == \"AGENT\"\n",
    "has_no_parent = spans_df[\"parent_id\"].isna()\n",
    "has_ok_status = spans_df[\"status_code\"] == \"OK\"\n",
    "agents_df = spans_df[is_agent_span & has_no_parent & has_ok_status]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_36696/3107371246.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  agents_df[\"task\"] = agents_df[\"attributes.input.value\"].apply(json.loads).apply(lambda x : x[\"task\"]).str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n"
     ]
    }
   ],
   "source": [
    "# Recover the question from each span's JSON input payload and join it with the\n",
    "# source-of-truth dataset so every agent span carries its matching dataset row.\n",
    "# The payload's \"task\" text may end with a \"The mentionned file can be downloaded from ...\"\n",
    "# suffix; it is stripped so the text can be matched against the dataset's \"Question\" column.\n",
    "task_text = agents_df[\"attributes.input.value\"].apply(lambda raw: json.loads(raw)[\"task\"])\n",
    "task_text = task_text.str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n",
    "# agents_df comes from a boolean-mask selection on spans_df, so assigning a column to it\n",
    "# in place triggers SettingWithCopyWarning; .assign() returns a new frame with the column.\n",
    "agents_df = agents_df.assign(task=task_text)\n",
    "# Left join: every agent span is kept even when no dataset question matches its task.\n",
    "agents_merged_df = pd.merge(agents_df, dataset_df, how=\"left\", left_on=\"task\", right_on=\"Question\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
    "from evaluators import conciseness_evaluator\n",
    "from scorer import question_scorer_wrapper as question_scorer\n",
    "\n",
    "# Bind each evaluator's inputs to dataframe columns: the span output is the value being\n",
    "# judged and the \"Final answer\" column is the expected value.\n",
    "eval_input_mapping = {\"output\": \"attributes.output.value\", \"expected\": \"Final answer\"}\n",
    "# Bound evaluators get their own names so the imported `conciseness_evaluator` is not shadowed;\n",
    "# each one receives its own copy of the mapping.\n",
    "conciseness_eval = bind_evaluator(evaluator=conciseness_evaluator, input_mapping=dict(eval_input_mapping))\n",
    "question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping=dict(eval_input_mapping))\n",
    "# NOTE: only evaluators defined in this notebook are run. `exact_match_eval` is neither imported\n",
    "# nor defined anywhere here, so listing it raises NameError on a fresh kernel.\n",
    "results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[conciseness_eval, question_scorer_eval])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the JSON-encoded evaluator results into plain columns.\n",
    "results_df[\"conciseness\"] = results_df[\"conciseness_evaluator_score\"].apply(lambda raw: json.loads(raw)[\"label\"])\n",
    "results_df[\"question_scorer\"] = results_df[\"question_scorer_score\"].apply(lambda raw: json.loads(raw)[\"score\"])\n",
    "\n",
    "# Tag a span as \"multi_agent\" when its smolagents attributes contain \"managed_agents\",\n",
    "# otherwise as \"llm_agent\".\n",
    "results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(\n",
    "    lambda smolagents_attrs: \"multi_agent\" if \"managed_agents\" in smolagents_attrs else \"llm_agent\"\n",
    ")\n",
    "\n",
    "# Select the subset of columns carried forward as results_filtered_df.\n",
    "annotation_columns = [\n",
    "    \"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\n",
    "    \"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\",\n",
    "    \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\",\n",
    "]\n",
    "results_filtered_df = results_df[annotation_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/phoenix/evals/utils.py:367: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  result_df = pd.concat(result_dfs, ignore_index=True)\n"
     ]
    }
   ],
   "source": [
    "# Upload results\n",
    "import numpy as np\n",
    "from phoenix.evals.utils import to_annotation_dataframe\n",
    "\n",
    "# Convert the filtered results to an annotation dataframe, swap NaN for None,\n",
    "# then log the annotations through the Phoenix client.\n",
    "annotation_df = to_annotation_dataframe(results_filtered_df).replace({np.nan: None})\n",
    "upload_client = Client()\n",
    "upload_client.spans.log_span_annotations_dataframe(dataframe=annotation_df)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Final_Assignment_Template",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}