Romain Fayoux
committed on
Commit
·
a0888ca
1
Parent(s):
14d6990
Used official scoring function
Browse files- eval/eval_notebook.ipynb +12 -12
- eval/scorer.py +106 -0
eval/eval_notebook.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
@@ -16,7 +16,7 @@
|
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"cell_type": "code",
|
| 19 |
-
"execution_count":
|
| 20 |
"metadata": {},
|
| 21 |
"outputs": [],
|
| 22 |
"source": [
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
-
"execution_count":
|
| 30 |
"metadata": {},
|
| 31 |
"outputs": [],
|
| 32 |
"source": [
|
|
@@ -36,14 +36,14 @@
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"cell_type": "code",
|
| 39 |
-
"execution_count":
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [
|
| 42 |
{
|
| 43 |
"name": "stderr",
|
| 44 |
"output_type": "stream",
|
| 45 |
"text": [
|
| 46 |
-
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/
|
| 47 |
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
| 48 |
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
| 49 |
"\n",
|
|
@@ -60,18 +60,18 @@
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
-
"execution_count":
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
| 67 |
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
|
| 68 |
-
"from phoenix.evals.metrics import exact_match\n",
|
| 69 |
"from evaluators import conciseness_evaluator\n",
|
|
|
|
| 70 |
"\n",
|
| 71 |
"# Define the evaluator\n",
|
| 72 |
-
"exact_match_eval = bind_evaluator(evaluator=exact_match, input_mapping= { \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 73 |
"conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 74 |
-
"
|
|
|
|
| 75 |
]
|
| 76 |
},
|
| 77 |
{
|
|
@@ -80,15 +80,15 @@
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
| 83 |
-
"results_df[\"exact_match\"] = results_df.exact_match_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
|
| 84 |
"results_df[\"conciseness\"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x[\"label\"])\n",
|
|
|
|
| 85 |
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(lambda x : \"multi_agent\" if \"managed_agents\" in x else \"llm_agent\")\n",
|
| 86 |
-
"results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"
|
| 87 |
]
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"cell_type": "code",
|
| 91 |
-
"execution_count":
|
| 92 |
"metadata": {},
|
| 93 |
"outputs": [
|
| 94 |
{
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 29,
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
|
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"cell_type": "code",
|
| 19 |
+
"execution_count": 30,
|
| 20 |
"metadata": {},
|
| 21 |
"outputs": [],
|
| 22 |
"source": [
|
|
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
+
"execution_count": 31,
|
| 30 |
"metadata": {},
|
| 31 |
"outputs": [],
|
| 32 |
"source": [
|
|
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"cell_type": "code",
|
| 39 |
+
"execution_count": 32,
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [
|
| 42 |
{
|
| 43 |
"name": "stderr",
|
| 44 |
"output_type": "stream",
|
| 45 |
"text": [
|
| 46 |
+
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_36696/3107371246.py:2: SettingWithCopyWarning: \n",
|
| 47 |
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
| 48 |
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
| 49 |
"\n",
|
|
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
+
"execution_count": 33,
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
| 67 |
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
|
|
|
|
| 68 |
"from evaluators import conciseness_evaluator\n",
|
| 69 |
+
"from scorer import question_scorer_wrapper as question_scorer\n",
|
| 70 |
"\n",
|
| 71 |
"# Define the evaluator\n",
|
|
|
|
| 72 |
"conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 73 |
+
"question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 74 |
+
"results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[exact_match_eval, conciseness_evaluator, question_scorer_eval])\n"
|
| 75 |
]
|
| 76 |
},
|
| 77 |
{
|
|
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
|
|
|
| 83 |
"results_df[\"conciseness\"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x[\"label\"])\n",
|
| 84 |
+
"results_df[\"question_scorer\"] = results_df.question_scorer_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
|
| 85 |
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(lambda x : \"multi_agent\" if \"managed_agents\" in x else \"llm_agent\")\n",
|
| 86 |
+
"results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\"]]"
|
| 87 |
]
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"cell_type": "code",
|
| 91 |
+
"execution_count": 35,
|
| 92 |
"metadata": {},
|
| 93 |
"outputs": [
|
| 94 |
{
|
eval/scorer.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
import warnings
|
| 4 |
+
from phoenix.evals import create_evaluator, Score
|
| 5 |
+
|
| 6 |
+
@create_evaluator(name="question_scorer")
def question_scorer_wrapper(output: str, expected: str) -> Score:
    """Phoenix evaluator adapter around the official ``question_scorer``.

    Wraps the boolean exact-match verdict in a phoenix ``Score`` so it can
    be used with ``bind_evaluator`` / ``async_evaluate_dataframe``.

    Parameters:
    - output: the model's answer (mapped from the trace output value)
    - expected: the ground-truth answer

    Returns:
    - Score whose ``score`` is 1.0 when the answer matches, 0.0 otherwise
    """
    # NOTE: the original annotated the return as ``bool``, but a ``Score``
    # object is what is actually produced and consumed downstream.
    correct = question_scorer(output, expected)
    return Score(score=float(correct))
|
| 10 |
+
|
| 11 |
+
def normalize_number_str(number_str: str) -> float:
    """Parse a numeric answer string into a float.

    Dollar signs, percent signs, and thousands-separator commas are
    stripped before conversion.  A string that still cannot be parsed
    maps to ``inf`` so it can never equal a finite ground-truth number.
    """
    # Strip the common formatting characters one by one.
    for token in ("$", "%", ","):
        number_str = number_str.replace(token, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def split_string(
|
| 24 |
+
s: str,
|
| 25 |
+
char_list: list[str] = [",", ";"],
|
| 26 |
+
) -> list[str]:
|
| 27 |
+
pattern = f"[{''.join(char_list)}]"
|
| 28 |
+
return re.split(pattern, s)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Exact-match scorer comparing a model answer against a ground truth.

    The comparison mode is chosen from the *ground truth*:
    - numeric ground truth: the answer is normalized to a float and compared;
    - comma/semicolon-separated ground truth: compared element-wise as a list;
    - otherwise: compared as whitespace/punctuation-normalized strings.

    Parameters:
    - model_answer: the model's answer (``None`` is treated as the string "None")
    - ground_truth: the reference answer

    Returns:
    - True when the answer matches the ground truth under the chosen mode.
    """
    # Local helper: can this string be parsed as a float?
    def is_float(element: str) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False

    # Guard against a missing answer so string operations below don't fail.
    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        # normalize_number_str returns inf on parse failure, so a bad
        # answer can never equal a finite ground truth.
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        # All elements must match for the list answer to count as correct.
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def normalize_str(input_str, remove_punct=True) -> str:
    """
    Canonicalize a string for comparison by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase

    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)

    Returns:
    - str, the normalized string
    """
    # Collapse every whitespace character, e.g. "sea gull" -> "seagull".
    lowered = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return lowered
    # Delete all ASCII punctuation in a single C-level translate pass.
    return lowered.translate(str.maketrans("", "", string.punctuation))
|