Romain Fayoux committed
Commit a0888ca · 1 Parent(s): 14d6990

Used official scoring function

Files changed (2)
  1. eval/eval_notebook.ipynb +12 -12
  2. eval/scorer.py +106 -0
eval/eval_notebook.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 19,
+ "execution_count": 29,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -16,7 +16,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 20,
+ "execution_count": 30,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -26,7 +26,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 21,
+ "execution_count": 31,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -36,14 +36,14 @@
  },
  {
  "cell_type": "code",
- "execution_count": 22,
+ "execution_count": 32,
  "metadata": {},
  "outputs": [
  {
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_98186/3107371246.py:2: SettingWithCopyWarning: \n",
+ "/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_36696/3107371246.py:2: SettingWithCopyWarning: \n",
  "A value is trying to be set on a copy of a slice from a DataFrame.\n",
  "Try using .loc[row_indexer,col_indexer] = value instead\n",
  "\n",
@@ -60,18 +60,18 @@
  },
  {
  "cell_type": "code",
- "execution_count": 29,
+ "execution_count": 33,
  "metadata": {},
  "outputs": [],
  "source": [
  "from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
- "from phoenix.evals.metrics import exact_match\n",
  "from evaluators import conciseness_evaluator\n",
+ "from scorer import question_scorer_wrapper as question_scorer\n",
  "\n",
  "# Define the evaluator\n",
- "exact_match_eval = bind_evaluator(evaluator=exact_match, input_mapping= { \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
  "conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
- "results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[exact_match_eval, conciseness_evaluator])\n"
+ "question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
+ "results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[exact_match_eval, conciseness_evaluator, question_scorer_eval])\n"
  ]
  },
  {
@@ -80,15 +80,15 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "results_df[\"exact_match\"] = results_df.exact_match_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
  "results_df[\"conciseness\"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x[\"label\"])\n",
+ "results_df[\"question_scorer\"] = results_df.question_scorer_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
  "results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(lambda x : \"multi_agent\" if \"managed_agents\" in x else \"llm_agent\")\n",
- "results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"exact_match_score\", \"conciseness_evaluator_score\", \"exact_match\", \"conciseness\"]]"
+ "results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\"]]"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 38,
+ "execution_count": 35,
  "metadata": {},
  "outputs": [
  {
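In short, the notebook swaps the generic exact_match metric for the new GAIA-style question_scorer, wrapped as a Phoenix evaluator. The following is a minimal sketch of that wiring, not a new cell from the commit: it assumes (as in the notebook cells above) that agents_merged_df already holds the agent spans joined with the ground truth in the columns "attributes.output.value" and "Final answer", and that top-level await is available as it is in a notebook.

# Sketch only -- mirrors the notebook cells above; agents_merged_df and its column names are taken from the notebook, not defined here.
import json
from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe
from scorer import question_scorer_wrapper as question_scorer

# Map dataframe columns onto the evaluator's (output, expected) arguments.
question_scorer_eval = bind_evaluator(
    evaluator=question_scorer,
    input_mapping={"output": "attributes.output.value", "expected": "Final answer"},
)

# Run the evaluator over every row of the merged spans/ground-truth dataframe.
results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[question_scorer_eval])

# As in the notebook, each result lands in a question_scorer_score column as a JSON blob;
# pull out the numeric 0/1 score.
results_df["question_scorer"] = results_df.question_scorer_score.apply(json.loads).apply(lambda x: x["score"])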
eval/scorer.py ADDED
@@ -0,0 +1,106 @@
+ import re
+ import string
+ import warnings
+ from phoenix.evals import create_evaluator, Score
+
+ @create_evaluator(name="question_scorer")
+ def question_scorer_wrapper(output: str, expected: str) -> bool:
+     correct = question_scorer(output, expected)
+     return Score(score=float(correct))
+
+ def normalize_number_str(number_str: str) -> float:
+     # we replace these common units and commas to allow
+     # conversion to float
+     for char in ["$", "%", ","]:
+         number_str = number_str.replace(char, "")
+     try:
+         return float(number_str)
+     except ValueError:
+         print(f"String {number_str} cannot be normalized to number str.")
+         return float("inf")
+
+
+ def split_string(
+     s: str,
+     char_list: list[str] = [",", ";"],
+ ) -> list[str]:
+     pattern = f"[{''.join(char_list)}]"
+     return re.split(pattern, s)
+
+
+ def question_scorer(
+     model_answer: str,
+     ground_truth: str,
+ ) -> bool:
+     def is_float(element: any) -> bool:
+         try:
+             float(element)
+             return True
+         except ValueError:
+             return False
+
+     if model_answer is None:
+         model_answer = "None"
+
+     # if gt is a number
+     if is_float(ground_truth):
+         print(f"Evaluating {model_answer} as a number.")
+         normalized_answer = normalize_number_str(model_answer)
+         return normalized_answer == float(ground_truth)
+
+     # if gt is a list
+     elif any(char in ground_truth for char in [",", ";"]):
+         print(f"Evaluating {model_answer} as a comma separated list.")
+         # question with the fish: normalization removes punct
+
+         gt_elems = split_string(ground_truth)
+         ma_elems = split_string(model_answer)
+
+         # check length is the same
+         if len(gt_elems) != len(ma_elems):
+             warnings.warn(
+                 "Answer lists have different lengths, returning False.", UserWarning
+             )
+             return False
+
+         # compare each element as float or str
+         comparisons = []
+         for ma_elem, gt_elem in zip(ma_elems, gt_elems):
+             if is_float(gt_elem):
+                 normalized_ma_elem = normalize_number_str(ma_elem)
+                 comparisons.append(normalized_ma_elem == float(gt_elem))
+             else:
+                 # we do not remove punct since comparisons can include punct
+                 comparisons.append(
+                     normalize_str(ma_elem, remove_punct=False)
+                     == normalize_str(gt_elem, remove_punct=False)
+                 )
+         return all(comparisons)
+
+     # if gt is a str
+     else:
+         print(f"Evaluating {model_answer} as a string.")
+         return normalize_str(model_answer) == normalize_str(ground_truth)
+
+
+ def normalize_str(input_str, remove_punct=True) -> str:
+     """
+     Normalize a string by:
+     - Removing all white spaces
+     - Optionally removing punctuation (if remove_punct is True)
+     - Converting to lowercase
+     Parameters:
+     - input_str: str, the string to normalize
+     - remove_punct: bool, whether to remove punctuation (default: True)
+     Returns:
+     - str, the normalized string
+     """
+     # Remove all white spaces. Required e.g for seagull vs. sea gull
+     no_spaces = re.sub(r"\s", "", input_str)
+
+     # Remove punctuation, if specified.
+     if remove_punct:
+         translator = str.maketrans("", "", string.punctuation)
+         return no_spaces.lower().translate(translator)
+     else:
+         return no_spaces.lower()
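The scorer handles three ground-truth shapes: a number (units and thousands separators stripped before comparison), a comma- or semicolon-separated list (compared element by element), and a free-form string (whitespace, case and, by default, punctuation normalized). A quick illustration with invented inputs, not taken from the repository:

# Illustrative only -- behaviour follows the branches of question_scorer above.
from scorer import question_scorer

question_scorer("$1,234", "1234")          # numeric ground truth: "$" and "," are stripped, 1234.0 == 1234.0 -> True
question_scorer("red; blue", "red, blue")  # list ground truth: split on "," or ";", compared element-wise -> True
question_scorer("Sea gull", "seagull")     # string ground truth: spaces and case removed by normalize_str -> True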