Romain Fayoux
committed on
Commit
·
a0888ca
1
Parent(s):
14d6990
Used official scoring function
Browse files- eval/eval_notebook.ipynb +12 -12
- eval/scorer.py +106 -0
eval/eval_notebook.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
@@ -16,7 +16,7 @@
|
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"cell_type": "code",
|
| 19 |
-
"execution_count":
|
| 20 |
"metadata": {},
|
| 21 |
"outputs": [],
|
| 22 |
"source": [
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
-
"execution_count":
|
| 30 |
"metadata": {},
|
| 31 |
"outputs": [],
|
| 32 |
"source": [
|
|
@@ -36,14 +36,14 @@
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"cell_type": "code",
|
| 39 |
-
"execution_count":
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [
|
| 42 |
{
|
| 43 |
"name": "stderr",
|
| 44 |
"output_type": "stream",
|
| 45 |
"text": [
|
| 46 |
-
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/
|
| 47 |
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
| 48 |
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
| 49 |
"\n",
|
|
@@ -60,18 +60,18 @@
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
-
"execution_count":
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
| 67 |
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
|
| 68 |
-
"from phoenix.evals.metrics import exact_match\n",
|
| 69 |
"from evaluators import conciseness_evaluator\n",
|
|
|
|
| 70 |
"\n",
|
| 71 |
"# Define the evaluator\n",
|
| 72 |
-
"exact_match_eval = bind_evaluator(evaluator=exact_match, input_mapping= { \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 73 |
"conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 74 |
-
"
|
|
|
|
| 75 |
]
|
| 76 |
},
|
| 77 |
{
|
|
@@ -80,15 +80,15 @@
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
| 83 |
-
"results_df[\"exact_match\"] = results_df.exact_match_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
|
| 84 |
"results_df[\"conciseness\"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x[\"label\"])\n",
|
|
|
|
| 85 |
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(lambda x : \"multi_agent\" if \"managed_agents\" in x else \"llm_agent\")\n",
|
| 86 |
-
"results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"
|
| 87 |
]
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"cell_type": "code",
|
| 91 |
-
"execution_count":
|
| 92 |
"metadata": {},
|
| 93 |
"outputs": [
|
| 94 |
{
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 29,
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
|
|
| 16 |
},
|
| 17 |
{
|
| 18 |
"cell_type": "code",
|
| 19 |
+
"execution_count": 30,
|
| 20 |
"metadata": {},
|
| 21 |
"outputs": [],
|
| 22 |
"source": [
|
|
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
+
"execution_count": 31,
|
| 30 |
"metadata": {},
|
| 31 |
"outputs": [],
|
| 32 |
"source": [
|
|
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"cell_type": "code",
|
| 39 |
+
"execution_count": 32,
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [
|
| 42 |
{
|
| 43 |
"name": "stderr",
|
| 44 |
"output_type": "stream",
|
| 45 |
"text": [
|
| 46 |
+
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_36696/3107371246.py:2: SettingWithCopyWarning: \n",
|
| 47 |
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
| 48 |
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
| 49 |
"\n",
|
|
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
"cell_type": "code",
|
| 63 |
+
"execution_count": 33,
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
| 67 |
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
|
|
|
|
| 68 |
"from evaluators import conciseness_evaluator\n",
|
| 69 |
+
"from scorer import question_scorer_wrapper as question_scorer\n",
|
| 70 |
"\n",
|
| 71 |
"# Define the evaluator\n",
|
|
|
|
| 72 |
"conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 73 |
+
"question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
|
| 74 |
+
"results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[exact_match_eval, conciseness_evaluator, question_scorer_eval])\n"
|
| 75 |
]
|
| 76 |
},
|
| 77 |
{
|
|
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
|
|
|
| 83 |
"results_df[\"conciseness\"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x[\"label\"])\n",
|
| 84 |
+
"results_df[\"question_scorer\"] = results_df.question_scorer_score.apply(json.loads).apply(lambda x : x[\"score\"])\n",
|
| 85 |
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(lambda x : \"multi_agent\" if \"managed_agents\" in x else \"llm_agent\")\n",
|
| 86 |
+
"results_filtered_df = results_df[[\"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\", \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\"]]"
|
| 87 |
]
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"cell_type": "code",
|
| 91 |
+
"execution_count": 35,
|
| 92 |
"metadata": {},
|
| 93 |
"outputs": [
|
| 94 |
{
|
eval/scorer.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
import warnings
|
| 4 |
+
from phoenix.evals import create_evaluator, Score
|
| 5 |
+
|
| 6 |
+
@create_evaluator(name="question_scorer")
def question_scorer_wrapper(output: str, expected: str) -> Score:
    """Phoenix evaluator adapter around the official ``question_scorer``.

    Wraps the boolean exact-match verdict in a phoenix ``Score`` so it can
    be used with ``bind_evaluator`` / ``async_evaluate_dataframe``.

    Parameters:
    - output: the model's answer (mapped from the trace output value)
    - expected: the ground-truth answer

    Returns:
    - Score whose ``score`` is 1.0 when the answer matches, 0.0 otherwise
    """
    # NOTE: the original annotated the return as ``bool``, but a ``Score``
    # object is what is actually produced and consumed downstream.
    correct = question_scorer(output, expected)
    return Score(score=float(correct))
|
| 10 |
+
|
| 11 |
+
def normalize_number_str(number_str: str) -> float:
    """Parse a numeric answer string into a float.

    Dollar signs, percent signs, and thousands-separator commas are
    stripped before conversion.  A string that still cannot be parsed
    maps to ``inf`` so it can never equal a finite ground-truth number.
    """
    # Strip the common formatting characters one by one.
    for token in ("$", "%", ","):
        number_str = number_str.replace(token, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def split_string(
|
| 24 |
+
s: str,
|
| 25 |
+
char_list: list[str] = [",", ";"],
|
| 26 |
+
) -> list[str]:
|
| 27 |
+
pattern = f"[{''.join(char_list)}]"
|
| 28 |
+
return re.split(pattern, s)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Exact-match scorer comparing a model answer against a ground truth.

    The comparison mode is chosen from the *ground truth*:
    - numeric ground truth: the answer is normalized to a float and compared;
    - comma/semicolon-separated ground truth: compared element-wise as a list;
    - otherwise: compared as whitespace/punctuation-normalized strings.

    Parameters:
    - model_answer: the model's answer (``None`` is treated as the string "None")
    - ground_truth: the reference answer

    Returns:
    - True when the answer matches the ground truth under the chosen mode.
    """
    # Local helper: can this string be parsed as a float?
    def is_float(element: str) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False

    # Guard against a missing answer so string operations below don't fail.
    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        # normalize_number_str returns inf on parse failure, so a bad
        # answer can never equal a finite ground truth.
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        # All elements must match for the list answer to count as correct.
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def normalize_str(input_str, remove_punct=True) -> str:
    """
    Canonicalize a string for comparison by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase

    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)

    Returns:
    - str, the normalized string
    """
    # Collapse every whitespace character, e.g. "sea gull" -> "seagull".
    lowered = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return lowered
    # Delete all ASCII punctuation in a single C-level translate pass.
    return lowered.translate(str.maketrans("", "", string.punctuation))
|