Commit ee5875c
1 Parent(s): 3445f6a

flatten results for dataset
evaluation_logic.py CHANGED (+14 -18)
@@ -58,33 +58,29 @@ def save_evaluation(inference_api, model_name, prompt_format, metrics):
     evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
     evaluation_folder.mkdir(parents=True, exist_ok=True)
 
-    # Extract
+    # Extract and flatten the category-specific execution metrics
     categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
-    simplified_metrics = {}
+    flattened_metrics = {
+        "inference_api": inference_api,
+        "model_name": model_name,
+        "prompt_format": prompt_format,
+        "timestamp": datetime.now().isoformat()
+    }
 
+    # Flatten each category's metrics into separate columns
     for category in categories:
         if category in metrics['exec']:
             category_metrics = metrics['exec'][category]
-            simplified_metrics[category] = {
-                'count': category_metrics['count'],
-                'execution_accuracy': category_metrics['exec']
-            }
+            flattened_metrics[f"{category}_count"] = category_metrics['count']
+            flattened_metrics[f"{category}_execution_accuracy"] = category_metrics['exec']
         else:
-            simplified_metrics[category] = {
-                'count': 0,
-                'execution_accuracy': 0.0
-            }
+            flattened_metrics[f"{category}_count"] = 0
+            flattened_metrics[f"{category}_execution_accuracy"] = 0.0
 
     with evaluation_scheduler.lock:
         with evaluation_file.open("a") as f:
-            json.dump(
-                {
-                    "model_name": model_name,
-                    "prompt_format": prompt_format,
-                    "category_metrics": simplified_metrics,
-                    "timestamp": datetime.now().isoformat()
-                }, f)
-            f.write('\n')
+            json.dump(flattened_metrics, f)
+            f.write('\n')
 
 def run_prediction(inference_api, model_name, prompt_format, output_file):
     dataset_path = str(eval_dir / "data/dev.json")
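For illustration only, here is a sketch of the record that save_evaluation now appends as one JSON line, written as the Python dict handed to json.dump. Every value is a made-up placeholder, and the per-category keys assume metrics['exec'] supplies 'count' and 'exec' entries for each category, as the code above reads them.

# Hypothetical example of one flattened record produced by save_evaluation.
# All values are placeholders, not real evaluation results.
example_record = {
    "inference_api": "example-api",
    "model_name": "example-model",
    "prompt_format": "example-format",
    "timestamp": "2024-01-01T00:00:00",
    "easy_count": 10,
    "easy_execution_accuracy": 0.8,
    "medium_count": 10,
    "medium_execution_accuracy": 0.6,
    "hard_count": 5,
    "hard_execution_accuracy": 0.4,
    "duckdb_count": 8,
    "duckdb_execution_accuracy": 0.5,
    "ddl_count": 4,
    "ddl_execution_accuracy": 0.75,
    "all_count": 20,
    "all_execution_accuracy": 0.6,
}

Because every field is a scalar under a fixed key, each JSON line maps onto one row with one column per key when the file is loaded, which is presumably the point of "flatten results for dataset"; the previous nested category_metrics object would otherwise have ended up in a single struct-valued column.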