| """Metrics utilities.""" | |
| from __future__ import annotations | |
| from typing import Iterable | |
| import numpy as np | |
| from numpy.typing import ArrayLike | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| average_precision_score, | |
| brier_score_loss, | |
| f1_score, | |
| recall_score, | |
| precision_score, | |
| roc_auc_score, | |
| ) | |


def compute_metrics(
    y_true: Iterable[float],
    y_score: Iterable[float],
    *,
    threshold: float = 0.5,
) -> dict[str, float]:
    """Compute classification metrics from scores and labels."""
    y_true_arr = np.asarray(list(y_true), dtype=float)
    y_score_arr = np.asarray(list(y_score), dtype=float)
    y_pred = (y_score_arr >= threshold).astype(int)

    metrics: dict[str, float] = {}
    try:
        metrics["roc_auc"] = float(roc_auc_score(y_true_arr, y_score_arr))
    except ValueError:
        metrics["roc_auc"] = float("nan")
    try:
        metrics["pr_auc"] = float(average_precision_score(y_true_arr, y_score_arr))
    except ValueError:
        metrics["pr_auc"] = float("nan")

    metrics["accuracy"] = float(accuracy_score(y_true_arr, y_pred))
    metrics["f1"] = float(f1_score(y_true_arr, y_pred, zero_division=0))
    metrics["f1_positive"] = float(f1_score(y_true_arr, y_pred, pos_label=1, zero_division=0))
    metrics["f1_negative"] = float(f1_score(y_true_arr, y_pred, pos_label=0, zero_division=0))
    metrics["sensitivity"] = float(recall_score(y_true_arr, y_pred, zero_division=0))
    # Specificity is recall on the negative class.
    metrics["specificity"] = float(
        recall_score(1 - y_true_arr, 1 - y_pred, zero_division=0)
    )
    metrics["precision"] = float(precision_score(y_true_arr, y_pred, zero_division=0))
    metrics["positive_rate"] = float(y_true_arr.mean()) if y_true_arr.size else float("nan")
    metrics["brier"] = float(brier_score_loss(y_true_arr, y_score_arr))

    ece, mce = _calibration_errors(y_true_arr, y_score_arr)
    metrics["ece"] = float(ece)
    metrics["mce"] = float(mce)
    return metrics
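

# Illustrative usage of ``compute_metrics`` (made-up numbers, not from any real
# dataset): with y_true = [0, 1, 1, 0] and y_score = [0.2, 0.8, 0.6, 0.4], the
# default 0.5 threshold gives predictions [0, 1, 1, 0], so accuracy, roc_auc and
# sensitivity all equal 1.0 and positive_rate equals 0.5.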


def _calibration_errors(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bins: int = 15,
) -> tuple[float, float]:
    """Return (ECE, MCE) over equal-width probability bins.

    ECE is the bin-size-weighted mean absolute gap between the mean predicted
    probability and the observed positive rate in each bin; MCE is the largest
    such gap.
    """
    if y_true.size == 0:
        return float("nan"), float("nan")
    # Clamp scores to [0, 1] to avoid binning issues when calibrators overshoot.
    scores = np.clip(y_score, 0.0, 1.0)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    bin_indices = np.digitize(scores, bins[1:-1], right=True)
    total = y_true.size

    ece = 0.0
    mce = 0.0
    for bin_idx in range(n_bins):
        mask = bin_indices == bin_idx
        if not np.any(mask):
            continue
        bin_scores = scores[mask]
        bin_true = y_true[mask]
        confidence = float(bin_scores.mean())
        accuracy = float(bin_true.mean())
        gap = abs(confidence - accuracy)
        weight = float(mask.sum()) / float(total)
        ece += weight * gap
        mce = max(mce, gap)
    return ece, mce
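

# Calibration example (illustrative, made-up numbers): if every score is 1.0 but
# every label is 0, the single populated bin has mean confidence 1.0 and observed
# positive rate 0.0, so ``_calibration_errors`` returns ECE = MCE = 1.0; a
# well-calibrated score vector drives both toward 0.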


def bootstrap_metric_intervals(
    y_true: ArrayLike,
    y_score: ArrayLike,
    *,
    n_bootstrap: int = 200,
    alpha: float = 0.05,
    threshold: float = 0.5,
    random_state: int | None = 42,
) -> dict[str, dict[str, float]]:
    """Estimate bootstrap confidence intervals for core metrics.

    Parameters
    ----------
    y_true, y_score:
        Arrays of ground-truth labels and probability scores.
    n_bootstrap:
        Number of bootstrap resamples; set to ``0`` to disable.
    alpha:
        Two-sided significance level (the default ``0.05`` yields a 95% CI).
    threshold:
        Decision threshold passed to :func:`compute_metrics`.
    random_state:
        Seed controlling the bootstrap sampler.
    """
    if n_bootstrap <= 0:
        return {}
    y_true_arr = np.asarray(y_true, dtype=float)
    y_score_arr = np.asarray(y_score, dtype=float)
    n = y_true_arr.size
    if n == 0:
        return {}

    rng = np.random.default_rng(random_state)
    collected: dict[str, list[float]] = {}
    for _ in range(n_bootstrap):
        indices = rng.integers(0, n, size=n)
        resampled_true = y_true_arr[indices]
        resampled_score = y_score_arr[indices]
        # Skip degenerate resamples that contain only one class.
        if np.unique(resampled_true).size < 2:
            continue
        metrics = compute_metrics(resampled_true, resampled_score, threshold=threshold)
        for metric_name, value in metrics.items():
            collected.setdefault(metric_name, []).append(value)

    lower_q = alpha / 2.0
    upper_q = 1.0 - lower_q
    summary: dict[str, dict[str, float]] = {}
    for metric_name, values in collected.items():
        arr = np.asarray(values, dtype=float)
        valid = arr[~np.isnan(arr)]
        if valid.size == 0:
            continue
        lower = float(np.nanquantile(valid, lower_q))
        upper = float(np.nanquantile(valid, upper_q))
        median = float(np.nanmedian(valid))
        summary[metric_name] = {
            "ci_lower": lower,
            "ci_upper": upper,
            "ci_median": median,
        }
    return summary
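

if __name__ == "__main__":
    # Minimal smoke check, illustrative only: synthetic, made-up labels and
    # scores, not part of the module's public behaviour. Prints each point
    # estimate alongside its bootstrap confidence interval.
    rng = np.random.default_rng(0)
    labels = rng.integers(0, 2, size=200)
    scores = np.clip(0.6 * labels + rng.normal(0.2, 0.2, size=200), 0.0, 1.0)
    point = compute_metrics(labels, scores)
    intervals = bootstrap_metric_intervals(labels, scores, n_bootstrap=100)
    for name, value in point.items():
        ci = intervals.get(name, {})
        print(
            f"{name}: {value:.3f} "
            f"[{ci.get('ci_lower', float('nan')):.3f}, "
            f"{ci.get('ci_upper', float('nan')):.3f}]"
        )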