from typing import Any

from opik.evaluation.metrics import base_metric, score_result


class SummaryDensityHeuristic(base_metric.BaseMetric):
    """
    A metric that evaluates whether an LLM's output has an appropriate length.

    This metric uses a heuristic based purely on the character count of the
    output. It returns a continuous, normalized score between 0 and 1:

    - 1.0 (Excellent): the length lies within ``[min_length, max_length]``
      (both bounds inclusive).
    - 0.5 up to (but excluding) 1.0 (Good): the length is slightly outside
      the range, i.e. it deviates from the violated bound by at most 50% of
      that bound.
    - below 0.5 (Poor): the length is significantly outside the range. The
      score reaches 0.0 for an empty output (when ``min_length`` is positive)
      or for an output at least twice as long as ``max_length``.

    Args:
        name: The name reported in the resulting ``ScoreResult``.
        min_length: The minimum ideal output length, in characters.
        max_length: The maximum ideal output length, in characters.
    """

    def __init__(
        self,
        name: str = "summary_density_heuristic",
        min_length: int = 128,
        max_length: int = 1024,
    ) -> None:
        # NOTE(review): BaseMetric.__init__ is not invoked here; confirm
        # whether the installed Opik version expects subclasses to call it.
        self.name = name
        self.min_length = min_length
        self.max_length = max_length

    def score(
        self, input: str, output: str, **ignored_kwargs: Any
    ) -> score_result.ScoreResult:
        """
        Score the output of an LLM by its length.

        Args:
            input: The input prompt given to the LLM. It is accepted for
                interface compatibility but not used by the heuristic.
            output: The output of an LLM to score.
            **ignored_kwargs: Any additional keyword arguments (ignored).

        Returns:
            ScoreResult: The computed score together with a human-readable
            explanation that includes the measured output length.
        """
        length_score = self._compute_length_score(output)

        reason = f"Output length: {len(output)} chars. "
        if length_score == 1.0:
            reason += "Length is within ideal range."
        elif length_score >= 0.5:
            reason += "Length is slightly outside ideal range."
        else:
            reason += "Length is significantly outside ideal range."

        return score_result.ScoreResult(
            name=self.name,
            value=length_score,
            reason=reason,
        )

    def _compute_length_score(self, text: str) -> float:
        """
        Compute a score from the text length relative to the configured bounds.

        Args:
            text: The text to evaluate.

        Returns:
            float: A score between 0 and 1:
                - 1.0 when the length is within ``[min_length, max_length]``.
                - ``1.0 - deviation`` otherwise, floored at 0.0, where
                  ``deviation`` is the distance to the violated bound divided
                  by that bound.
                - 0.0 when the text exceeds a non-positive ``max_length``.
        """
        length = len(text)

        # Anything inside the inclusive range gets a perfect score.
        if self.min_length <= length <= self.max_length:
            return 1.0

        if length < self.min_length:
            # len() is never negative, so min_length is strictly positive
            # here and the division is safe.
            deviation = (self.min_length - length) / self.min_length
        elif self.max_length <= 0:
            # A relative overshoot is undefined for a zero bound (it would
            # raise ZeroDivisionError) and would push the score above 1.0
            # for a negative one, so treat the text as maximally too long.
            return 0.0
        else:
            deviation = (length - self.max_length) / self.max_length

        # deviation <= 0.5 -> score between 0.5 and 1.0
        # deviation >  0.5 -> score between 0.0 and 0.5
        return max(0.0, 1.0 - deviation)