```python
from typing import Any

from opik.evaluation.metrics import base_metric, score_result


class SummaryDensityHeuristic(base_metric.BaseMetric):
    """
    A metric that evaluates whether an LLM's output has appropriate length and density.

    This metric uses a heuristic to determine whether the output length is appropriate
    for the given instruction. It returns a normalized score between 0 and 1, where:

    - 0.0 (Poor): Output is either too short and incomplete, or too long with unnecessary information
    - 0.5 (Good): Output has a decent length balance but is still slightly too short or too long
    - 1.0 (Excellent): Output length is appropriate, answering the question concisely without being verbose
    """

    def __init__(
        self,
        name: str = "summary_density_heuristic",
        min_length: int = 128,
        max_length: int = 1024,
    ) -> None:
        self.name = name
        self.min_length = min_length
        self.max_length = max_length

    def score(
        self, input: str, output: str, **ignored_kwargs: Any
    ) -> score_result.ScoreResult:
        """
        Score the output of an LLM.

        Args:
            input: The input prompt given to the LLM.
            output: The output of the LLM to score.
            **ignored_kwargs: Any additional keyword arguments.

        Returns:
            ScoreResult: The computed score with an explanation.
        """
        length_score = self._compute_length_score(output)

        reason = f"Output length: {len(output)} chars. "
        if length_score == 1.0:
            reason += "Length is within the ideal range."
        elif length_score >= 0.5:
            reason += "Length is slightly outside the ideal range."
        else:
            reason += "Length is significantly outside the ideal range."

        return score_result.ScoreResult(
            name=self.name,
            value=length_score,
            reason=reason,
        )

    def _compute_length_score(self, text: str) -> float:
        """
        Compute a score based on text length relative to the min and max boundaries.

        Args:
            text: The text to evaluate.

        Returns:
            float: A score between 0 and 1, where:
                - 0.0: Text length is significantly outside the boundaries
                - 0.5: Text length is slightly outside the boundaries
                - 1.0: Text length is within the ideal range
        """
        length = len(text)

        # If the length is within bounds, return a perfect score.
        if self.min_length <= length <= self.max_length:
            return 1.0

        # Relative deviation from the nearest violated boundary.
        if length < self.min_length:
            deviation = (self.min_length - length) / self.min_length
        else:
            deviation = (length - self.max_length) / self.max_length

        # Convert the deviation to a score between 0 and 1:
        #   deviation <= 0.5 -> score between 0.5 and 1.0
        #   deviation >  0.5 -> score between 0.0 and 0.5
        return max(0.0, 1.0 - deviation)
```
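As a quick sanity check, here is a minimal sketch of calling the metric directly. The outputs are synthetic strings sized to hit each scoring band under the default bounds of 128 to 1024 characters:

```python
# Minimal usage sketch; the outputs are synthetic strings, not real summaries.
metric = SummaryDensityHeuristic()

short_out = "x" * 64    # deviation = (128 - 64) / 128 = 0.5       -> score 0.5
ideal_out = "x" * 512   # within [128, 1024]                       -> score 1.0
long_out = "x" * 2048   # deviation = (2048 - 1024) / 1024 = 1.0   -> score 0.0

for output in (short_out, ideal_out, long_out):
    result = metric.score(input="Summarize the article.", output=output)
    print(f"{result.value:.2f} - {result.reason}")
```

Because the deviation is measured relative to the violated boundary, an empty output and an output at double `max_length` both bottom out at 0.0.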
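The metric can also be passed to Opik's `evaluate` function as a scoring metric. The following is a sketch under stated assumptions: it presumes a configured Opik workspace, the dataset name is made up, and `summarize` is a hypothetical stub standing in for the model under evaluation:

```python
# Hedged sketch of wiring the metric into an Opik evaluation run.
# Assumes a configured Opik workspace; the dataset name is arbitrary and
# summarize() is a placeholder for a real LLM call.
import opik
from opik.evaluation import evaluate


def summarize(text: str) -> str:
    # Stand-in for the model under evaluation.
    return text[:512]


def evaluation_task(item: dict) -> dict:
    # The keys returned here map onto the metric's score() arguments.
    return {"input": item["input"], "output": summarize(item["input"])}


client = opik.Opik()
dataset = client.get_or_create_dataset(name="summary-density-demo")
dataset.insert([{"input": "Summarize: Opik is an open-source LLM evaluation platform."}])

evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=[SummaryDensityHeuristic()],
)
```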