""" Module for aggregating results from different evaluation metrics. """ import pandas as pd import numpy as np from collections import defaultdict class ResultsAggregator: """Class for aggregating and analyzing image evaluation results.""" def __init__(self): """Initialize the aggregator.""" # Weights for different metric categories self.default_weights = { # Technical metrics 'sharpness': 1.0, 'noise': 1.0, 'contrast': 1.0, 'saturation': 1.0, 'entropy': 1.0, 'compression_artifacts': 1.0, 'dynamic_range': 1.0, # Aesthetic metrics 'aesthetic_score': 1.5, 'composition_score': 1.2, 'color_harmony': 1.2, # Prompt metrics 'prompt_similarity': 2.0, } # Metrics where lower is better self.inverse_metrics = ['noise', 'compression_artifacts'] def normalize_metric(self, values, metric_name): """ Normalize metric values to 0-10 scale. Args: values: list of metric values metric_name: name of the metric Returns: list: normalized values """ if not values: return [] # For metrics where lower is better, invert the values if metric_name in self.inverse_metrics: values = [max(values) - v + min(values) for v in values] # Normalize to 0-10 scale min_val = min(values) max_val = max(values) if max_val == min_val: return [5.0] * len(values) # Default to middle value if all values are the same return [10 * (v - min_val) / (max_val - min_val) for v in values] def aggregate_model_results(self, model_results, custom_weights=None): """ Aggregate results for a single model across multiple images. Args: model_results: list of metric dictionaries for images from the same model custom_weights: optional dictionary of custom weights for metrics Returns: dict: aggregated metrics """ if not model_results: return {} # Use default weights if custom weights not provided weights = custom_weights if custom_weights else self.default_weights # Initialize aggregated results aggregated = {} # Collect all metrics all_metrics = set() for result in model_results: all_metrics.update(result.keys()) # Aggregate each metric for metric in all_metrics: # Skip non-numeric metrics values = [result.get(metric) for result in model_results if metric in result and isinstance(result[metric], (int, float))] if values: aggregated[metric] = { 'mean': np.mean(values), 'median': np.median(values), 'std': np.std(values), 'min': np.min(values), 'max': np.max(values), 'count': len(values) } # Calculate overall score score_components = [] weight_sum = 0 for metric, stats in aggregated.items(): if metric in weights: # Normalize the mean value to 0-10 scale normalized_value = stats['mean'] if metric in self.inverse_metrics: # For metrics where lower is better, invert the scale normalized_value = 10 - normalized_value # Apply weight weight = weights[metric] score_components.append(normalized_value * weight) weight_sum += weight # Calculate weighted average if weight_sum > 0: aggregated['overall_score'] = sum(score_components) / weight_sum else: aggregated['overall_score'] = 5.0 # Default middle score return aggregated def compare_models(self, model_results_dict, custom_weights=None): """ Compare results across different models. 
Args: model_results_dict: dictionary with model names as keys and lists of results as values custom_weights: optional dictionary of custom weights for metrics Returns: dict: comparison results """ # Aggregate results for each model aggregated_results = {} for model_name, results in model_results_dict.items(): aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights) # Extract key metrics for comparison comparison = {} for model_name, agg_results in aggregated_results.items(): model_comparison = { 'overall_score': agg_results.get('overall_score', 5.0) } # Add mean values of all metrics for metric, stats in agg_results.items(): if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats: model_comparison[f"{metric}"] = stats['mean'] comparison[model_name] = model_comparison return comparison def analyze_by_prompt(self, results_by_prompt, custom_weights=None): """ Analyze results grouped by prompt. Args: results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values custom_weights: optional dictionary of custom weights for metrics Returns: dict: analysis results by prompt """ prompt_analysis = {} for prompt, model_results in results_by_prompt.items(): # Compare models for this prompt prompt_comparison = self.compare_models(model_results, custom_weights) # Find best model for this prompt best_model = None best_score = -1 for model, metrics in prompt_comparison.items(): score = metrics.get('overall_score', 0) if score > best_score: best_score = score best_model = model prompt_analysis[prompt] = { 'model_comparison': prompt_comparison, 'best_model': best_model, 'best_score': best_score } return prompt_analysis def create_comparison_dataframe(self, comparison_results): """ Create a pandas DataFrame from comparison results. Args: comparison_results: dictionary with model names as keys and metric dictionaries as values Returns: pandas.DataFrame: comparison table """ # Convert to DataFrame df = pd.DataFrame.from_dict(comparison_results, orient='index') # Sort by overall score if 'overall_score' in df.columns: df = df.sort_values('overall_score', ascending=False) return df
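

# Minimal usage sketch. The model names and metric values below are illustrative
# placeholders (not real evaluation output) and are assumed to already be on the
# 0-10 scale the aggregator expects; they only show the expected input shape:
# a dict mapping model name -> list of per-image metric dicts.
if __name__ == "__main__":
    example_results = {
        "model_a": [
            {"sharpness": 7.2, "noise": 2.1, "aesthetic_score": 6.5, "prompt_similarity": 8.0},
            {"sharpness": 6.8, "noise": 2.4, "aesthetic_score": 7.0, "prompt_similarity": 7.5},
        ],
        "model_b": [
            {"sharpness": 5.9, "noise": 3.0, "aesthetic_score": 7.8, "prompt_similarity": 6.9},
            {"sharpness": 6.1, "noise": 2.8, "aesthetic_score": 7.4, "prompt_similarity": 7.2},
        ],
    }

    aggregator = ResultsAggregator()
    comparison = aggregator.compare_models(example_results)

    # Print a per-model comparison table sorted by overall score
    print(aggregator.create_comparison_dataframe(comparison))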