| """ | |
| Module for aggregating results from different evaluation metrics. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from collections import defaultdict | |
| class ResultsAggregator: | |
| """Class for aggregating and analyzing image evaluation results.""" | |
| def __init__(self): | |
| """Initialize the aggregator.""" | |
| # Weights for different metric categories | |
| self.default_weights = { | |
| # Technical metrics | |
| 'sharpness': 1.0, | |
| 'noise': 1.0, | |
| 'contrast': 1.0, | |
| 'saturation': 1.0, | |
| 'entropy': 1.0, | |
| 'compression_artifacts': 1.0, | |
| 'dynamic_range': 1.0, | |
| # Aesthetic metrics | |
| 'aesthetic_score': 1.5, | |
| 'composition_score': 1.2, | |
| 'color_harmony': 1.2, | |
| # Prompt metrics | |
| 'prompt_similarity': 2.0, | |
| } | |
| # Metrics where lower is better | |
| self.inverse_metrics = ['noise', 'compression_artifacts'] | |

    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values
        if metric_name in self.inverse_metrics:
            max_val, min_val = max(values), min(values)
            values = [max_val - v + min_val for v in values]

        # Normalize to a 0-10 scale
        min_val = min(values)
        max_val = max(values)

        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same

        return [10 * (v - min_val) / (max_val - min_val) for v in values]
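
    # Example with hypothetical values: normalize_metric([2.0, 5.0, 8.0], 'sharpness')
    # returns [0.0, 5.0, 10.0]. For an inverse metric such as 'noise', the values are
    # flipped first, so normalize_metric([1.0, 3.0, 5.0], 'noise') returns [10.0, 5.0, 0.0].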

    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights are not provided
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect all metric names present in the results
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]

            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }

        # Calculate the overall score as a weighted average of metric means,
        # which are assumed to already be on a 0-10 scale
        score_components = []
        weight_sum = 0

        for metric, stats in aggregated.items():
            if metric in weights:
                normalized_value = stats['mean']

                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the scale
                    normalized_value = 10 - normalized_value

                # Apply weight
                weight = weights[metric]
                score_components.append(normalized_value * weight)
                weight_sum += weight

        # Calculate weighted average
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score

        return aggregated
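
    # Example of the aggregated structure (hypothetical numbers): each numeric metric maps
    # to summary statistics, e.g. {'sharpness': {'mean': 7.0, 'median': 7.0, 'std': 0.2,
    # 'min': 6.8, 'max': 7.2, 'count': 2}, ...}, plus a single 'overall_score' float.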

    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }

            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']

            comparison[model_name] = model_comparison

        return comparison
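
    # Example of the comparison structure (hypothetical model names and scores):
    # {'model_a': {'overall_score': 6.9, 'sharpness': 7.0, ...},
    #  'model_b': {'overall_score': 7.4, 'sharpness': 6.0, ...}}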

    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}

        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best model for this prompt
            best_model = None
            best_score = -1

            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }

        return prompt_analysis
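
    # Example of the per-prompt analysis (hypothetical prompt and scores):
    # {'a red car at sunset': {'model_comparison': {...}, 'best_model': 'model_b',
    #  'best_score': 7.4}}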

    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to a DataFrame with one row per model
        df = pd.DataFrame.from_dict(comparison_results, orient='index')

        # Sort by overall score, best model first
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)

        return df
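

# Minimal usage sketch with hypothetical data: the model names and metric values below
# are illustrative only; real results would come from the evaluation pipeline that
# produces the per-image metric dictionaries.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Two models, two images each, with made-up metric values on a 0-10 scale
    results = {
        "model_a": [
            {"sharpness": 7.2, "noise": 2.1, "prompt_similarity": 8.0},
            {"sharpness": 6.8, "noise": 2.5, "prompt_similarity": 7.4},
        ],
        "model_b": [
            {"sharpness": 5.9, "noise": 1.8, "prompt_similarity": 8.6},
            {"sharpness": 6.1, "noise": 2.0, "prompt_similarity": 8.2},
        ],
    }

    # Compare the models and print the comparison table, sorted by overall score
    comparison = aggregator.compare_models(results)
    df = aggregator.create_comparison_dataframe(comparison)
    print(df)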