Ákos Hadnagy committed · Commit b6b18a0 · Parent: 954d017

UI improvements

Files changed:
- app.py (+197 −44)
- scenario_mappings.json (+9 −9)
    	
app.py  CHANGED

@@ -31,6 +31,7 @@ class BenchmarkDashboard:
         self.reader = BenchmarkDataReader()
         self.df = None
         self.scenario_mappings = self.load_scenario_mappings()
+        self.metric_mappings = self.get_metric_mappings()
         self.load_data()

     def load_data(self) -> None:
@@ -72,16 +73,121 @@ class BenchmarkDashboard:
         # If not found in mappings, assume it's already a raw name
         return readable_name

-    def get_filter_options(self) -> Tuple[List[str], List[str], List[str], List[str], str, str]:
+    def get_metric_mappings(self) -> Dict[str, str]:
+        """Get metric name mappings from technical to human-readable names."""
+        return {
+            "tokens_per_second_mean": "Tokens per Second",
+            "latency_seconds_mean": "Latency (seconds)",
+            "time_to_first_token_seconds_mean": "Time to First Token (seconds)",
+            "time_per_output_token_seconds_mean": "Time per Output Token (seconds)"
+        }
+
+    def get_readable_metric_name(self, metric_name: str) -> str:
+        """Get human-readable metric name or return original if not mapped."""
+        return self.metric_mappings.get(metric_name, metric_name)
+
+    def get_raw_metric_name(self, readable_name: str) -> str:
+        """Convert human-readable metric name back to raw metric name."""
+        for raw_name, mapped_name in self.metric_mappings.items():
+            if mapped_name == readable_name:
+                return raw_name
+        return readable_name
+
+    def get_best_scenario_for_model(self, model_name: str, metric: str = "tokens_per_second_mean") -> str:
+        """Get the best performing scenario for a given model."""
+        if self.df_pandas.empty:
+            return ""
+
+        # Filter data for this model
+        model_data = self.df_pandas[self.df_pandas['model_name'] == model_name]
+        if model_data.empty:
+            return ""
+
+        # Define priority order for scenarios (preference for kernelized/compiled)
+        priority_order = [
+            "eager_sdpa_flash_attention",
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_default",
+            "compiled_compile_max-autotune_sdpa_math",
+            "compiled_compile_max-autotune_eager_attn",
+            "eager_sdpa_default",
+            "eager_sdpa_math",
+            "eager_eager_attn"
+        ]
+
+        # Check if metric exists
+        if metric not in model_data.columns:
+            # Fallback to first available scenario in priority order
+            for scenario in priority_order:
+                if scenario in model_data['scenario_name'].values:
+                    return self.get_readable_scenario_name(scenario)
+            return self.get_readable_scenario_name(model_data['scenario_name'].iloc[0])
+
+        # Find best performing scenario (highest value for throughput metrics, lowest for latency)
+        is_latency_metric = 'latency' in metric.lower() or 'time' in metric.lower()
+
+        if is_latency_metric:
+            best_row = model_data.loc[model_data[metric].idxmin()]
+        else:
+            best_row = model_data.loc[model_data[metric].idxmax()]
+
+        return self.get_readable_scenario_name(best_row['scenario_name'])
+
+    def get_organized_scenarios(self, available_raw_scenarios: List[str]) -> Tuple[List[str], List[str]]:
+        """Organize scenarios into priority groups with separators."""
+        # Define priority scenarios (main recommended scenarios)
+        priority_raw_scenarios = [
+            "eager_sdpa_flash_attention",
+            "compiled_compile_max-autotune_sdpa_default"
+        ]
+
+        # Define expert/advanced scenarios (including efficient attention)
+        expert_raw_scenarios = [
+            "eager_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_sdpa_efficient_attention",
+            "compiled_compile_max-autotune_eager_attn",
+            "compiled_compile_max-autotune_sdpa_math",
+            "eager_sdpa_default",
+            "eager_eager_attn",
+            "eager_sdpa_math"
+        ]
+
+        # Get available scenarios in priority order
+        priority_scenarios = []
+        expert_scenarios = []
+
+        # Add priority scenarios that are available
+        for raw_scenario in priority_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                priority_scenarios.append(readable_name)
+
+        # Add expert scenarios that are available
+        for raw_scenario in expert_raw_scenarios:
+            if raw_scenario in available_raw_scenarios:
+                readable_name = self.get_readable_scenario_name(raw_scenario)
+                expert_scenarios.append(readable_name)
+
+        # Combine with separator
+        all_scenarios = priority_scenarios.copy()
+        if expert_scenarios:
+            all_scenarios.append("─── Advanced/Developer Options ───")
+            all_scenarios.extend(expert_scenarios)
+
+        # Return all scenarios (no default selections for multi-select anymore)
+        return all_scenarios, []
+
+    def get_filter_options(self) -> Tuple[List[str], List[str], List[str], List[str], List[str], str, str]:
         """Get unique values for filter dropdowns and date range."""
         if self.df_pandas.empty:
-            return [], [], [], [], "", ""
+            return [], [], [], [], [], "", ""

         models = sorted(self.df_pandas['model_name'].dropna().unique().tolist())

-        # Get scenarios with …
+        # Get organized scenarios with priority ordering and default selections
         raw_scenarios = sorted(self.df_pandas['scenario_name'].dropna().unique().tolist())
-        scenarios = …
+        scenarios, default_scenarios = self.get_organized_scenarios(raw_scenarios)

         gpus = sorted(self.df_pandas['gpu_name'].dropna().unique().tolist())

@@ -122,9 +228,9 @@ class BenchmarkDashboard:
         min_date = self.df_pandas['timestamp'].min().strftime('%Y-%m-%d')
         max_date = self.df_pandas['timestamp'].max().strftime('%Y-%m-%d')

-        return models, scenarios, gpus, benchmark_runs, min_date, max_date
+        return models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date

-    def filter_data(self, …
+    def filter_data(self, selected_model: str, selected_scenarios: List[str],
                     selected_gpus: List[str], selected_run: str = None,
                     start_date: str = None, end_date: str = None) -> pd.DataFrame:
         """Filter data based on user selections."""
@@ -133,11 +239,12 @@ class BenchmarkDashboard:

         filtered_df = self.df_pandas.copy()

-        if …
-            filtered_df = filtered_df[filtered_df['model_name']…
+        if selected_model:
+            filtered_df = filtered_df[filtered_df['model_name'] == selected_model]
         if selected_scenarios:
-            # …
-            …
+            # Filter out separator lines and convert human-readable scenario names back to raw names for filtering
+            valid_scenarios = [scenario for scenario in selected_scenarios if not scenario.startswith("───")]
+            raw_scenarios = [self.get_raw_scenario_name(scenario) for scenario in valid_scenarios]
             filtered_df = filtered_df[filtered_df['scenario_name'].isin(raw_scenarios)]
         if selected_gpus:
             filtered_df = filtered_df[filtered_df['gpu_name'].isin(selected_gpus)]
@@ -201,9 +308,9 @@ class BenchmarkDashboard:
             x='scenario_display',
             y=metric,
             color='model_name',
-            title=f'Performance Comparison: {…
+            title=f'Performance Comparison: {self.get_readable_metric_name(metric)}',
             labels={
-                metric: …
+                metric: self.get_readable_metric_name(metric),
                 'scenario_display': 'Benchmark Scenario',
                 'model_name': 'Model'
             },
@@ -255,7 +362,7 @@ class BenchmarkDashboard:
                     hovertemplate=f'<b>{model}</b><br>' +
                                   f'Scenario: {readable_scenario}<br>' +
                                   'Time: %{x}<br>' +
-                                  f'{…
+                                  f'{self.get_readable_metric_name(metric)}: %{{y}}<br>' +
                                   '<extra></extra>'
                 ))

@@ -269,9 +376,9 @@ class BenchmarkDashboard:
         )

         fig.update_layout(
-            title=f'Historical Trends Across Benchmark Runs: {…
+            title=f'Historical Trends Across Benchmark Runs: {self.get_readable_metric_name(metric)}',
             xaxis_title='Timestamp',
-            yaxis_title=…
+            yaxis_title=self.get_readable_metric_name(metric),
             height=500,
             hovermode='closest',
             showlegend=True,
@@ -325,7 +432,7 @@ class BenchmarkDashboard:
         return fig

     def create_metrics_summary_table(self, filtered_df: pd.DataFrame) -> pd.DataFrame:
-        """Create summary statistics table."""
+        """Create summary statistics table with each scenario as a separate row."""
         if filtered_df.empty:
             return pd.DataFrame({'Message': ['No data available for selected filters']})

@@ -336,24 +443,34 @@ class BenchmarkDashboard:
         ]

         summary_data = []
-        for model in filtered_df['model_name'].unique():
-            model_data = filtered_df[filtered_df['model_name'] == model]

-            …
+        # Group by scenario instead of model (since we're now single-model focused)
+        for scenario in filtered_df['scenario_name'].unique():
+            scenario_data = filtered_df[filtered_df['scenario_name'] == scenario]
+
+            # Get human-readable scenario name
+            readable_scenario = self.get_readable_scenario_name(scenario)
+
+            row = {'Scenario': readable_scenario}
+
+            # Add metrics for this scenario
             for metric in metrics_cols:
-                if metric in …
-                    …
-                    …
+                if metric in scenario_data.columns and not scenario_data[metric].isna().all():
+                    readable_metric = self.get_readable_metric_name(metric)
+
+                    # For scenarios, show the mean value (since each scenario should have one value per run)
+                    mean_value = scenario_data[metric].mean()
+                    row[readable_metric] = f"{mean_value:.2f}"

             summary_data.append(row)

         return pd.DataFrame(summary_data)

-    def update_dashboard(self, …
+    def update_dashboard(self, selected_model: str, selected_scenarios: List[str],
                          selected_gpus: List[str], selected_run: str, metric: str):
         """Update all dashboard components based on current filters."""
         filtered_df = self.filter_data(
-            …
+            selected_model, selected_scenarios, selected_gpus, selected_run
         )

         # Create charts
@@ -363,23 +480,35 @@ class BenchmarkDashboard:

         # Summary stats
         if not filtered_df.empty:
+            model_name = filtered_df['model_name'].iloc[0]
+
+            # Get list of scenario names (raw) and convert to readable names
+            raw_scenario_names = sorted(filtered_df['scenario_name'].unique())
+            readable_scenario_names = [self.get_readable_scenario_name(scenario) for scenario in raw_scenario_names]
+            scenarios_list = ", ".join(readable_scenario_names)
+
+            date_range = f"{filtered_df['timestamp'].min().strftime('%Y-%m-%d')} to {filtered_df['timestamp'].max().strftime('%Y-%m-%d')}"
+            benchmark_runs = len(filtered_df.groupby(['timestamp', 'file_path']))
+
             summary_text = f"""
-            **…
-            - …
-            - …
-            - …
-            …
+            **Analysis Summary for {model_name}:**
+            - Date Range: {date_range}
+            - Benchmark Runs: {benchmark_runs}
+            - Total Data Points: {len(filtered_df)}
+
+            **Selected Scenarios:**
+            {scenarios_list}
             """
         else:
             summary_text = "No data available for current selection."

         return perf_chart, gpu_chart, summary_table, summary_text

-    def update_historical_trends(self, …
+    def update_historical_trends(self, selected_model: str, selected_scenarios: List[str],
                                  selected_gpus: List[str], start_date: str, end_date: str, metric: str):
         """Update historical trends chart with date filtering."""
         filtered_df = self.filter_data(
-            …
+            selected_model, selected_scenarios, selected_gpus,
             start_date=start_date, end_date=end_date
         )
         trend_chart = self.create_historical_trend_chart(filtered_df, metric)
@@ -389,34 +518,38 @@ def create_gradio_interface() -> gr.Interface:
 def create_gradio_interface() -> gr.Interface:
     """Create the Gradio interface."""
     dashboard = BenchmarkDashboard()
-    models, scenarios, gpus, benchmark_runs, min_date, max_date = dashboard.get_filter_options()
+    models, scenarios, gpus, benchmark_runs, default_scenarios, min_date, max_date = dashboard.get_filter_options()

-    # Performance metrics options
-    metric_options = [
+    # Performance metrics options (human-readable)
+    raw_metric_options = [
         "tokens_per_second_mean",
         "latency_seconds_mean",
         "time_to_first_token_seconds_mean",
         "time_per_output_token_seconds_mean"
     ]
+    metric_options = [dashboard.get_readable_metric_name(metric) for metric in raw_metric_options]

     with gr.Blocks(title="LLM Inference Performance Dashboard", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🚀 LLM Inference Performance Dashboard")
         gr.Markdown("Analyze and compare LLM inference performance across models, scenarios, and hardware configurations.")
+        gr.Markdown("*💡 **Smart Defaults**: The best performing scenario is automatically selected for each model based on throughput analysis.*")

         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("## Filters")

-                model_filter = gr.…
+                model_filter = gr.Dropdown(
                     choices=models,
-                    value=models,
-                    label="Select …
+                    value=models[0] if models else None,
+                    label="Select Model",
                     interactive=True
                 )
-                scenario_filter = gr.…
+                scenario_filter = gr.Dropdown(
                     choices=scenarios,
-                    value=…
+                    value=[dashboard.get_best_scenario_for_model(models[0], "tokens_per_second_mean")] if models else [],
                     label="Select Scenarios",
+                    info="💡 The best performing scenario is automatically selected when you change models",
+                    multiselect=True,
                     interactive=True
                 )
                 gpu_filter = gr.CheckboxGroup(
@@ -427,7 +560,7 @@ def create_gradio_interface() -> gr.Interface:
                 )
                 metric_selector = gr.Dropdown(
                     choices=metric_options,
-                    value="tokens_per_second_mean",
+                    value=dashboard.get_readable_metric_name("tokens_per_second_mean"),
                     label="Primary Metric",
                     interactive=True
                 )
@@ -494,16 +627,29 @@ def create_gradio_interface() -> gr.Interface:
             filtered_runs = [run for run in benchmark_runs if search_text.lower() in run.lower()]
             return gr.Dropdown(choices=filtered_runs, value=filtered_runs[0] if filtered_runs else None)

+        # Function to update scenarios when model changes
+        def update_scenarios_for_model(selected_model, current_metric):
+            if not selected_model:
+                return []
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(current_metric)
+            best_scenario = dashboard.get_best_scenario_for_model(selected_model, raw_metric)
+            return [best_scenario] if best_scenario else []
+
         # Update function for main dashboard (excluding historical trends)
-        def update_main(…
+        def update_main(model_selected, scenarios_selected, gpus_selected, run_selected, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_dashboard(
-                …
+                model_selected, scenarios_selected, gpus_selected, run_selected, raw_metric
             )

         # Update function for historical trends
-        def update_trends(…
+        def update_trends(model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, metric):
+            # Convert readable metric name back to raw name
+            raw_metric = dashboard.get_raw_metric_name(metric)
             return dashboard.update_historical_trends(
-                …
+                model_selected, scenarios_selected, gpus_selected, start_dt, end_dt, raw_metric
             )

         # Set up interactivity for main dashboard
@@ -525,6 +671,13 @@ def create_gradio_interface() -> gr.Interface:
         # Connect search field to filter benchmark runs
         run_search.change(fn=filter_benchmark_runs, inputs=[run_search], outputs=[benchmark_run_selector])

+        # Auto-update scenarios when model changes
+        model_filter.change(
+            fn=update_scenarios_for_model,
+            inputs=[model_filter, metric_selector],
+            outputs=[scenario_filter]
+        )
+
         # Initial load
         demo.load(fn=update_main, inputs=main_inputs, outputs=main_outputs)
         demo.load(fn=update_trends, inputs=trends_inputs, outputs=trends_outputs)
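The new metric helpers keep the raw DataFrame column names internally while the UI shows readable labels; the Gradio callbacks convert the selected label back with get_raw_metric_name before filtering. A minimal, self-contained sketch of that round trip (the dict and function bodies mirror the diff above, but this standalone snippet is an illustration, not the app module itself):

# Standalone sketch of the readable <-> raw metric-name round trip used above.
METRIC_MAPPINGS = {
    "tokens_per_second_mean": "Tokens per Second",
    "latency_seconds_mean": "Latency (seconds)",
    "time_to_first_token_seconds_mean": "Time to First Token (seconds)",
    "time_per_output_token_seconds_mean": "Time per Output Token (seconds)",
}

def get_readable_metric_name(metric_name: str) -> str:
    # Fall back to the raw name when no mapping exists.
    return METRIC_MAPPINGS.get(metric_name, metric_name)

def get_raw_metric_name(readable_name: str) -> str:
    # Reverse lookup; unknown labels pass through unchanged.
    for raw_name, mapped_name in METRIC_MAPPINGS.items():
        if mapped_name == readable_name:
            return raw_name
    return readable_name

# A dropdown stores the readable label; callbacks convert it back before
# indexing the DataFrame column, so the round trip must be lossless.
assert get_raw_metric_name(get_readable_metric_name("latency_seconds_mean")) == "latency_seconds_mean"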
    	
scenario_mappings.json  CHANGED

@@ -1,11 +1,11 @@
 {
-  "…
-  "…
-  "…
-  "…
-  "…
-  "…
-  "compiled_compile_max-…
-  "compiled_compile_max-…
-  "compiled_compile_max-…
+  "eager_sdpa_flash_attention": "Flash Attention",
+  "compiled_compile_max-autotune_sdpa_default": "Compiled + SDPA Default",
+  "eager_sdpa_default": "SDPA Default",
+  "eager_eager_attn": "Eager Attention",
+  "eager_sdpa_math": "SDPA Math Backend",
+  "eager_sdpa_efficient_attention": "Efficient Attention",
+  "compiled_compile_max-autotune_sdpa_efficient_attention": "Compiled + Efficient Attention",
+  "compiled_compile_max-autotune_eager_attn": "Compiled + Eager Attention",
+  "compiled_compile_max-autotune_sdpa_math": "Compiled + SDPA Math Backend"
 }
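app.py consumes this file through load_scenario_mappings(), whose body is not part of this commit. A plausible minimal loader for a mapping file like this might look as follows (the function name comes from the diff; the path handling and the empty-dict fallback are assumptions):

import json
from pathlib import Path
from typing import Dict

def load_scenario_mappings(path: str = "scenario_mappings.json") -> Dict[str, str]:
    # Hypothetical loader; the real load_scenario_mappings in app.py is not shown in this commit.
    mapping_file = Path(path)
    if not mapping_file.exists():
        # Fall back to raw scenario names when the mapping file is missing.
        return {}
    with mapping_file.open(encoding="utf-8") as f:
        return json.load(f)

mappings = load_scenario_mappings()
# Unmapped scenarios fall back to their raw names.
print(mappings.get("eager_sdpa_flash_attention", "eager_sdpa_flash_attention"))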
