feat: add Glicko2 ranking

Files changed:
- docs/ranking_system.md +134 -57
- requirements.txt +3 -0
- requirements/base.txt +2 -1
- src/app.py +17 -3
- src/components/device_comparison.py +186 -0
- src/components/visualizations.py +144 -303
- src/core/glicko2_ranking.py +618 -0

docs/ranking_system.md CHANGED

@@ -1,77 +1,154 @@
 ## Overview

-The ranking system implements a multi-dimensional approach to evaluate and compare device performance across different aspects of LLM (GGUF) model runs.
-
-```python
-PP_CONFIG = 512  # Standard prompt processing token count
-TG_CONFIG = 128  # Standard token generation count
-```
-
-### Quantization Quality Factors
-
-- F16/F32 are considered 1.0 (this skews the results a bit towards quantization)
-
-  ```
-  # Direct multiplication by model size (in billions)
-  performance_score = base_score * model_size * quant_factor
-  ```
-  - Linear multiplier by model size
-
-  ```
-  normalized_score = (performance_score / max_performance_score) * 100
-  ```
-
-- Only benchmarks matching standard conditions are considered:
-  - PP_CONFIG (512) tokens for prompt processing
-  - TG_CONFIG (128) tokens for token generation
-
-- Groups data by `Normalized Device ID` and `Platform`
-- Uses normalized device IDs to ensure consistent device identification across different submissions
# Glicko-2 Ranking System Implementation

## Overview

The Glicko-2 ranking system is used in this project to rank devices based on their performance in benchmark tests, specifically measuring token generation speed (tokens/second) and prompt processing speed (tokens/second). This document explains both the theoretical foundations of Glicko-2 and its specific implementation in our system.

## Glicko-2 Theory

Glicko-2 is an improvement over the original Glicko system, which itself was an improvement over the Elo rating system. It was developed by Mark Glickman and is particularly well-suited for situations where:

1. Devices have different numbers of benchmark runs
2. There's uncertainty about a device's true performance capabilities
3. Performance metrics need to be compared across different model sizes and configurations

### Key Components

1. **Rating (μ)**: A numerical value representing a device's relative performance level (higher is better)
2. **Rating Deviation (RD)**: The uncertainty in the performance rating
3. **Volatility (σ)**: A measure of how consistent a device's performance is across different benchmarks

### Rating System Parameters

- **Initial Rating**: 1500 (standard starting point on the Glicko-2 scale)
- **Initial RD**: 350 (high uncertainty for new devices)
- **Volatility**: 0.06 (controls how quickly performance ratings can change)
- **Tau**: 0.5 (system constant that limits the change in volatility)

Note: The rating numbers themselves are on a relative scale and don't directly correspond to tokens/second. Instead, they represent relative performance levels where higher numbers indicate better performance. The actual token generation and prompt processing speeds (in tokens/second) are used to determine the relative performance outcomes that update these ratings.
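As a rough illustration of how these parameters are used in practice, here is a minimal sketch with the third-party `glicko2` package added to requirements.txt in this commit. It assumes the `Player` API of the widely used Kirkman implementation; the project's own wrapper in `src/core/glicko2_ranking.py` may differ, and all numbers are hypothetical.

```python
import glicko2

# A new device starts with the default parameters listed above
device = glicko2.Player(rating=1500, rd=350, vol=0.06)

# One rating period: the device beat an opponent rated 1400 (RD 80)
# and lost to one rated 1700 (RD 60); outcomes use 1 = win, 0 = loss
device.update_player([1400, 1700], [80, 60], [1, 0])

print(round(device.rating), round(device.rd))  # rating moves, RD shrinks
```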
## Implementation Details

### Data Preparation

Before applying Glicko-2, we preprocess the benchmark data:

1. Filter out emulators and iOS devices with insufficient GPU layers, so that results are consistent among iOS devices
2. Normalize scores within each model group to account for different model difficulties (see the sketch after the example below)
3. Convert continuous performance metrics into relative comparisons:
   - For each pair of devices running the same model, we compare their token generation and prompt processing speeds
   - If a device is faster in both metrics, it "wins" the comparison (outcome = 1)
   - If a device is slower in both metrics, it "loses" the comparison (outcome = 0)
   - If one device is faster in one metric but slower in the other, it's considered a "draw" (outcome = 0.5)
   - This conversion is necessary because Glicko-2 works with discrete outcomes (win/loss/draw) rather than continuous performance values

For example, if:

- Device A: Token Generation = 50 tokens/sec, Prompt Processing = 30 tokens/sec
- Device B: Token Generation = 45 tokens/sec, Prompt Processing = 25 tokens/sec

Then Device A "wins" this comparison because it's faster in both metrics. This relative outcome (1 for Device A, 0 for Device B) is what's used to update the Glicko-2 ratings.
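The normalization in step 2 is not spelled out in code in this document. A minimal sketch, assuming pandas, min-max scaling, and the column names used elsewhere in this document ("Model ID", "Token Generation", "Prompt Processing"):

```python
import pandas as pd

def normalize_within_model(df: pd.DataFrame) -> pd.DataFrame:
    # Scale each metric to [0, 1] within every model group so that
    # inherently slow (large) and fast (small) models are comparable
    out = df.copy()
    for col in ("Token Generation", "Prompt Processing"):
        grouped = out.groupby("Model ID")[col]
        span = grouped.transform("max") - grouped.transform("min")
        out[f"{col} (norm)"] = (out[col] - grouped.transform("min")) / span
    return out
```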
### Match Processing

For each model, we compare devices pairwise based on their token generation and prompt processing speeds:

```python
# Example of match processing
for model, group in df.groupby("Model ID"):
    devices = group["Normalized Device ID"].unique()
    for i in range(len(devices)):
        for j in range(i + 1, len(devices)):
            device1 = devices[i]
            device2 = devices[j]

            # Compare performance metrics
            token_speed1 = group[group["Normalized Device ID"] == device1]["Token Generation"].iloc[0]
            token_speed2 = group[group["Normalized Device ID"] == device2]["Token Generation"].iloc[0]

            prompt_speed1 = group[group["Normalized Device ID"] == device1]["Prompt Processing"].iloc[0]
            prompt_speed2 = group[group["Normalized Device ID"] == device2]["Prompt Processing"].iloc[0]

            # Determine performance outcome
            if token_speed1 > token_speed2 and prompt_speed1 > prompt_speed2:
                outcome = 1  # device1 performs better
            elif token_speed1 < token_speed2 and prompt_speed1 < prompt_speed2:
                outcome = 0  # device2 performs better
            else:
                outcome = 0.5  # mixed performance
```
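Two details of this scheme are worth noting: it generates one match per unordered pair of devices per model, so a model benchmarked on n devices contributes n*(n-1)/2 matches and well-covered models dominate the rating updates; and the `.iloc[0]` lookups above assume a single benchmark row per device and model, so repeated runs would need to be aggregated (e.g. averaged) before pairing.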
### Rating Updates

The Glicko-2 system updates performance ratings after each benchmark comparison:

1. **Calculate Expected Performance**:

   ```python
   import math

   def expected_performance(rating1, rating2, rd2):
       # Only the opponent's rating deviation enters the g() damping factor
       q = math.log(10) / 400
       g_rd = 1 / math.sqrt(1 + 3 * q**2 * rd2**2 / math.pi**2)
       return 1 / (1 + 10 ** (-g_rd * (rating1 - rating2) / 400))
   ```

2. **Update Performance Rating and RD**:

   ```python
   def update_performance(rating, rd, outcome, expected, g_rd):
       # g_rd is the damping factor computed from the opponent's RD,
       # as in expected_performance above
       q = math.log(10) / 400
       d_squared = 1 / (q**2 * g_rd**2 * expected * (1 - expected))
       new_rd = math.sqrt(1 / (1 / rd**2 + 1 / d_squared))
       new_rating = rating + q / (1 / rd**2 + 1 / d_squared) * g_rd * (outcome - expected)
       return new_rating, new_rd
   ```
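A quick worked example with hypothetical numbers shows the direction and size of a single update. Suppose our device is rated 1500 with RD 80 and beats an opponent rated 1400 with RD 60:

```python
q = math.log(10) / 400
g_rd = 1 / math.sqrt(1 + 3 * q**2 * 60**2 / math.pi**2)  # ≈ 0.98

expected = expected_performance(1500, 1400, 60)           # ≈ 0.64
new_rating, new_rd = update_performance(1500, 80, 1, expected, g_rd)
# Winning as the favourite nudges the rating up to ≈ 1512 and
# shrinks the deviation slightly, to ≈ 78
```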
### Confidence Thresholds

We implement several confidence thresholds (a filtering sketch follows the list):

1. **Minimum Benchmarks**: Devices must have at least 5 benchmark runs to be included in confident rankings
2. **Performance Deviation**: Devices with RD > 100 (on the rating scale) are considered less reliable
3. **Performance Consistency**: High volatility indicates inconsistent performance across benchmarks
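A minimal sketch of applying the first two thresholds to a ratings table; the column names here are illustrative rather than the project's actual schema:

```python
MIN_BENCHMARKS = 5
MAX_RD = 100

def confident_rankings(ratings_df):
    # Keep only devices whose ratings we trust enough to publish
    trusted = (ratings_df["num_benchmarks"] >= MIN_BENCHMARKS) & (
        ratings_df["rd"] <= MAX_RD
    )
    return ratings_df[trusted].sort_values("rating", ascending=False)
```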
## Practical Considerations

### Handling Sparse Data

The system is designed to handle sparse benchmark data by:

1. Using conservative initial performance ratings for new devices
2. Increasing RD for devices with few benchmark runs (sketched below)
3. Implementing a minimum benchmark threshold
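One way to realize point 2 is to blend a device's measured RD back toward the initial uncertainty until it has accumulated enough runs; the blending rule below is an illustrative assumption, not the project's actual formula:

```python
import math

INITIAL_RD = 350
MIN_RUNS_FOR_TRUST = 5  # mirrors the minimum-benchmark threshold above

def effective_rd(rd: float, num_runs: int) -> float:
    # Below the trust threshold, mix the measured variance with the
    # initial variance; at or above it, use the measured RD directly
    if num_runs >= MIN_RUNS_FOR_TRUST:
        return rd
    weight = num_runs / MIN_RUNS_FOR_TRUST
    return math.sqrt(weight * rd**2 + (1 - weight) * INITIAL_RD**2)
```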
### Performance Metrics

We track several performance metrics:

- Combined performance rating (based on overall tokens/second)
- Token generation rating (based on tokens/second)
- Prompt processing rating (based on tokens/second)
- Performance deviation (rating uncertainty)
- Number of benchmark runs
- Performance comparison statistics
### Visualization

The system provides:

1. Overall performance rankings with confidence intervals (see the sketch after this list)
2. Platform-specific performance statistics
3. Head-to-head performance comparison tools
4. Performance trend analysis across different model sizes
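For item 1, confidence intervals fall out of the model directly: a device's rating ± 2·RD spans its plausible performance range. A small sketch with matplotlib (added to requirements.txt in this commit); the devices and numbers are made up:

```python
import matplotlib.pyplot as plt

devices = ["Device A", "Device B", "Device C"]  # hypothetical
ratings = [1712, 1650, 1498]
rds = [45, 60, 110]

positions = range(len(devices))
fig, ax = plt.subplots()
# Horizontal error bars span rating ± 2*RD, a rough 95% band
ax.errorbar(ratings, list(positions), xerr=[2 * rd for rd in rds], fmt="o", capsize=4)
ax.set_yticks(list(positions))
ax.set_yticklabels(devices)
ax.set_xlabel("Glicko-2 rating")
ax.set_title("Device rankings with confidence intervals")
plt.show()
```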
## Advantages Over Other Systems

1. **Better Handling of Performance Uncertainty**: Explicit modeling of performance measurement uncertainty
2. **More Accurate with Fewer Benchmarks**: Can provide meaningful performance ratings with limited data
3. **Dynamic Performance Updates**: The volatility parameter allows for appropriately sized rating changes
4. **Transparent Confidence**: Performance deviations provide clear confidence measures

## Limitations

1. **Computational Complexity**: More complex than Elo, requiring more calculations
2. **Parameter Sensitivity**: Results can be sensitive to system parameters
3. **Continuous Metrics**: Requires conversion of continuous performance metrics (tokens/second) into relative comparisons

## References

1. Glickman, M. E. (2001). "The Glicko-2 Rating System"
2. Glickman, M. E. (1999). "Parameter estimation in large dynamic paired comparison experiments"
3. Glickman, M. E. (2001). "Dynamic paired comparison models with stochastic variances"
requirements.txt CHANGED

@@ -7,3 +7,6 @@ httpx>=0.25.1
 pydantic-settings>=2.0.3
 firebase-admin==6.6.0
 statsmodels>=0.14.1
+matplotlib>=3.7.0
+arviz>=0.17.0
+glicko2
requirements/base.txt CHANGED

@@ -5,4 +5,5 @@ pandas>=2.1.3
 plotly>=5.18.0
 httpx>=0.25.1
 pydantic-settings>=2.0.3
-firebase-admin==6.6.0
+firebase-admin==6.6.0
+glicko2
src/app.py CHANGED

@@ -10,6 +10,8 @@ from .components.visualizations import (
     render_device_rankings,
 )
 from .components.header import render_header, render_contribution_guide
+from .components.rankings import render_algorithm_rankings
+from .components.device_comparison import render_device_comparison
 from .services.firebase import fetch_leaderboard_data
 from .core.styles import CUSTOM_CSS
 from .core.scoring import (
@@ -128,7 +130,13 @@ async def main():
     with main_col:
         # Create tabs for different views
-        tab1, tab2 = st.tabs(
+        tab1, tab2, tab3 = st.tabs(
+            [
+                "Device Rankings",
+                "Benchmark Results",
+                "⚔️ Device Duel",
+            ]
+        )
 
         with tab1:
             # Device rankings view
@@ -139,11 +147,11 @@ async def main():
             st.info(
                 f"📊 Rankings are based on benchmarks with standard conditions: "
                 f"PP={std.PP_CONFIG} tokens, TG={std.TG_CONFIG} tokens. "
-                f"
+                f"The rankings are based on the Glicko-2 algorithm."
             )
 
             # Render performance metrics
-            render_performance_metrics(metrics)
+            # render_performance_metrics(metrics)
 
             # Render device rankings
             render_device_rankings(df)
@@ -172,6 +180,12 @@ async def main():
             # Render performance plots with table filters
             render_performance_plots(df, table_filters)
 
+        with tab3:
+            # Device comparison view
+            # Get list of normalized device IDs for the device comparison
+            normalized_device_ids = sorted(df["Normalized Device ID"].unique().tolist())
+            render_device_comparison(df, normalized_device_ids)
+
     with guide_col:
         render_contribution_guide()
src/components/device_comparison.py ADDED

@@ -0,0 +1,186 @@

```python
import streamlit as st
import pandas as pd
from typing import List, Optional

from ..core.elo_ranking import analyze_device_matches
from ..core.trueskill_ranking import analyze_device_trueskill_matches
from ..core.glicko2_ranking import analyze_device_glicko2_matches
from ..components.visualizations import clean_device_id


def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
    """
    Render a component for comparing two devices and analyzing their matches.

    Args:
        df: DataFrame containing benchmark data
        normalized_device_ids: List of normalized device IDs to select from
    """
    st.title("⚔️ Device Duel Arena")

    # Create mapping of normalized IDs to display names
    device_display_names = {
        device_id: clean_device_id(device_id) for device_id in normalized_device_ids
    }

    # Create two columns for device selection
    col1, col2 = st.columns(2)

    with col1:
        device1 = st.selectbox(
            "Select First Device",
            options=normalized_device_ids,
            format_func=lambda x: device_display_names[x],
            key="device_compare_1",
        )

    with col2:
        # Filter second device dropdown to exclude the first selected device
        remaining_devices = [d for d in normalized_device_ids if d != device1]
        device2 = st.selectbox(
            "Select Second Device",
            options=remaining_devices,
            format_func=lambda x: device_display_names[x],
            key="device_compare_2",
        )

    # Button to analyze matches
    if st.button("Start Duel", key="analyze_matches_btn"):
        st.markdown("### Match Analysis Results")

        # Ensure we have both devices
        if device1 and device2:
            with st.spinner(
                f"Analyzing matches between {device_display_names[device1]} and {device_display_names[device2]}..."
            ):
                try:
                    # Analyze matches using Glicko-2
                    matches_df = analyze_device_glicko2_matches(df, device1, device2)

                    if not matches_df.empty:
                        # Show summary statistics
                        total_matches = len(matches_df)

                        # Set up metrics
                        col1, col2, col3 = st.columns(3)

                        with col1:
                            st.metric("Total Matches", total_matches)

                        # Check for required columns before calculating metrics
                        if (
                            "Token Winner" in matches_df.columns
                            and "Prompt Winner" in matches_df.columns
                        ):
                            token_wins_1 = sum(matches_df["Token Winner"] == device1)
                            prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)

                            with col2:
                                st.metric(
                                    f"{device_display_names[device1]}'s Token Wins",
                                    f"{token_wins_1} ({token_wins_1/total_matches*100:.1f}%)",
                                )
                            with col3:
                                st.metric(
                                    f"{device_display_names[device1]}'s Prompt Wins",
                                    f"{prompt_wins_1} ({prompt_wins_1/total_matches*100:.1f}%)",
                                )

                            # Add Combined Winner metric if available
                            if "Combined Winner" in matches_df.columns:
                                combined_wins_1 = sum(
                                    matches_df["Combined Winner"] == device1
                                )
                                st.metric(
                                    f"{device_display_names[device1]}'s Combined Wins",
                                    f"{combined_wins_1} ({combined_wins_1/total_matches*100:.1f}%)",
                                )
                        else:
                            st.warning(
                                "Winner information is missing from the match data."
                            )

                        # Show the detailed match table
                        st.markdown("#### Detailed Match Results")

                        # Define display columns for Glicko-2
                        display_cols = [
                            "Model",
                            "Token Generation 1",
                            "Token Generation 2",
                            "Token Winner",
                            "Token Win Prob",
                            "Prompt Processing 1",
                            "Prompt Processing 2",
                            "Prompt Winner",
                            "Prompt Win Prob",
                            "Combined Winner",
                            "Combined Win Prob",
                            "Platform 1",
                            "Platform 2",
                        ]

                        # Ensure all columns exist in the dataframe
                        valid_cols = [
                            col for col in display_cols if col in matches_df.columns
                        ]

                        if valid_cols:
                            # Rename some columns for better display
                            matches_display = matches_df[valid_cols].copy()

                            # Define a rename mapping but only apply for columns that exist
                            rename_mapping = {
                                "Token Generation 1": f"{device_display_names[device1]} Token Gen",
                                "Token Generation 2": f"{device_display_names[device2]} Token Gen",
                                "Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
                                "Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
                                "Platform 1": f"{device_display_names[device1]} Platform",
                                "Platform 2": f"{device_display_names[device2]} Platform",
                                "Token Win Prob": "Device 1 Token Win Prob",
                                "Prompt Win Prob": "Device 1 Prompt Win Prob",
                                "Combined Win Prob": "Device 1 Combined Win Prob",
                            }

                            # Only rename columns that exist in the dataframe
                            rename_filtered = {
                                k: v
                                for k, v in rename_mapping.items()
                                if k in matches_display.columns
                            }
                            matches_display = matches_display.rename(
                                columns=rename_filtered
                            )

                            # Round any numeric columns for better display
                            for col in matches_display.columns:
                                if matches_display[col].dtype in ["float64", "float32"]:
                                    matches_display[col] = matches_display[col].round(2)

                            st.dataframe(
                                matches_display,
                                use_container_width=True,
                                height=400,
                            )
                        else:
                            st.warning(
                                "No valid columns found for display in the match data."
                            )

                        # Platform breakdown if available
                        if "Platform 2" in matches_df.columns:
                            st.markdown("#### Platform Distribution")
                            platform_counts = matches_df["Platform 2"].value_counts()
                            st.bar_chart(platform_counts)
                    else:
                        st.warning(
                            f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
                        )
                        st.info(
                            "Try selecting different devices or checking if they both have benchmark data for the same models."
                        )
                except Exception as e:
                    st.error(f"An error occurred during match analysis: {str(e)}")
                    st.info("Please try with different devices.")
        else:
            st.error("Please select two different devices to compare.")
```
src/components/visualizations.py
CHANGED

```diff
@@ -8,6 +8,7 @@ import pandas as pd
 from typing import Optional, Dict, List, Set
 import plotly.graph_objects as go
 from ..core.scoring import get_quantization_tier
+from ..core.glicko2_ranking import analyze_glicko2_rankings
 
 
 def clean_device_id(device_id: str) -> str:
@@ -576,318 +577,158 @@ def render_leaderboard_table(df: pd.DataFrame, filters: Dict):
 
 
 def render_device_rankings(df: pd.DataFrame):
-    """Render device rankings ..."""
+    """Render device rankings using Glicko-2 algorithm."""
     if df.empty:
         st.warning("No data available for device rankings.")
         return
 
-    # Create device summary with aggregated scores
-    device_summary = (
-        df.groupby(["Normalized Device ID", "Platform"])
-        .agg(
-            {
-                "performance_score": "max",
-                "Model Size": ["min", "max"],
-                "tg_score": "max",  # Use normalized TG score
-                "pp_score": "max",  # Use normalized PP score
-                "Model ID": lambda x: ", ".join(sorted(set(x))),  # All models tested
-                "quant_factor": lambda x: sorted(set(x)),  # Quantization levels tested
-            }
-        )
-        .reset_index()
-    )
-
-    # Flatten column names
-    device_summary.columns = [
-        "Device ID",  # Normalized Device ID for grouping
-        "Platform",
-        "Best Score",
-        "Min Model Size",
-        "Max Model Size",
-        "TG Score",
-        "PP Score",
-        "Tested Models",
-        "Tested Quantizations",
-    ]
-
-    # Add clean device name
-    device_summary["Device"] = device_summary["Device ID"].apply(clean_device_id)
-
-    # Create three tabs for different ranking views
-    rank_tab1, rank_tab2, rank_tab3 = st.tabs(
-        ["Overall Rankings", "Rankings by Model Size", "Rankings by Quantization"]
-    )
-
-    with rank_tab1:
-        st.subheader("📱 Overall Device Rankings")
-
-        # Sort by best score
-        overall_rankings = device_summary.sort_values("Best Score", ascending=False)
-        # Add ranking column
-        overall_rankings = overall_rankings.reset_index(drop=True)
-        overall_rankings.index = overall_rankings.index + 1
-        overall_rankings = overall_rankings.rename_axis("Rank")
-
-        # Format the display columns
-        display_df = overall_rankings.copy()
-        display_df["Best Score"] = display_df["Best Score"].round(2)
-        display_df["TG Score"] = display_df["TG Score"].round(2)
-        display_df["PP Score"] = display_df["PP Score"].round(2)
-
-        display_df["Model Size Range"] = display_df.apply(
-            lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
-        )
-
-        # Select and reorder columns for display
-        display_cols = [
-            "Device",  # Use clean device name for display
-            "Platform",
-            "Best Score",
-            "TG Score",
-            "PP Score",
-            "Model Size Range",
-        ]
-
-        st.dataframe(
-            display_df[display_cols],
-            use_container_width=True,
-            height=min(
-                800, (len(display_df) + 1) * 35 + 40
-            ),  # Dynamic height based on content
-            hide_index=False,
-            column_config={
-                "Rank": st.column_config.NumberColumn(
-                    "Rank",
-                    help="Device ranking based on performance score",
-                ),
-                "Device": st.column_config.TextColumn(
-                    "Device",
-                    help="Device brand and model",
-                ),
-                "Best Score": st.column_config.NumberColumn(
-                    "Score", help="Overall performance score (0-100)", format="%.2f"
-                ),
-                "TG Score": st.column_config.NumberColumn(
-                    "TG Score",
-                    help="Normalized Token Generation score (0-100)",
-                    format="%.2f",
-                ),
-                "PP Score": st.column_config.NumberColumn(
-                    "PP Score",
-                    help="Normalized Prompt Processing score (0-100)",
-                    format="%.2f",
-                ),
-            },
-        )
-
-    with rank_tab2:
-        st.subheader("📊 Rankings by Model Size")
-
-        # Define model size categories
-        def get_size_category(size):
-            if size < 1:
-                return "Tiny (<1B)"
-            elif size < 2:
-                return "Small (1-2B)"
-            elif size < 4:
-                return "Medium (2-4B)"
-            elif size < 8:
-                return "Large (4-8B)"
-            else:
-                return "Extra Large (>8B)"
-
-        # Create size-based rankings
-        size_rankings = df.copy()
-        size_rankings["Size Category"] = size_rankings["Model Size"].apply(
-            get_size_category
-        )
-
-        size_summary = (
-            size_rankings.groupby(["Normalized Device ID", "Platform", "Size Category"])
-            .agg(
-                {
-                    "performance_score": ["max", "mean"],
-                    "tg_score": "max",  # Use normalized scores
-                    "pp_score": "max",  # Use normalized scores
-                    "Model ID": lambda x: ", ".join(sorted(set(x))),
-                }
-            )
-            .reset_index()
-        )
-
-        # Flatten column names
-        size_summary.columns = [
-            "Device ID",
-            "Platform",
-            "Size Category",
-            "Best Score",
-            "Avg Score",
-            "TG Score",
-            "PP Score",
-            "Models",
-        ]
-
-        # Add clean device name
-        size_summary["Device"] = size_summary["Device ID"].apply(clean_device_id)
-
-        for size_cat in sorted(size_summary["Size Category"].unique()):
-            st.markdown(f"##### {size_cat}")
-            cat_data = size_summary[size_summary["Size Category"] == size_cat].copy()
-            cat_data = cat_data.sort_values("Best Score", ascending=False)
-
-            # Add ranking column
-            cat_data = cat_data.reset_index(drop=True)
-            cat_data.index = cat_data.index + 1
-            cat_data = cat_data.rename_axis("Rank")
-
-            # Format scores
-            cat_data["Best Score"] = cat_data["Best Score"].round(2)
-            cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
-            cat_data["TG Score"] = cat_data["TG Score"].round(2)
-            cat_data["PP Score"] = cat_data["PP Score"].round(2)
-
-            display_cols = [
-                "Device",  # Use clean device name for display
-                "Platform",
-                "Best Score",
-                "Avg Score",
-                "TG Score",
-                "PP Score",
-            ]
-
-            st.dataframe(
-                cat_data[display_cols],
-                use_container_width=True,
-                height=min(
-                    300, (len(cat_data) + 1) * 35 + 40
-                ),  # Slightly smaller for category tables
-                hide_index=False,
-                column_config={
-                    "Rank": st.column_config.NumberColumn(
-                        "Rank",
-                        help="Device ranking within this size category",
-                    ),
-                    "Device": st.column_config.TextColumn(
-                        "Device",
-                        help="Device brand and model",
-                    ),
-                    "Best Score": st.column_config.NumberColumn(
-                        "Best Score",
-                        help="Best performance score achieved",
-                        format="%.2f",
-                    ),
-                    "Avg Score": st.column_config.NumberColumn(
-                        "Avg Score", help="Average performance score", format="%.2f"
-                    ),
-                    "TG Score": st.column_config.NumberColumn(
-                        "TG Score",
-                        help="Normalized Token Generation score (0-100)",
-                        format="%.2f",
-                    ),
-                    "PP Score": st.column_config.NumberColumn(
-                        "PP Score",
-                        help="Normalized Prompt Processing score (0-100)",
-                        format="%.2f",
-                    ),
-                },
-            )
-
-    with rank_tab3:
-        st.subheader("Rankings by Quantization")
-
-        # Create quantization-based rankings
-        quant_summary = (
-            df.groupby(["Normalized Device ID", "Platform", "quant_factor"])
-            .agg(
-                {
-                    "performance_score": ["max", "mean"],
-                    "tg_score": "max",
-                    "pp_score": "max",
-                }
-            )
-            .reset_index()
-        )
-
-        # Flatten column names
-        quant_summary.columns = [
-            "Device ID",
-            "Platform",
-            "Quant Factor",
-            "Best Score",
-            "Avg Score",
-            "TG Score",
-            "PP Score",
-        ]
-        quant_summary["Device"] = quant_summary["Device ID"].apply(clean_device_id)
-
-        for quant_level in sorted(quant_summary["Quant Factor"].unique()):
-            st.markdown(f"##### {quant_level}")
-            quant_data = quant_summary[
-                quant_summary["Quant Factor"] == quant_level
-            ].copy()
-            quant_data = quant_data.sort_values("Best Score", ascending=False)
-
-            # Add ranking column
-            quant_data = quant_data.reset_index(drop=True)
-            quant_data.index = quant_data.index + 1
-            quant_data = quant_data.rename_axis("Rank")
-
-            # Format scores
-            quant_data["Best Score"] = quant_data["Best Score"].round(2)
-            quant_data["Avg Score"] = quant_data["Avg Score"].round(2)
-            quant_data["TG Score"] = quant_data["TG Score"].round(2)
-            quant_data["PP Score"] = quant_data["PP Score"].round(2)
-
-            display_cols = [
-                "Device",
-                "Platform",
-                "Best Score",
-                "Avg Score",
-                "TG Score",
-                "PP Score",
-            ]
-
-            st.dataframe(
-                quant_data[display_cols],
-                use_container_width=True,
-                height=min(
-                    300, (len(quant_data) + 1) * 35 + 40
-                ),  # Slightly smaller for quantization tables
-                hide_index=False,
-                column_config={
-                    "Rank": st.column_config.NumberColumn(
-                        "Rank",
-                        help="Device ranking within this quantization level",
-                    ),
-                    "Device": st.column_config.TextColumn(
-                        "Device",
-                        help="Device brand and model",
-                    ),
-                    "Best Score": st.column_config.NumberColumn(
-                        "Best Score",
-                        help="Best performance score achieved",
-                        format="%.2f",
-                    ),
-                    "Avg Score": st.column_config.NumberColumn(
-                        "Avg Score", help="Average performance score", format="%.2f"
-                    ),
-                    "TG Score": st.column_config.NumberColumn(
-                        "TG Score",
-                        help="Normalized Token Generation score (0-100)",
-                        format="%.2f",
-                    ),
-                    "PP Score": st.column_config.NumberColumn(
-                        "PP Score",
-                        help="Normalized Prompt Processing score (0-100)",
-                        format="%.2f",
-                    ),
-                },
-            )
+    # Calculate Glicko-2 rankings automatically
+    with st.spinner("Calculating Glicko-2 rankings..."):
+        try:
+            g2_all, g2_confident = analyze_glicko2_rankings(
+                df,
+                min_matches=5,  # Default minimum matches
+                min_gpu_layers=20,  # Default minimum GPU layers
+            )
+
+            # Display performance overview
+            st.subheader("🏆 Performance Overview")
+
+            # Get top device from Glicko-2 rankings
+            top_device = g2_confident.index[0] if not g2_confident.empty else "N/A"
+            top_device_clean = (
+                clean_device_id(top_device) if top_device != "N/A" else "N/A"
+            )
+
+            # Calculate total unique devices and models
+            total_devices = df["Normalized Device ID"].nunique()
+            total_models = df["Model ID"].nunique()
+
+            # Display metrics in columns
+            col1, col2, col3 = st.columns([3, 1, 1])
+            with col1:
+                st.metric("Top Device", top_device_clean)
+            with col2:
+                st.metric("Total Devices", total_devices)
+            with col3:
+                st.metric("Total Models", total_models)
+
+            st.markdown("---")
+
+            # Display confident rankings
+            if not g2_confident.empty:
+                st.subheader("📱 Device Rankings")
+
+                # Create a copy and handle the index
+                g2_confident_display = g2_confident.copy()
+
+                # Get the device ID column name
+                device_id_col = g2_confident_display.index.name or "device"
+                g2_confident_display = g2_confident_display.reset_index()
+
+                # Get platform information from the original dataframe
+                platform_map = (
+                    df.groupby("Normalized Device ID")["Platform"].first().to_dict()
+                )
+                g2_confident_display["Platform"] = g2_confident_display[
+                    device_id_col
+                ].map(platform_map)
+
+                # Get model size range from the original dataframe
+                model_sizes = df.groupby("Normalized Device ID")["Model Size"].agg(
+                    ["min", "max"]
+                )
+                g2_confident_display["Model Size Range"] = g2_confident_display[
+                    device_id_col
+                ].apply(
+                    lambda x: f"{model_sizes.loc[x, 'min']:.1f}B - {model_sizes.loc[x, 'max']:.1f}B"
+                )
+
+                # Add clean device name
+                g2_confident_display["Device"] = g2_confident_display[
+                    device_id_col
+                ].apply(clean_device_id)
+
+                # Round numeric columns to whole numbers
+                numeric_cols = [
+                    "combined_rating",
+                    "combined_rd",
+                    "token_rating",
+                    "prompt_rating",
+                ]
+                for col in numeric_cols:
+                    if col in g2_confident_display.columns:
+                        g2_confident_display[col] = (
+                            g2_confident_display[col].round(0).astype(int)
+                        )
+
+                # Select and order columns for display
+                display_cols = [
+                    "Device",
+                    "Platform",
+                    "combined_rating",
+                    "combined_rd",
+                    "token_rating",
+                    "prompt_rating",
+                    "Model Size Range",
+                ]
+
+                # Rename columns for better display
+                rename_map = {
+                    "combined_rating": "Rating",
+                    "combined_rd": "Rating Deviation",
+                    "token_rating": "Token Rating",
+                    "prompt_rating": "Prompt Rating",
+                }
+                g2_confident_display = g2_confident_display.rename(columns=rename_map)
+
+                # Sort by Rating
+                g2_confident_display = g2_confident_display.sort_values(
+                    "Rating", ascending=False
+                )
+
+                # Add rank column
+                g2_confident_display = g2_confident_display.reset_index(drop=True)
+                g2_confident_display.index = g2_confident_display.index + 1
+                g2_confident_display = g2_confident_display.rename_axis("Rank")
+
+                # Display the table
+                st.dataframe(
+                    g2_confident_display[
+                        [
+                            "Device",
+                            "Platform",
+                            "Rating",
+                            "Rating Deviation",
+                            "Token Rating",
+                            "Prompt Rating",
+                            "Model Size Range",
+                        ]
+                    ],
+                    use_container_width=True,
+                    height=min(600, (len(g2_confident_display) + 1) * 35 + 40),
+                    hide_index=False,
+                )
+
+                # Platform statistics
+                st.markdown("#### Platform Statistics")
+                platform_stats = (
+                    g2_confident_display.groupby("Platform")
+                    .agg(
+                        {
+                            "Rating": ["mean", "std"],
+                        }
+                    )
+                    .round(0)
+                    .astype(int)
+                )
+                st.dataframe(platform_stats, use_container_width=True)
+
+            else:
+                st.warning(
+                    "No confident rankings available. Try adjusting the minimum matches threshold."
+                )
+
+        except Exception as e:
+            st.error(f"Error calculating Glicko-2 rankings: {str(e)}")
```
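Because the Streamlit layer above is only presentation, the ranking call can be exercised on its own. The call signature, the returned `(all, confident)` pair, and the rating column names below are read directly off this diff; the input path is a hypothetical stand-in:

```python
import pandas as pd

from src.core.glicko2_ranking import analyze_glicko2_rankings

# Hypothetical input; the frame must carry the columns the UI relies on,
# e.g. "Normalized Device ID", "Platform", "Model ID", "Model Size".
df = pd.read_json("benchmark_results.json")

g2_all, g2_confident = analyze_glicko2_rankings(
    df,
    min_matches=5,      # same defaults the UI passes
    min_gpu_layers=20,
)

# g2_confident is indexed by device and carries the rating columns
# that the table above renders after renaming.
top = g2_confident.sort_values("combined_rating", ascending=False)
print(top[["combined_rating", "combined_rd", "token_rating", "prompt_rating"]].head())
```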
    	
src/core/glicko2_ranking.py
ADDED

@@ -0,0 +1,618 @@
(618 added lines; the file body was not preserved in this capture)
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
```python
"""
Glicko-2 Ranking System for Device Performance Comparison

This module implements a Glicko-2 based ranking system for comparing device performance
in benchmark tests. Glicko-2 is an improvement over the original Glicko system and Elo,
providing better handling of rating uncertainty and volatility.

The system:
1. Filters out emulators and iOS devices with insufficient GPU layers
2. Normalizes scores within each model group
3. Computes Glicko-2 ratings for devices based on their performance
4. Provides uncertainty metrics alongside ratings
5. Supports both combined and separate analysis of Token Generation and Prompt Processing
"""

import numpy as np
import pandas as pd
from collections import defaultdict
from typing import Tuple, Dict, List, Optional
import glicko2
```
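Before the implementation, it helps to see the curve the ratings feed. The sketch below writes out the standard Glicko-2 expected-score formula from Glickman's specification in plain Python, for intuition only; it is not part of the module, and the `glicko2` package used below supplies its own implementation. The point: an opponent with a large rating deviation (RD) drags the expectation back toward 0.5, which is what "uncertainty-aware" buys over plain Elo.

```python
import math

SCALE = 173.7178  # Glicko-2 conversion between display ratings and internal mu

def g(rd: float) -> float:
    # Impact factor: a large opponent RD pushes g below 1, flattening the curve
    return 1.0 / math.sqrt(1.0 + 3.0 * (rd / SCALE) ** 2 / math.pi**2)

def expected_score(rating: float, opp_rating: float, opp_rd: float) -> float:
    # E = 1 / (1 + exp(-g(phi_opp) * (mu - mu_opp))), with mu = (r - 1500) / SCALE
    mu = (rating - 1500.0) / SCALE
    mu_opp = (opp_rating - 1500.0) / SCALE
    return 1.0 / (1.0 + math.exp(-g(opp_rd) * (mu - mu_opp)))

print(expected_score(1500, 1600, 30))   # ~0.36 against a confidently rated opponent
print(expected_score(1500, 1600, 350))  # ~0.40: an uncertain opponent rating means less
```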
```python
def preprocess_benchmark_data(
    df: pd.DataFrame,
    min_gpu_layers: int = 20,
    pp_config: int = 512,
    tg_config: int = 128,
) -> pd.DataFrame:
    """
    Preprocess benchmark data by filtering out invalid entries.

    Args:
        df: DataFrame containing benchmark data
        min_gpu_layers: Minimum number of GPU layers required for iOS devices
        pp_config: Prompt Processing configuration to filter for
        tg_config: Token Generation configuration to filter for

    Returns:
        Filtered DataFrame containing only valid benchmark entries
    """
    # Create a mask for devices to keep
    keep_device = (
        # Keep non-iOS devices ...
        (
            (df["Platform"] != "iOS")
            # ... or iOS devices with sufficient GPU layers
            | ((df["Platform"] == "iOS") & (df["n_gpu_layers"] >= min_gpu_layers))
        )
        # Remove emulators
        & (~df["Normalized Device ID"].str.contains("Emulator", case=False, na=False))
        # Filter by configuration
        & (df["PP Config"] == pp_config)
        & (df["TG Config"] == tg_config)
    )

    filtered_df = df[keep_device].copy()

    # Print filtering statistics
    total_devices = df["Normalized Device ID"].nunique()
    filtered_devices = filtered_df["Normalized Device ID"].nunique()
    emulator_devices = df[
        df["Normalized Device ID"].str.contains("Emulator", case=False, na=False)
    ]["Normalized Device ID"].nunique()

    print("Filtering Statistics:")
    print(f"Original devices: {total_devices}")
    print(f"Emulator devices removed: {emulator_devices}")
    # Note: this difference also absorbs devices dropped by the PP/TG config
    # filter, so it can overstate the iOS GPU-layer removals.
    print(
        f"iOS devices with insufficient GPU layers removed: "
        f"{total_devices - filtered_devices - emulator_devices}"
    )
    print(f"Final device count: {filtered_devices}")

    # Print removed devices for verification
    print(
        f"Removed {set(df['Normalized Device ID'].unique()) - set(filtered_df['Normalized Device ID'].unique())}"
    )

    return filtered_df
```
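A toy invocation of the filter, with made-up rows (the device names and numbers are illustrative, and the import path assumes the repo root is importable; the columns are exactly the ones the function reads):

```python
import pandas as pd

from src.core.glicko2_ranking import preprocess_benchmark_data

toy = pd.DataFrame(
    {
        "Normalized Device ID": ["Pixel 8", "iPhone 15 Emulator", "iPhone 15"],
        "Platform": ["Android", "iOS", "iOS"],
        "n_gpu_layers": [0, 99, 33],
        "PP Config": [512, 512, 512],
        "TG Config": [128, 128, 128],
        "Model ID": ["llama-7b-q4"] * 3,
        "Token Generation": [11.2, 25.0, 24.1],
        "Prompt Processing": [96.0, 310.0, 290.5],
    }
)

clean = preprocess_benchmark_data(toy)
# Keeps "Pixel 8" (non-iOS) and "iPhone 15" (iOS with n_gpu_layers >= 20);
# the emulator row is dropped by the name filter.
```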
```python
def compute_glicko2_rankings(
    df: pd.DataFrame, token_weight: float = 0.6
) -> pd.DataFrame:
    """
    Compute device rankings using the Glicko-2 rating system.

    Args:
        df: DataFrame containing benchmark data
        token_weight: Weight for Token Generation in combined score (0.0 to 1.0)

    Returns:
        DataFrame containing device rankings and statistics
    """
    # Glicko-2 state and win/loss bookkeeping per device
    ratings = {}
    match_counts = defaultdict(int)
    win_counts = defaultdict(int)
    loss_counts = defaultdict(int)

    # Default Glicko-2 settings:
    # Rating = 1500, RD (rating deviation) = 350, Volatility = 0.06
    def create_glicko2_rating():
        return glicko2.Player(rating=1500, rd=350, vol=0.06)

    def normalize_scores(group: pd.DataFrame) -> pd.Series:
        """Normalize and combine scores within a model group"""
        # Normalize Token Generation (higher is better)
        token_min = group["Token Generation"].min()
        token_max = group["Token Generation"].max()
        token_norm = (
            (group["Token Generation"] - token_min) / (token_max - token_min)
            if token_max > token_min
            else 0
        )

        # Normalize Prompt Processing (higher is better)
        prompt_min = group["Prompt Processing"].min()
        prompt_max = group["Prompt Processing"].max()
        prompt_norm = (
            (group["Prompt Processing"] - prompt_min) / (prompt_max - prompt_min)
            if prompt_max > prompt_min
            else 0
        )

        # Weighted combination of the two normalized metrics
        return token_weight * token_norm + (1 - token_weight) * prompt_norm

    # Initialize ratings for all devices
    all_devices = df["Normalized Device ID"].unique()
    for device in all_devices:
        ratings[device] = create_glicko2_rating()

    # Process each model separately
    for model, group in df.groupby("Model ID"):
        # Copy to avoid mutating the groupby slice, then add the combined score
        group = group.copy()
        group["combined_score"] = normalize_scores(group)

        devices = group["Normalized Device ID"].unique()

        # In Glicko-2, all results for a rating period are collected before
        # updating; here a rating period is all pairwise matches for one model.
        device_matches = defaultdict(
            lambda: {"opponent_ratings": [], "opponent_rds": [], "outcomes": []}
        )

        for i in range(len(devices)):
            for j in range(i + 1, len(devices)):
                device1 = devices[i]
                device2 = devices[j]

                score1 = group[group["Normalized Device ID"] == device1][
                    "combined_score"
                ].iloc[0]
                score2 = group[group["Normalized Device ID"] == device2][
                    "combined_score"
                ].iloc[0]

                # Update match counts
                match_counts[device1] += 1
                match_counts[device2] += 1

                # Determine outcomes (1 = win, 0 = loss, 0.5 = draw)
                if score1 > score2:
                    win_counts[device1] += 1
                    loss_counts[device2] += 1
                    outcome1, outcome2 = 1, 0
                elif score1 < score2:
                    win_counts[device2] += 1
                    loss_counts[device1] += 1
                    outcome1, outcome2 = 0, 1
                else:
                    outcome1 = outcome2 = 0.5  # Draw

                # Record the pairing from both sides
                device_matches[device1]["opponent_ratings"].append(
                    ratings[device2].rating
                )
                device_matches[device1]["opponent_rds"].append(ratings[device2].rd)
                device_matches[device1]["outcomes"].append(outcome1)

                device_matches[device2]["opponent_ratings"].append(
                    ratings[device1].rating
                )
                device_matches[device2]["opponent_rds"].append(ratings[device1].rd)
                device_matches[device2]["outcomes"].append(outcome2)

        # Update ratings after the model's rating period
        for device, matches in device_matches.items():
            # Only update devices that had matches in this period
            if matches["opponent_ratings"]:
                # update_player takes three parallel lists:
                # opponent ratings, opponent rating deviations, and outcomes
                ratings[device].update_player(
                    matches["opponent_ratings"],
                    matches["opponent_rds"],
                    matches["outcomes"],
                )

    # Convert to DataFrame
    ranking_data = []
    for device, rating in ratings.items():
        if match_counts[device] > 0:  # Only include devices with matches
            ranking_data.append(
                {
                    "device": device,
                    "rating": rating.rating,
                    "rd": rating.rd,  # rating deviation (uncertainty)
                    "volatility": rating.vol,
                    "matches": match_counts[device],
                    "wins": win_counts[device],
                    "losses": loss_counts[device],
                    # Conservative rating (95% confidence lower bound)
                    "conserv_rating": rating.rating - (2 * rating.rd),
                }
            )

    ranking_df = pd.DataFrame(ranking_data)

    if len(ranking_df) > 0:
        # Add win rate
        ranking_df["win_rate"] = ranking_df["wins"] / ranking_df["matches"]

        # Add platform information; map by device name rather than assigning a
        # device-keyed Series, which would misalign with the default integer index
        ranking_df["Platform"] = ranking_df["device"].map(
            lambda d: df[df["Normalized Device ID"] == d]["Platform"].iloc[0]
        )

        # Set device as index
        ranking_df = ranking_df.set_index("device")

    return ranking_df
```
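To see the rating-period batching in isolation, here is a minimal standalone sketch using the same `glicko2.Player.update_player` call the loop above relies on; the opponents and outcomes are fabricated:

```python
import glicko2

# One rating period: two wins and a draw against three average opponents
player = glicko2.Player(rating=1500, rd=350, vol=0.06)
player.update_player([1500, 1500, 1500], [350, 350, 350], [1, 1, 0.5])

# The rating rises above 1500 and the RD drops below 350 as evidence accumulates
print(round(player.rating, 1), round(player.rd, 1))
```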
```python
def analyze_glicko2_rankings(
    df: pd.DataFrame,
    min_matches: int = 5,
    min_gpu_layers: int = 20,
    pp_config: int = 512,
    tg_config: int = 128,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Analyze and display ranking results with Glicko-2 ratings.

    Args:
        df: DataFrame containing benchmark data
        min_matches: Minimum number of matches required for confident rankings
        min_gpu_layers: Minimum number of GPU layers required for iOS devices
        pp_config: Prompt Processing configuration to filter for
        tg_config: Token Generation configuration to filter for

    Returns:
        Tuple of (all rankings DataFrame, confident rankings DataFrame)
    """
    # First filter the data
    filtered_df = preprocess_benchmark_data(df, min_gpu_layers, pp_config, tg_config)
    print(
        f'Filtered number of devices: {filtered_df["Normalized Device ID"].nunique()}'
    )
    print(f"Filtered number of rows: {filtered_df.shape}")
    print(f"Original number of rows: {df.shape}")

    # Compute rankings for all three scenarios
    combined_rankings = compute_glicko2_rankings(filtered_df, token_weight=0.6)
    token_rankings = compute_glicko2_rankings(filtered_df, token_weight=1.0)
    prompt_rankings = compute_glicko2_rankings(filtered_df, token_weight=0.0)

    # Rename columns to avoid confusion
    combined_rankings = combined_rankings.rename(
        columns={
            "rating": "combined_rating",
            "rd": "combined_rd",
            "volatility": "combined_vol",
            "conserv_rating": "combined_conserv",
            "wins": "combined_wins",
            "losses": "combined_losses",
            "win_rate": "combined_win_rate",
        }
    )

    token_rankings = token_rankings.rename(
        columns={
            "rating": "token_rating",
            "rd": "token_rd",
            "volatility": "token_vol",
            "conserv_rating": "token_conserv",
            "wins": "token_wins",
            "losses": "token_losses",
            "win_rate": "token_win_rate",
        }
    )

    prompt_rankings = prompt_rankings.rename(
        columns={
            "rating": "prompt_rating",
            "rd": "prompt_rd",
            "volatility": "prompt_vol",
            "conserv_rating": "prompt_conserv",
            "wins": "prompt_wins",
            "losses": "prompt_losses",
            "win_rate": "prompt_win_rate",
        }
    )

    # Combine all rankings into one DataFrame, keeping a single set of match
    # counts (they are identical across the three scenarios)
    rankings = combined_rankings.copy()

    # Add token generation rankings
    for col in [
        "token_rating",
        "token_rd",
        "token_vol",
        "token_conserv",
        "token_wins",
        "token_losses",
        "token_win_rate",
    ]:
        rankings[col] = token_rankings[col]

    # Add prompt processing rankings
    for col in [
        "prompt_rating",
        "prompt_rd",
        "prompt_vol",
        "prompt_conserv",
        "prompt_wins",
        "prompt_losses",
        "prompt_win_rate",
    ]:
        rankings[col] = prompt_rankings[col]

    # Filter for minimum matches
    confident_rankings = rankings[rankings["matches"] >= min_matches].sort_values(
        "combined_rating", ascending=False
    )

    # Print statistics
    print("\nRanking Statistics:")
    print(f"Total devices ranked: {len(rankings)}")
    print(f"Devices with {min_matches}+ matches: {len(confident_rankings)}")

    print("\nTop 10 Devices:")
    columns_to_show = [
        "combined_rating",
        "combined_rd",
        "token_rating",
        "prompt_rating",
        "matches",
        "Platform",
    ]
    print(confident_rankings[columns_to_show].head(10))

    print("\nPlatform Statistics:")
    platform_stats = confident_rankings.groupby("Platform").agg(
        {
            "combined_rating": ["count", "mean", "std"],
            "token_rating": ["mean", "std"],
            "prompt_rating": ["mean", "std"],
            "matches": "mean",
            "combined_win_rate": "mean",
        }
    )
    print(platform_stats)

    # Calculate correlations between different ratings
    correlations = confident_rankings[
        ["combined_rating", "token_rating", "prompt_rating"]
    ].corr()
    print("\nRating Correlations:")
    print(correlations)

    return rankings, confident_rankings
```
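End to end, the entry point only needs a DataFrame with the columns referenced above. A hypothetical invocation (the CSV path is illustrative, and the import assumes the repo root is on `sys.path`):

```python
import pandas as pd

from src.core.glicko2_ranking import analyze_glicko2_rankings

df = pd.read_csv("benchmarks.csv")  # hypothetical export with the columns used above
all_rankings, confident = analyze_glicko2_rankings(df, min_matches=5)

# Devices sorted by combined rating, with uncertainty alongside
print(confident[["combined_rating", "combined_rd", "combined_conserv", "matches"]].head())
```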
| 413 | 
         
            +
            def analyze_device_glicko2_matches(
         
     | 
| 414 | 
         
            +
                df: pd.DataFrame,
         
     | 
| 415 | 
         
            +
                device_id1: str,
         
     | 
| 416 | 
         
            +
                device_id2: Optional[str] = None,
         
     | 
| 417 | 
         
            +
                token_weight: float = 0.6,
         
     | 
| 418 | 
         
            +
            ) -> pd.DataFrame:
         
     | 
| 419 | 
         
            +
                """
         
     | 
| 420 | 
         
            +
                Analyze all matches for one or two specific devices using the Glicko-2 methodology.
         
     | 
| 421 | 
         
            +
             
     | 
| 422 | 
         
            +
                Args:
         
     | 
| 423 | 
         
            +
                    df: DataFrame containing benchmark data
         
     | 
| 424 | 
         
            +
                    device_id1: First device ID to analyze
         
     | 
| 425 | 
         
            +
                    device_id2: Optional second device ID to compare against
         
     | 
| 426 | 
         
            +
                    token_weight: Weight for Token Generation in combined score (0.0 to 1.0)
         
     | 
| 427 | 
         
            +
             
     | 
| 428 | 
         
            +
                Returns:
         
     | 
| 429 | 
         
            +
                    DataFrame containing detailed match information with win probabilities
         
     | 
| 430 | 
         
            +
                """
         
     | 
| 431 | 
         
            +
                matches = []
         
     | 
| 432 | 
         
            +
             
     | 
| 433 | 
         
            +
                def normalize_scores(group: pd.DataFrame) -> Dict[str, Dict]:
         
     | 
| 434 | 
         
            +
                    """Normalize scores within a model group and return as dict"""
         
     | 
| 435 | 
         
            +
                    # Normalize Token Generation (higher is better)
         
     | 
| 436 | 
         
            +
                    token_min = group["Token Generation"].min()
         
     | 
| 437 | 
         
            +
                    token_max = group["Token Generation"].max()
         
     | 
| 438 | 
         
            +
                    token_range = token_max - token_min
         
     | 
| 439 | 
         
            +
             
     | 
| 440 | 
         
            +
                    # Normalize Prompt Processing (higher is better)
         
     | 
| 441 | 
         
            +
                    prompt_min = group["Prompt Processing"].min()
         
     | 
| 442 | 
         
            +
                    prompt_max = group["Prompt Processing"].max()
         
     | 
| 443 | 
         
            +
                    prompt_range = prompt_max - prompt_min
         
     | 
| 444 | 
         
            +
             
     | 
| 445 | 
         
            +
                    # Calculate normalized scores for each device
         
     | 
| 446 | 
         
            +
                    result = {}
         
     | 
| 447 | 
         
            +
                    for _, row in group.iterrows():
         
     | 
| 448 | 
         
            +
                        device_id = row["Normalized Device ID"]
         
     | 
| 449 | 
         
            +
                        if token_range > 0 and prompt_range > 0:
         
     | 
| 450 | 
         
            +
                            token_norm = (row["Token Generation"] - token_min) / token_range
         
     | 
| 451 | 
         
            +
                            prompt_norm = (row["Prompt Processing"] - prompt_min) / prompt_range
         
     | 
| 452 | 
         
            +
                            combined = token_weight * token_norm + (1 - token_weight) * prompt_norm
         
     | 
| 453 | 
         
            +
                            result[device_id] = {
         
     | 
| 454 | 
         
            +
                                "token_norm": token_norm,
         
     | 
| 455 | 
         
            +
                                "prompt_norm": prompt_norm,
         
     | 
| 456 | 
         
            +
                                "combined": combined,
         
     | 
| 457 | 
         
            +
                            }
         
     | 
| 458 | 
         
            +
                    return result
         
     | 
| 459 | 
         
            +
             
     | 
| 460 | 
         
            +
                # Group by Model ID to compare within same models
         
     | 
| 461 | 
         
            +
                for model, group in df.groupby("Model ID"):
         
     | 
| 462 | 
         
            +
                    if device_id1 not in group["Normalized Device ID"].values:
         
     | 
| 463 | 
         
            +
                        continue
         
     | 
| 464 | 
         
            +
             
     | 
| 465 | 
         
            +
                    device1_data = group[group["Normalized Device ID"] == device_id1].iloc[0]
         
     | 
| 466 | 
         
            +
             
     | 
| 467 | 
         
            +
                    # If device2 specified, only compare those two
         
     | 
| 468 | 
         
            +
                    if device_id2 is not None:
         
     | 
| 469 | 
         
            +
                        if device_id2 not in group["Normalized Device ID"].values:
         
     | 
| 470 | 
         
            +
                            continue
         
     | 
| 471 | 
         
            +
                        devices_to_compare = [device_id2]
         
     | 
| 472 | 
         
            +
                    else:
         
     | 
| 473 | 
         
            +
                        devices_to_compare = [
         
     | 
| 474 | 
         
            +
                            d for d in group["Normalized Device ID"].unique() if d != device_id1
         
     | 
| 475 | 
         
            +
                        ]
         
     | 
| 476 | 
         
            +
             
     | 
| 477 | 
         
            +
                    # Get normalized scores
         
     | 
| 478 | 
         
            +
                    norm_scores = normalize_scores(group)
         
     | 
| 479 | 
         
            +
             
     | 
| 480 | 
         
            +
                    # Compare with other devices
         
     | 
| 481 | 
         
            +
                    for other_device in devices_to_compare:
         
     | 
| 482 | 
         
            +
                        device2_data = group[group["Normalized Device ID"] == other_device].iloc[0]
         
     | 
| 483 | 
         
            +
             
     | 
| 484 | 
         
            +
                        # Skip if normalization failed
         
     | 
| 485 | 
         
            +
                        if device_id1 not in norm_scores or other_device not in norm_scores:
         
     | 
| 486 | 
         
            +
                            continue
         
     | 
| 487 | 
         
            +
             
     | 
| 488 | 
         
            +
                        # Get normalized scores
         
     | 
| 489 | 
         
            +
                        scores1 = norm_scores[device_id1]
         
     | 
| 490 | 
         
            +
                        scores2 = norm_scores[other_device]
         
     | 
| 491 | 
         
            +
             
     | 
| 492 | 
         
            +
                        # Initialize Glicko-2 players for demonstration purposes
         
     | 
| 493 | 
         
            +
                        p1 = glicko2.Player()  # Default rating (1500, 350, 0.06)
         
     | 
| 494 | 
         
            +
                        p2 = glicko2.Player()
         
     | 
| 495 | 
         
            +
             
     | 
| 496 | 
         
            +
                        # Calculate win probability using Glicko-2 formulas
         
     | 
| 497 | 
         
            +
                        # We need to use the expect_score method, which takes a single player as input
         
     | 
| 498 | 
         
            +
                        token_prob = p1.expect_score(p2.rating, p2.rd)  # Properly use the method
         
```python
                # Expected win probabilities for device 1; note that
                # combined_prob is computed from the same rating pair as
                # prompt_prob.
                prompt_prob = p1.expect_score(p2.rating, p2.rd)
                combined_prob = p1.expect_score(p2.rating, p2.rd)

                # Determine winners
                token_winner = (
                    device_id1
                    if device1_data["Token Generation"] > device2_data["Token Generation"]
                    else (
                        other_device
                        if device2_data["Token Generation"]
                        > device1_data["Token Generation"]
                        else "Tie"
                    )
                )
                prompt_winner = (
                    device_id1
                    if device1_data["Prompt Processing"] > device2_data["Prompt Processing"]
                    else (
                        other_device
                        if device2_data["Prompt Processing"]
                        > device1_data["Prompt Processing"]
                        else "Tie"
                    )
                )
                combined_winner = (
                    device_id1
                    if scores1["combined"] > scores2["combined"]
                    else (
                        other_device if scores2["combined"] > scores1["combined"] else "Tie"
                    )
                )

                matches.append(
                    {
                        "Model": model,
                        "Device 1": device_id1,
                        "Device 2": other_device,
                        "n_gpu_layers 1": device1_data["n_gpu_layers"],
                        "n_gpu_layers 2": device2_data["n_gpu_layers"],
                        "Token Generation 1": device1_data["Token Generation"],
                        "Token Generation 2": device2_data["Token Generation"],
                        "Token Winner": token_winner,
                        "Token Win Prob": token_prob,
                        "Prompt Processing 1": device1_data["Prompt Processing"],
                        "Prompt Processing 2": device2_data["Prompt Processing"],
                        "Prompt Winner": prompt_winner,
                        "Prompt Win Prob": prompt_prob,
                        "Combined Winner": combined_winner,
                        "Combined Win Prob": combined_prob,
                        "Platform 1": device1_data["Platform"],
                        "Platform 2": device2_data["Platform"],
                    }
                )

    matches_df = pd.DataFrame(matches)

    if len(matches_df) > 0:
        # Add summary statistics
        print(f"\nMatch Summary for {device_id1}:")
        print(f"n_gpu_layers for Device 1: {matches_df['n_gpu_layers 1'].iloc[0]}")
        if device_id2:
            print(f"Total matches against {device_id2}: {len(matches_df)}")
            print(f"n_gpu_layers for Device 2: {matches_df['n_gpu_layers 2'].iloc[0]}")
        else:
            print(f"Total matches: {len(matches_df)}")
            print("\nOpponent n_gpu_layers distribution:")
            print(matches_df["n_gpu_layers 2"].value_counts().sort_index())

        token_wins = sum(matches_df["Token Winner"] == device_id1)
        prompt_wins = sum(matches_df["Prompt Winner"] == device_id1)
        combined_wins = sum(matches_df["Combined Winner"] == device_id1)

        print(
            f"\nToken Generation Wins: {token_wins} ({token_wins/len(matches_df)*100:.1f}%)"
        )
        print(
            f"Prompt Processing Wins: {prompt_wins} ({prompt_wins/len(matches_df)*100:.1f}%)"
        )
        print(
            f"Combined Wins: {combined_wins} ({combined_wins/len(matches_df)*100:.1f}%)"
        )

        # Platform breakdown
        print("\nMatches by Platform:")
        platform_counts = matches_df["Platform 2"].value_counts()
        print(platform_counts)

        # Show detailed matches
        print("\nDetailed Matches:")
        display_cols = [
            "Model",
            "Device 2",
            "Platform 2",
            "n_gpu_layers 1",
            "n_gpu_layers 2",
            "Token Generation 1",
            "Token Generation 2",
            "Token Winner",
            "Prompt Processing 1",
            "Prompt Processing 2",
            "Prompt Winner",
        ]
        print(matches_df[display_cols])

        return matches_df
    else:
        print(
            f"No matches found for device {device_id1}"
            + (f" against {device_id2}" if device_id2 else "")
        )
        return pd.DataFrame()
```
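The match loop above leans on `expect_score` to turn two Glicko-2 ratings into a win probability. For reference, here is a minimal sketch of Glickman's expected-score formula, under the assumption that `expect_score(rating, rd)` follows it; the names `expected_score`, `g`, and `GLICKO2_SCALE` are illustrative, not this module's API.

```python
import math

# Scale constant from Glickman's Glicko-2 paper: converts between the
# familiar 1500-centred rating scale and the internal Glicko-2 scale.
GLICKO2_SCALE = 173.7178


def g(phi: float) -> float:
    # Dampens the opponent's influence in proportion to their rating deviation.
    return 1.0 / math.sqrt(1.0 + 3.0 * phi**2 / math.pi**2)


def expected_score(rating1: float, rating2: float, rd2: float) -> float:
    # P(player 1 beats player 2), with ratings and RD on the 1500 scale.
    mu1 = (rating1 - 1500.0) / GLICKO2_SCALE
    mu2 = (rating2 - 1500.0) / GLICKO2_SCALE
    phi2 = rd2 / GLICKO2_SCALE
    return 1.0 / (1.0 + math.exp(-g(phi2) * (mu1 - mu2)))


# A device rated 1600 (RD 50) against one rated 1500 (RD 200): the favourite
# wins on expectation, but the opponent's uncertainty pulls the result
# toward 0.5.
print(f"{expected_score(1600, 1500, 200):.3f}")  # ~0.62
```

A large opponent RD shrinks `g(phi)` toward zero, which is why devices with few benchmark runs yield match probabilities close to a coin flip.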
             
```python
if __name__ == "__main__":
    # Example usage
    print("This module provides Glicko-2 ranking for device performance.")
    print("Import and use the functions in your own code.")
    print("Example:")
    print("  from glicko2_ranking import analyze_glicko2_rankings")
    print("  rankings, confident_rankings = analyze_glicko2_rankings(df)")
```
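Following the hints printed by the `__main__` block, a hypothetical end-to-end invocation could look like the sketch below. The DataFrame schema is an assumption inferred from the fields this file reads ("Token Generation", "Prompt Processing", "n_gpu_layers", "Platform", and a model identifier); the exact column names `analyze_glicko2_rankings` expects may differ.

```python
import pandas as pd

from glicko2_ranking import analyze_glicko2_rankings

# Two devices, two benchmark runs each, on the same model
# (illustrative numbers and column names).
df = pd.DataFrame(
    {
        "Model": ["llama-7b-q4_0.gguf"] * 4,
        "Device": ["iPhone 15 Pro", "Pixel 8", "iPhone 15 Pro", "Pixel 8"],
        "Platform": ["iOS", "Android", "iOS", "Android"],
        "n_gpu_layers": [33, 33, 33, 33],
        "Token Generation": [21.5, 18.2, 22.1, 17.9],       # tokens/s
        "Prompt Processing": [180.0, 140.0, 175.0, 150.0],  # tokens/s
    }
)

rankings, confident_rankings = analyze_glicko2_rankings(df)
print(rankings.head())
```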