Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -348,34 +348,82 @@ def create_source_html(sources):
|
|
| 348 |
html += "</div>"
|
| 349 |
return html
|
| 350 |
|
| 351 |
-
def create_leaderboard():
|
| 352 |
scores = []
|
| 353 |
for model, data in models.items():
|
| 354 |
total_score = 0
|
| 355 |
total_questions = 0
|
|
|
|
| 356 |
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
for section in category.values():
|
| 359 |
if section['status'] != 'N/A':
|
| 360 |
questions = section.get('questions', {})
|
| 361 |
-
|
| 362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
|
|
|
| 364 |
score_percentage = (total_score / total_questions * 100) if total_questions > 0 else 0
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
|
| 367 |
-
|
| 368 |
-
df =
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
html += "<table class='leaderboard-table'>"
|
| 373 |
-
html += "<tr><th>Rank</th><th>Model</th><th>Score Percentage</th></tr>"
|
| 374 |
-
for i, (_, row) in enumerate(df.iterrows(), 1):
|
| 375 |
-
html += f"<tr><td>{i}</td><td>{row['Model']}</td><td>{row['Score Percentage']:.2f}%</td></tr>"
|
| 376 |
-
html += "</table></div>"
|
| 377 |
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
def create_category_chart(selected_models, selected_categories):
|
| 381 |
if not selected_models:
|
|
@@ -1070,6 +1118,98 @@ css = """
|
|
| 1070 |
.dark .completion-bar-container.na .completion-bar {
|
| 1071 |
background-color: #666;
|
| 1072 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1073 |
"""
|
| 1074 |
|
| 1075 |
first_model = next(iter(models.values()))
|
|
@@ -1080,7 +1220,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 1080 |
|
| 1081 |
with gr.Row():
|
| 1082 |
tab_selection = gr.Radio(["Leaderboard", "Category Analysis", "Detailed Scorecard"],
|
| 1083 |
-
|
| 1084 |
|
| 1085 |
with gr.Row():
|
| 1086 |
model_chooser = gr.Dropdown(choices=[""] + list(models.keys()),
|
|
@@ -1088,15 +1228,25 @@ with gr.Blocks(css=css) as demo:
|
|
| 1088 |
value="",
|
| 1089 |
interactive=True, visible=False)
|
| 1090 |
model_multi_chooser = gr.Dropdown(choices=list(models.keys()),
|
| 1091 |
-
label="Select Models for Comparison",
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1097 |
|
| 1098 |
with gr.Column(visible=True) as leaderboard_tab:
|
| 1099 |
-
leaderboard_output = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
|
| 1101 |
with gr.Column(visible=False) as category_analysis_tab:
|
| 1102 |
category_chart = gr.Plot()
|
|
@@ -1106,55 +1256,60 @@ with gr.Blocks(css=css) as demo:
|
|
| 1106 |
all_category_cards = gr.HTML()
|
| 1107 |
total_score = gr.Markdown()
|
| 1108 |
|
| 1109 |
-
# Initialize the dashboard
|
| 1110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
|
|
|
|
| 1112 |
def update_dashboard(tab, selected_models, selected_model, selected_categories):
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
elif tab == "Category Analysis":
|
| 1128 |
-
category_chart_visibility = gr.update(visible=True)
|
| 1129 |
-
model_multi_chooser_visibility = gr.update(visible=True)
|
| 1130 |
-
category_filter_visibility = gr.update(visible=True)
|
| 1131 |
-
category_plot = create_category_chart(selected_models or [], selected_categories)
|
| 1132 |
-
return [leaderboard_visibility, category_chart_visibility, detailed_scorecard_visibility,
|
| 1133 |
-
model_chooser_visibility, model_multi_chooser_visibility, category_filter_visibility,
|
| 1134 |
-
gr.update(), gr.update(value=category_plot), gr.update(), gr.update(), gr.update()]
|
| 1135 |
|
| 1136 |
-
|
| 1137 |
-
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1151 |
|
| 1152 |
# Set up event handlers
|
| 1153 |
tab_selection.change(
|
| 1154 |
fn=update_dashboard,
|
| 1155 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 1156 |
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 1157 |
-
model_chooser, model_multi_chooser,
|
| 1158 |
leaderboard_output, category_chart, model_metadata,
|
| 1159 |
all_category_cards, total_score]
|
| 1160 |
)
|
|
@@ -1181,7 +1336,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 1181 |
fn=update_dashboard,
|
| 1182 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 1183 |
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 1184 |
-
model_chooser, model_multi_chooser,
|
| 1185 |
leaderboard_output, category_chart, model_metadata,
|
| 1186 |
all_category_cards, total_score]
|
| 1187 |
)
|
|
|
|
| 348 |
html += "</div>"
|
| 349 |
return html
|
| 350 |
|
| 351 |
+
def create_leaderboard(selected_categories):
    """Build the leaderboard table as a pandas DataFrame.

    For every entry in the module-level ``models`` mapping, counts the truthy
    answers in all sections whose status is not ``'N/A'`` and converts them
    into an overall completion percentage plus one percentage per category.

    Args:
        selected_categories: Collection of full category names whose
            per-category column should appear in the table. ``None`` is
            treated as "no category selected".

    Returns:
        A DataFrame sorted by overall completion rate (descending) with the
        columns ``Rank``, ``Model``, ``Type``, ``Overall Completion Rate`` and
        one column per selected category. Percentages are formatted as
        strings such as ``"87.5%"``. When ``models`` is empty, an empty
        DataFrame with the same columns is returned.
    """
    # Full category name -> short display label used as the column header.
    # Built once, before the loop, because it is also needed after the loop
    # (and must exist even when ``models`` is empty).
    category_map = {
        '1. Bias, Stereotypes, and Representational Harms Evaluation': '⚖️ Bias and Fairness',
        '2. Cultural Values and Sensitive Content Evaluation': '🌍 Cultural Values',
        '3. Disparate Performance Evaluation': '📊 Disparate Performance',
        '4. Environmental Costs and Carbon Emissions Evaluation': '🌱 Environmental Impact',
        '5. Privacy and Data Protection Evaluation': '🔒 Privacy',
        '6. Financial Costs Evaluation': '💰 Financial Costs',
        '7. Data and Content Moderation Labor Evaluation': '👥 Labor Practices',
    }

    selected = selected_categories if selected_categories is not None else ()
    # Only categories that are both known and selected get a column.
    selected_columns = [
        (full_cat_name, display_name)
        for full_cat_name, display_name in category_map.items()
        if full_cat_name in selected
    ]

    scores = []
    for model, data in models.items():
        total_score = 0
        total_questions = 0
        score_by_category = {}

        # Calculate scores by category
        for category_name, category in data['scores'].items():
            category_score = 0
            category_total = 0

            for section in category.values():
                if section['status'] != 'N/A':
                    questions = section.get('questions', {})
                    category_score += sum(1 for q in questions.values() if q)
                    category_total += len(questions)

            if category_total > 0:
                score_by_category[category_name] = (category_score / category_total) * 100
                total_score += category_score
                total_questions += category_total

        # Calculate overall score; a model without any counted question scores 0
        score_percentage = (total_score / total_questions * 100) if total_questions > 0 else 0

        # Create entry with numerical scores (formatted only after sorting)
        model_entry = {
            'Model': model,
            'Type': data['metadata'].get('Type', 'Unknown'),
            'Overall Completion Rate': score_percentage,
        }

        # A selected category the model has no score for is reported as 0
        for full_cat_name, display_name in selected_columns:
            model_entry[display_name] = score_by_category.get(full_cat_name, 0)

        scores.append(model_entry)

    if not scores:
        # Nothing to rank: return an empty table with the expected columns
        # instead of failing in sort_values on a column that does not exist.
        empty_columns = ['Rank', 'Model', 'Type', 'Overall Completion Rate']
        empty_columns += [display_name for _, display_name in selected_columns]
        return pd.DataFrame(columns=empty_columns)

    # Convert to DataFrame
    df = pd.DataFrame(scores)

    # Sort by Overall Completion Rate descending
    df = df.sort_values('Overall Completion Rate', ascending=False)

    # Add rank column based on current sort
    df.insert(0, 'Rank', range(1, len(df) + 1))

    # Format scores with % after sorting, so sorting used the numeric values
    numeric_columns = ['Overall Completion Rate'] + list(category_map.values())
    for col in df.columns:
        if col in numeric_columns:
            df[col] = df[col].apply(lambda x: f"{x:.1f}%")

    return df
|
| 421 |
+
|
| 422 |
+
with gr.Column(visible=True) as leaderboard_tab:
|
| 423 |
+
leaderboard_output = gr.DataFrame(
|
| 424 |
+
interactive=True, # Allow sorting
|
| 425 |
+
wrap=True
|
| 426 |
+
)
|
| 427 |
|
| 428 |
def create_category_chart(selected_models, selected_categories):
|
| 429 |
if not selected_models:
|
|
|
|
| 1118 |
.dark .completion-bar-container.na .completion-bar {
|
| 1119 |
background-color: #666;
|
| 1120 |
}
|
| 1121 |
+
.leaderboard-filters {
|
| 1122 |
+
margin-bottom: 20px;
|
| 1123 |
+
padding: 15px;
|
| 1124 |
+
background-color: #f8f9fa;
|
| 1125 |
+
border-radius: 8px;
|
| 1126 |
+
}
|
| 1127 |
+
|
| 1128 |
+
.dark .leaderboard-filters {
|
| 1129 |
+
background-color: #2a2a2a;
|
| 1130 |
+
}
|
| 1131 |
+
|
| 1132 |
+
.filter-group {
|
| 1133 |
+
margin-bottom: 10px;
|
| 1134 |
+
}
|
| 1135 |
+
|
| 1136 |
+
.filter-label {
|
| 1137 |
+
font-weight: 600;
|
| 1138 |
+
margin-bottom: 5px;
|
| 1139 |
+
display: block;
|
| 1140 |
+
}
|
| 1141 |
+
|
| 1142 |
+
.score-column {
|
| 1143 |
+
background-color: #f0f7ff;
|
| 1144 |
+
}
|
| 1145 |
+
|
| 1146 |
+
.dark .score-column {
|
| 1147 |
+
background-color: #1a2733;
|
| 1148 |
+
}
|
| 1149 |
+
|
| 1150 |
+
.metric-header {
|
| 1151 |
+
font-size: 0.9em;
|
| 1152 |
+
color: #666;
|
| 1153 |
+
text-align: center;
|
| 1154 |
+
}
|
| 1155 |
+
|
| 1156 |
+
.dark .metric-header {
|
| 1157 |
+
color: #aaa;
|
| 1158 |
+
}
|
| 1159 |
+
|
| 1160 |
+
.table-container {
|
| 1161 |
+
overflow-x: auto;
|
| 1162 |
+
}
|
| 1163 |
+
|
| 1164 |
+
.leaderboard-table td {
|
| 1165 |
+
white-space: nowrap;
|
| 1166 |
+
}
|
| 1167 |
+
|
| 1168 |
+
.score-cell {
|
| 1169 |
+
text-align: right;
|
| 1170 |
+
padding-right: 15px !important;
|
| 1171 |
+
}
|
| 1172 |
+
|
| 1173 |
+
.model-cell {
|
| 1174 |
+
max-width: 300px;
|
| 1175 |
+
overflow: hidden;
|
| 1176 |
+
text-overflow: ellipsis;
|
| 1177 |
+
white-space: nowrap;
|
| 1178 |
+
}
|
| 1179 |
+
|
| 1180 |
+
.leaderboard-table {
|
| 1181 |
+
width: 100%;
|
| 1182 |
+
border-collapse: collapse;
|
| 1183 |
+
}
|
| 1184 |
+
|
| 1185 |
+
.leaderboard-table th,
|
| 1186 |
+
.leaderboard-table td {
|
| 1187 |
+
padding: 10px;
|
| 1188 |
+
text-align: left;
|
| 1189 |
+
border: 1px solid #e0e0e0;
|
| 1190 |
+
}
|
| 1191 |
+
|
| 1192 |
+
.dark .leaderboard-table th,
|
| 1193 |
+
.dark .leaderboard-table td {
|
| 1194 |
+
border-color: #444;
|
| 1195 |
+
}
|
| 1196 |
+
|
| 1197 |
+
.leaderboard-table th {
|
| 1198 |
+
background-color: #f2f2f2;
|
| 1199 |
+
font-weight: bold;
|
| 1200 |
+
}
|
| 1201 |
+
|
| 1202 |
+
.dark .leaderboard-table th {
|
| 1203 |
+
background-color: #2c3e50;
|
| 1204 |
+
}
|
| 1205 |
+
|
| 1206 |
+
.leaderboard-table tr:hover {
|
| 1207 |
+
background-color: #f5f5f5;
|
| 1208 |
+
}
|
| 1209 |
+
|
| 1210 |
+
.dark .leaderboard-table tr:hover {
|
| 1211 |
+
background-color: #2d2d2d;
|
| 1212 |
+
}
|
| 1213 |
"""
|
| 1214 |
|
| 1215 |
first_model = next(iter(models.values()))
|
|
|
|
| 1220 |
|
| 1221 |
with gr.Row():
|
| 1222 |
tab_selection = gr.Radio(["Leaderboard", "Category Analysis", "Detailed Scorecard"],
|
| 1223 |
+
label="Select Tab", value="Leaderboard")
|
| 1224 |
|
| 1225 |
with gr.Row():
|
| 1226 |
model_chooser = gr.Dropdown(choices=[""] + list(models.keys()),
|
|
|
|
| 1228 |
value="",
|
| 1229 |
interactive=True, visible=False)
|
| 1230 |
model_multi_chooser = gr.Dropdown(choices=list(models.keys()),
|
| 1231 |
+
label="Select Models for Comparison",
|
| 1232 |
+
value=[],
|
| 1233 |
+
multiselect=True,
|
| 1234 |
+
interactive=True,
|
| 1235 |
+
visible=False,
|
| 1236 |
+
info="Select one or more models")
|
| 1237 |
+
|
| 1238 |
+
# Category filter now visible for all tabs
|
| 1239 |
+
category_filter = gr.CheckboxGroup(choices=category_choices,
|
| 1240 |
+
label="Filter Categories",
|
| 1241 |
+
value=category_choices)
|
| 1242 |
|
| 1243 |
with gr.Column(visible=True) as leaderboard_tab:
|
| 1244 |
+
leaderboard_output = gr.DataFrame(
|
| 1245 |
+
headers=["Rank", "Model", "Type", "Overall Score"],
|
| 1246 |
+
datatype=["number", "str", "str", "str"],
|
| 1247 |
+
interactive=False,
|
| 1248 |
+
wrap=True
|
| 1249 |
+
)
|
| 1250 |
|
| 1251 |
with gr.Column(visible=False) as category_analysis_tab:
|
| 1252 |
category_chart = gr.Plot()
|
|
|
|
| 1256 |
all_category_cards = gr.HTML()
|
| 1257 |
total_score = gr.Markdown()
|
| 1258 |
|
| 1259 |
+
# Initialize the dashboard
|
| 1260 |
+
def init_leaderboard():
    """Return the initial leaderboard table for the category filter's current value."""
    return create_leaderboard(category_filter.value)
|
| 1263 |
+
|
| 1264 |
+
leaderboard_output.value = init_leaderboard()
|
| 1265 |
|
| 1266 |
+
# Update handlers
|
| 1267 |
def update_dashboard(tab, selected_models, selected_model, selected_categories):
    """Compute the component updates for the selected tab.

    The returned list always has exactly ten entries, matching the order of
    the ``outputs`` lists this handler is wired to: leaderboard_tab,
    category_analysis_tab, detailed_scorecard_tab, model_chooser,
    model_multi_chooser, leaderboard_output, category_chart, model_metadata,
    all_category_cards, total_score.

    Args:
        tab: Name of the selected tab ("Leaderboard", "Category Analysis" or
            "Detailed Scorecard").
        selected_models: Models chosen in the multi-select dropdown; a falsy
            value is passed on as an empty list.
        selected_model: Model chosen in the single-select dropdown.
        selected_categories: Categories currently ticked in the category filter.

    Returns:
        A list of ten Gradio updates/values, one per output component.
    """
    # Everything starts hidden; each branch re-enables what its tab needs.
    leaderboard_visibility = gr.update(visible=False)
    category_chart_visibility = gr.update(visible=False)
    detailed_scorecard_visibility = gr.update(visible=False)
    model_chooser_visibility = gr.update(visible=False)
    model_multi_chooser_visibility = gr.update(visible=False)

    if tab == "Leaderboard":
        leaderboard_visibility = gr.update(visible=True)
        df = create_leaderboard(selected_categories)
        return [leaderboard_visibility, category_chart_visibility, detailed_scorecard_visibility,
                model_chooser_visibility, model_multi_chooser_visibility,
                gr.update(value=df), gr.update(), gr.update(), gr.update(), gr.update()]

    if tab == "Category Analysis":
        category_chart_visibility = gr.update(visible=True)
        model_multi_chooser_visibility = gr.update(visible=True)
        category_plot = create_category_chart(selected_models or [], selected_categories)
        # The category filter is no longer among the outputs, so no visibility
        # update is emitted for it; an extra entry here would shift every
        # following value onto the wrong component.
        return [leaderboard_visibility, category_chart_visibility, detailed_scorecard_visibility,
                model_chooser_visibility, model_multi_chooser_visibility,
                None, gr.update(value=category_plot), gr.update(), gr.update(), gr.update()]

    if tab == "Detailed Scorecard":
        detailed_scorecard_visibility = gr.update(visible=True)
        model_chooser_visibility = gr.update(visible=True)
        if selected_model:
            # NOTE(review): assumes update_detailed_scorecard returns three
            # updates (model_metadata, all_category_cards, total_score), like
            # the placeholder branch below - confirm against its definition.
            scorecard_updates = update_detailed_scorecard(selected_model, selected_categories)
        else:
            scorecard_updates = [
                gr.update(value="Please select a model to view details.", visible=True),
                gr.update(visible=False),
                gr.update(visible=False)
            ]
        return [leaderboard_visibility, category_chart_visibility, detailed_scorecard_visibility,
                model_chooser_visibility, model_multi_chooser_visibility,
                None, None] + list(scorecard_updates)

    # Unrecognised tab: leave every output untouched rather than returning
    # None, which cannot be mapped onto ten output components.
    return [gr.update() for _ in range(10)]
|
| 1306 |
|
| 1307 |
# Set up event handlers
|
| 1308 |
tab_selection.change(
|
| 1309 |
fn=update_dashboard,
|
| 1310 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 1311 |
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 1312 |
+
model_chooser, model_multi_chooser,
|
| 1313 |
leaderboard_output, category_chart, model_metadata,
|
| 1314 |
all_category_cards, total_score]
|
| 1315 |
)
|
|
|
|
| 1336 |
fn=update_dashboard,
|
| 1337 |
inputs=[tab_selection, model_multi_chooser, model_chooser, category_filter],
|
| 1338 |
outputs=[leaderboard_tab, category_analysis_tab, detailed_scorecard_tab,
|
| 1339 |
+
model_chooser, model_multi_chooser,
|
| 1340 |
leaderboard_output, category_chart, model_metadata,
|
| 1341 |
all_category_cards, total_score]
|
| 1342 |
)
|