Add task count into table column name
app.py (changed):

```diff
@@ -55,8 +55,8 @@ with gr.Blocks() as block:
     )
 
     # Define different captions for each table
-    default_caption = "**Table 1: MEGA-Bench full results.** <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
-    core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. Compared to the default table, some models with only single-image support are added."
+    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+    core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
     caption_component = gr.Markdown(
         value=default_caption,
```
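For reference, the Overall formula in the caption reduces to simple arithmetic. A minimal sketch, using the $N_{\text{core}} = 440$ and $N_{\text{open}} = 65$ values from the caption and three made-up scores (not real leaderboard numbers):

```python
# Illustrates the Overall formula from the table caption.
# The three scores below are hypothetical placeholders.
N_CORE, N_OPEN = 440, 65
core_no_cot, core_cot, open_ended = 50.0, 52.0, 60.0  # made-up example values

overall = (max(core_no_cot, core_cot) * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)
print(round(overall, 2))  # (52.0 * 440 + 60.0 * 65) / 505 = 53.03
```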
utils.py (changed):
```diff
@@ -102,13 +102,35 @@ class BaseDataLoader:
         self.MODEL_GROUPS = self._initialize_model_groups()
 
     def _initialize_super_groups(self):
-        # …
+        # Get a sample model to access the structure
+        sample_model = next(iter(self.MODEL_DATA))
+
+        # Create groups with task counts
+        groups = {}
+        self.keyword_display_map = {}  # Add this map to store display-to-original mapping
+
+        for dim in self.MODEL_DATA[sample_model]:
+            dim_name = DIMENSION_NAME_MAP[dim]
+            # Create a list of tuples (display_name, count, keyword) for sorting
+            keyword_info = []
+
+            for keyword in self.MODEL_DATA[sample_model][dim]:
+                # Get the task count for this keyword
+                task_count = self.MODEL_DATA[sample_model][dim][keyword]["count"]
+                original_name = KEYWORD_NAME_MAP.get(keyword, keyword)
+                display_name = f"{original_name}({task_count})"
+                keyword_info.append((display_name, task_count, keyword))
+
+            # Sort by count (descending) and then by display name (for ties)
+            keyword_info.sort(key=lambda x: (-x[1], x[0]))
+
+            # Store sorted display names and update mapping
+            groups[dim_name] = [info[0] for info in keyword_info]
+            for display_name, _, keyword in keyword_info:
+                self.keyword_display_map[display_name] = keyword
+
+        # Sort based on predefined order
+        order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
         return {k: groups[k] for k in order if k in groups}
 
     def _initialize_model_groups(self) -> Dict[str, list]:
```
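A minimal, self-contained sketch of what the new grouping logic produces, using a toy `MODEL_DATA` and stand-in name maps (all names and numbers here are hypothetical, not the benchmark's real data):

```python
# Toy stand-ins for the real structures in utils.py.
MODEL_DATA = {
    "gpt-4o": {
        "skills": {
            "coding": {"count": 30, "average_score": 0.61},
            "math": {"count": 45, "average_score": 0.55},
            "ocr": {"count": 45, "average_score": 0.70},
        },
    },
}
DIMENSION_NAME_MAP = {"skills": "Skills"}
KEYWORD_NAME_MAP = {"ocr": "OCR"}

sample_model = next(iter(MODEL_DATA))
groups, keyword_display_map = {}, {}
for dim, keywords in MODEL_DATA[sample_model].items():
    keyword_info = []
    for keyword, stats in keywords.items():
        display = f"{KEYWORD_NAME_MAP.get(keyword, keyword)}({stats['count']})"
        keyword_info.append((display, stats["count"], keyword))
    keyword_info.sort(key=lambda x: (-x[1], x[0]))  # count desc, then name for ties
    groups[DIMENSION_NAME_MAP[dim]] = [d for d, _, _ in keyword_info]
    keyword_display_map.update({d: k for d, _, k in keyword_info})

print(groups)               # {'Skills': ['OCR(45)', 'math(45)', 'coding(30)']}
print(keyword_display_map)  # {'OCR(45)': 'ocr', 'math(45)': 'math', 'coding(30)': 'coding'}
```

Both data loaders then use `keyword_display_map` to resolve a display column such as `OCR(45)` back to the raw keyword when building table rows: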
```diff
@@ -167,12 +189,12 @@ class DefaultDataLoader(BaseDataLoader):
                 "Core(w/ CoT)": round(core_cot_score * 100, 2),
                 "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
-            for …
-            original_keyword = …
+            for display_name in self.SUPER_GROUPS[selected_super_group]:
+                original_keyword = self.keyword_display_map[display_name]
             if original_dimension in model_data and original_keyword in model_data[original_dimension]:
-                row[…
+                row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
             else:
-                row[…
+                row[display_name] = None
             data.append(row)
 
         df = pd.DataFrame(data)
```
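The same lookup is repeated in `CoreSingleDataLoader` for the single-image table: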
```diff
@@ -209,12 +231,12 @@ class CoreSingleDataLoader(BaseDataLoader):
                 "Models": get_display_model_name(model),
                 "Core SI": round(core_si_score * 100, 2),
             }
-            for …
-            original_keyword = …
+            for display_name in self.SUPER_GROUPS[selected_super_group]:
+                original_keyword = self.keyword_display_map[display_name]
             if original_dimension in model_data and original_keyword in model_data[original_dimension]:
-                row[…
+                row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
             else:
-                row[…
+                row[display_name] = None
             data.append(row)
 
         df = pd.DataFrame(data)
```
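Continuing the toy sketch above, a single table row would be assembled like this (the model display name and the `"skills"` dimension key are hypothetical stand-ins):

```python
import pandas as pd

# Reuses MODEL_DATA, groups, and keyword_display_map from the sketch above.
model_data = MODEL_DATA["gpt-4o"]
original_dimension = "skills"  # raw key behind the "Skills" super group

row = {"Models": "GPT-4o"}
for display_name in groups["Skills"]:
    original_keyword = keyword_display_map[display_name]
    if original_dimension in model_data and original_keyword in model_data[original_dimension]:
        row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
    else:
        row[display_name] = None  # pandas renders missing cells as NaN

df = pd.DataFrame([row])
print(list(df.columns))  # ['Models', 'OCR(45)', 'math(45)', 'coding(30)']
```

Keeping the raw keyword as the lookup key and moving the task count into the display name only means the underlying score data never has to change when counts are added to (or removed from) the column headers.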