Spaces:
Running
Running
update results & separate results organization
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +2 -1
- constants.py +1 -1
- static/eval_results/Default/Aquila_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Aquila_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Aria/summary_results.json +251 -0
- static/eval_results/Default/Aria/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5_new/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5_new/task_results.json +0 -0
- static/eval_results/Default/GPT_4o/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o/task_results.json +0 -0
- static/eval_results/Default/GPT_4o_mini/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o_mini/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_flash_002/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_pro_002/task_results.json +0 -0
- static/eval_results/Default/Idefics3/summary_results.json +251 -0
- static/eval_results/Default/Idefics3/task_results.json +0 -0
- static/eval_results/Default/InternVL2_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_78B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_78B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_76B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_76B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_8B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_8B/task_results.json +0 -0
- static/eval_results/Default/Llama_3_2_11B/summary_results.json +251 -0
- static/eval_results/Default/Llama_3_2_11B/task_results.json +0 -0
- static/eval_results/Default/Mammoth_VL/summary_results.json +251 -0
- static/eval_results/Default/Mammoth_VL/task_results.json +0 -0
- static/eval_results/Default/MiniCPM_v2.6/summary_results.json +251 -0
- static/eval_results/Default/MiniCPM_v2.6/task_results.json +0 -0
- static/eval_results/Default/NVLM/summary_results.json +251 -0
- static/eval_results/Default/NVLM/task_results.json +0 -0
- static/eval_results/Default/Phi-3.5-vision/summary_results.json +251 -0
- static/eval_results/Default/Phi-3.5-vision/task_results.json +0 -0
- static/eval_results/Default/Pixtral_12B/summary_results.json +251 -0
- static/eval_results/Default/Pixtral_12B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_72B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_72B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_7B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_7B/task_results.json +0 -0
- static/eval_results/Default/all_model_keywords_stats.json +0 -0
- static/eval_results/Default/all_summary.json +0 -525
app.py
CHANGED
|
@@ -55,7 +55,8 @@ with gr.Blocks() as block:
|
|
| 55 |
)
|
| 56 |
|
| 57 |
# Define different captions for each table
|
| 58 |
-
default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\
|
|
|
|
| 59 |
single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
|
| 60 |
|
| 61 |
caption_component = gr.Markdown(
|
|
|
|
| 55 |
)
|
| 56 |
|
| 57 |
# Define different captions for each table
|
| 58 |
+
default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
|
| 59 |
+
|
| 60 |
single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
|
| 61 |
|
| 62 |
caption_component = gr.Markdown(
|
constants.py
CHANGED
|
@@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
|
|
| 28 |
|
| 29 |
## 📊🔍 Results & Takeaways from Evaluating Top Models
|
| 30 |
|
| 31 |
-
- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (
|
| 32 |
- Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
|
| 33 |
- Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
|
| 34 |
- Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
|
|
|
|
| 28 |
|
| 29 |
## 📊🔍 Results & Takeaways from Evaluating Top Models
|
| 30 |
|
| 31 |
+
- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0620) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
|
| 32 |
- Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
|
| 33 |
- Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
|
| 34 |
- Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
|
static/eval_results/Default/Aquila_VL_2B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.159970161379836,
|
| 7 |
+
"micro_mean_score": 0.15844711671722148
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.24567572098570653,
|
| 13 |
+
"micro_mean_score": 0.2704213241616509
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.17100157004197775
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.1796551584774396
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.1263506560912463
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.1775085349123463
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.2114933522881099
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.16251700109869488
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.26453155444796583
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.3729498746867168
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.19090788408036002
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.16500679466160564
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.03972686819521137
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.07035116566014021
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.11915109312705179
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.18915652635850314
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.21939978337316163
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.17643260913333875
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.2438396314831894
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.08989401697906672
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.12241197113963243
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.10758402844431432
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.19372082302321905
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.19201243810115767
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.23278612647548963
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.21664527852608348
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.12138133030990172
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.01221681479628382
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.17994400163273605
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.21939978337316163
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.18212149746318507
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.21563163558700174
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.0981320856519089
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.0557399538308785
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.1351126472094214
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.2025034827431662
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.29326275059361956
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.22529225586731416
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.23810497886903373
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.17867138975396438
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Aquila_VL_2B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Aria/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.289073788209904,
|
| 7 |
+
"micro_mean_score": 0.2859007507765791
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5103725263180767,
|
| 13 |
+
"micro_mean_score": 0.5349957007738607
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.31755778420402525
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.3153649050553317
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.34425736922415495
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.3921740378709932
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.37623282710622424
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.271674311347156
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.46313777834281344
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.5692180451127821
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.3152064038837139
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.23851147782276536
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.11246568298589892
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.28561724084490353
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.2505346698796475
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.3040414715952029
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.41865640360591405
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.3622713579911698
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.35872259826035346
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.1509096092007215
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.2846987779732631
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.2899384042262363
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.27412885527802433
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.3117275816801635
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.4523860109667709
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.310055869988487
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.18301681783824644
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.26651659725352617
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.34236220565522313
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.41865640360591405
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.19142683154129833
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2596336265133595
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.3929243812973524
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.1403503245041943
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.25367910605102256
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.3494812758481046
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3662927672998609
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.28616079233761366
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.3953949223279651
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.26097385403450996
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Aria/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Claude_3.5/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.5040975742801586,
|
| 7 |
+
"micro_mean_score": 0.5002259116666758
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.6373907158949892,
|
| 13 |
+
"micro_mean_score": 0.6569647463456579
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.5212541172602853
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.5405089647404562
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.6082834220752651
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.5745077617490254
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.5450038475783499
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.4767692987630454
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5756126284078804
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6969774436090224
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.5278843049497918
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.4082144793870471
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.23803578664609892
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.5691641481808987
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.4795267886975966
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.525848282456283
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.508735695828719
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5699094130430454
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.5096772701625744
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.4429640420975014
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.5066797418318023
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.4971460788134188
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.5278127103234661
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.4490020843308984
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5838224169821388
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.5456152399978661
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.46300075585789874
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.5414381873407914
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.5373019912310933
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.508735695828719
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.4422556748863689
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.49311554035078103
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.6663170946790707
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.3382015835012861
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.5194010220575684
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.532329797132399
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.5808831682303479
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.513474611293123
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5507075880782885
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.47461998432626556
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Claude_3.5/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Claude_3.5_new/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.5259191914020757,
|
| 7 |
+
"micro_mean_score": 0.5230785894131227
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.6563419761104125,
|
| 13 |
+
"micro_mean_score": 0.6724419604471196
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.5427062825031487
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.5690045172520449
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.6220681231036606
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.6077980666415158
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.5511440615639541
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.4885536652013625
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5908204006544897
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6569473684210526
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.5486763511384175
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.4315385951907387
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.2909419331017877
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.6048192628845258
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.48924295292319175
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.556418710368288
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.4946691340754988
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5558756390298104
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.5425198547046186
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.44210335381541843
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.5187252051932875
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.5071121107460066
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.5387340524651681
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.4824302644151348
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.6242798397166945
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.5782691045270721
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.4630277507828528
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.5914338446093256
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.5636254729390459
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.4946691340754988
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.4828123870640382
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.48756636014597515
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.6590137441693218
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.39901670035164916
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.5166853031535193
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.5561634744977417
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.6123769274172342
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.5512015158810595
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.565796566886933
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.4763267502912362
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Claude_3.5_new/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/GPT_4o/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.5265030595065238,
|
| 7 |
+
"micro_mean_score": 0.5236338521693411
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.6478225794744895,
|
| 13 |
+
"micro_mean_score": 0.665391229578676
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.5421184432647768
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.5630758211022604
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.6216411634729735
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.616018277142757
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.5823101249498799
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.44177544539510955
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.6345458069232931
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6795263157894738
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.5514924675940659
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.39435038953269674
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.22934807257231926
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.608083455060831
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.491325251564869
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.4999089647103332
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.5315979872161023
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5641404607063637
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.5613545677222056
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.47760591698367955
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.5388690453811203
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.48037685656449847
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.5994159671881645
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.44606605087301393
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.6274371950293718
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.5448877153826162
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.4751133786848073
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.5343350103400748
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.5672657028463585
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.5315979872161023
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.4500928191484624
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.4908653289106883
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.7056027785545881
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.33202130899313653
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.5032849161169843
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.5510350848991218
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.6095778863474799
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.5283797185155754
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.6135723164021851
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.44047720383044436
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/GPT_4o/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/GPT_4o_mini/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.40767494558789397,
|
| 7 |
+
"micro_mean_score": 0.40431644154143376
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.586537827213665,
|
| 13 |
+
"micro_mean_score": 0.6133276010318144
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.43069690064863675
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.4492982787524939
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.49026056071002017
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.5168957112681365
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.46731791428406805
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.3406008235342885
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5572925295284307
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6902380952380953
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.4189154010048976
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.2943206715105082
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.19422793560945503
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.47202628409684394
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.3624496929166193
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.38946844562183286
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.45508480503584553
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.47569921440672464
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.465175334092545
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.29410984789062117
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.41242028190533997
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.3906415365938764
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.44244772638735347
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.3629944944697668
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5713834131825314
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.39874839531459466
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.3359977324263039
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.4305788513381019
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.46343334374251277
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.45508480503584553
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.24651576711552803
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.36981497185070983
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.5666618234843734
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.2420320329702607
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.3458483931206892
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.43590838051817093
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.5176671720617656
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.3554299482098288
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5399167524341886
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.32918280841495845
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/GPT_4o_mini/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.4189319021967416,
|
| 7 |
+
"micro_mean_score": 0.41567515414375245
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5691365176285039,
|
| 13 |
+
"micro_mean_score": 0.5987532244196045
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.4382651695295427
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.46355333176347063
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.4431807648811706
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.4975887290434539
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.49409642663278297
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.38033540105052427
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5621166766717235
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6570726817042606
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.4480877005302385
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.3338006749329557
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.16197013296986068
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.3971534837718938
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.3448204918940882
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.43525833484767545
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.4837362543956792
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5111257660425502
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.49366013155105076
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.4001983820478609
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.386988040250785
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.3884226428206387
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.4425893080900246
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.42223626366392253
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5390305634303021
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.472066557554629
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.3666950113378685
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.44571360028283974
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.45400479933257654
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.4837362543956792
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.35161402777057993
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.3839609821519984
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.4822341581959653
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.26434115361219657
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.3677547363031234
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.4640301382180305
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.5348199655361041
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.4890240042560499
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5126038207415967
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.384818434165593
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Gemini_1.5_flash_002/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.4822473962867704,
|
| 7 |
+
"micro_mean_score": 0.4764805563057179
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5858190649927173,
|
| 13 |
+
"micro_mean_score": 0.6104901117798793
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.4955784031499121
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.5202055934299538
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.5017043129027509
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.5532599716027446
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.546753787203128
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.425969084163906
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5751012914154264
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6982330827067671
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.513647745999633
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.3845337030093212
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.23899503258223884
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.4625032188638111
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.4292353723689881
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.4869625906903554
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.5028718355967439
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5584779204331461
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.55005349042813
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.4292127751495457
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.44896309957892694
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.44418591808616864
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.5146447350354234
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.4688623462674191
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5580414823700747
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.5538255562099124
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.39066515495086923
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.5370278962809547
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.5034399620483027
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.5028718355967439
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.4885398161821004
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.45544217378728585
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.5421439953094952
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.3335324339429373
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.43465181771633377
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.5250631828331306
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.5821004797173627
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.5124355410095621
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5722329455291694
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.41210885517904977
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Gemini_1.5_pro_002/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Idefics3/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.08956972487602757,
|
| 7 |
+
"micro_mean_score": 0.08982225274252693
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.3210866162255635,
|
| 13 |
+
"micro_mean_score": 0.35649183147033553
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.11936892871309657
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.123378776179585
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.09602065544451607
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.1661543932339007
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.13018902877020821
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.11200133210641629
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.1837120314657304
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.2364085213032582
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.15239546294916975
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.08255834173646705
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.03149369112824262
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.06151607584357764
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.10124344675801887
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.14147248511867794
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.15942387460900312
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.17458268378399872
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.13442937440893113
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.02766884416043467
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.15513016850044997
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.03757596375966502
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.05386631116442094
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.0760949224506388
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.2987797010800956
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.10403841600436024
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.0661753590325019
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.09190674791720088
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.12345439179884048
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.15942387460900312
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.11382786944230487
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.10803808254834846
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.11450308988278819
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.04671278220005028
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.0978814644137225
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.13283830731528018
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.09697463995668018
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.1840497279921703
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.1605667124060194
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.09835465288235297
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Idefics3/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/InternVL2_2B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.13141974398938763,
|
| 7 |
+
"micro_mean_score": 0.13063500716262516
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.23864417043743646,
|
| 13 |
+
"micro_mean_score": 0.24901117798796224
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.14522090778963154
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.14491178903291552
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.12126906675624163
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.16912754929321935
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.18542274192083463
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.13923308734553164
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.23992252224543772
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.3420927318295739
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.14807577209152425
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.13036555933925006
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.01727799227799228
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.057021136657850864
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.10504085961245285
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.1625198552182714
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.18999779001767986
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.1487677475708977
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.2011727338536935
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.11886936592818943
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.1131404778887607
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.05739750616837997
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.15465451663650032
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.16044698450090833
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.21429521387724249
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.2128614316540013
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.03658352229780801
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.05757839721254354
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.15225683687839608
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.18999779001767986
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.17677460549936644
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.158165588340436
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.08722661966805
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.04102853815875594
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.11264043251709285
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.17001758160301803
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3332891958712894
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.1686125516807394
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.21169137106199268
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.10975764217070672
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/InternVL2_2B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/InternVL2_5_2B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.17806821966478364,
|
| 7 |
+
"micro_mean_score": 0.17708809739236367
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.2738430375585404,
|
| 13 |
+
"micro_mean_score": 0.2905417024935512
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.19039567147289096
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.19614682488147464
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.18910947570579717
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.20543964378430513
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.23636598588530347
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.15691382827270517
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.28604169870255614
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4248446115288219
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.18745928331343714
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.15097551654513372
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.030568378443583684
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.13898447520398388
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.13154711942685113
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.18343540213068474
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.20755556526976354
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.15983467048343838
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.26888883087046195
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.12906517409932386
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.14702422379343882
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.15324148486802894
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.19977956414542175
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.1665590610582109
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.2529339759528222
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.23420071687554841
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.09651832955404382
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.0784280378818194
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.21260786581183966
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.20755556526976354
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.138285387531761
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.20214332169825855
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.18128339685489062
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.053153113565753
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.12416116984428181
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.22449772657901465
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3762336977650326
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.19222024833691936
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.25056132494721467
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.15596334442569906
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/InternVL2_5_2B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/InternVL2_5_78B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.44132952988532753,
|
| 7 |
+
"micro_mean_score": 0.4397079059379812
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5538024772749066,
|
| 13 |
+
"micro_mean_score": 0.5776870163370592
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.4558062458859664
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.46893853078050696
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.5220829627238773
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.4933134095077618
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.477971701185214
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.3936387335462224
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5610278744213835
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6072907268170428
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.44533550848682696
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.3548055654857457
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.22852234519925363
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.4910486370158392
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.39410061025954557
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.43424133240430957
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.5300255483670417
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.4793195260560365
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.4622918421665308
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.3729954065847296
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.4226567593431527
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.4149806887502539
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.4904285184890861
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.4348674018783908
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5124942746906233
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.4717682857925982
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.20496909081092754
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.4184724897299287
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.4951997132559491
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.5300255483670417
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.286105084660728
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.39635000103107665
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.5401547630322637
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.26403470419652064
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.3933356676003734
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.5168098196770042
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.47731479110938463
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.4388571290145052
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5034762755043025
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.37742798395328586
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/InternVL2_5_78B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/InternVL2_76B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.3562710424410931,
|
| 7 |
+
"micro_mean_score": 0.35129859801162616
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5192997443033639,
|
| 13 |
+
"micro_mean_score": 0.5421324161650903
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.3772549347599992
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.38193012983650343
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.41315219763443384
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.43665980552577693
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.4265623936500962
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.2975890791763991
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5257990949897898
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.5779473684210527
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.33287081421166276
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.2949505390920417
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.17036496432397477
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.3634339625985008
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.31396468806559114
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.3473756113126343
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.395893002855977
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.44982107744035305
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.42875248733027654
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.2868239162778749
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.3630499545707523
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.3476691827105281
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.3943337471922549
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.29244088978470345
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.45822072478616577
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.3879326330400817
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.20309901738473166
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.34771123515123364
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.4145693044465943
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.395893002855977
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.24403942809507134
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.3153417935059416
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.4306947454508794
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.2132321995754061
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.2953329718984368
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.42202934355552685
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.47409276729986083
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.30014798153766264
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.4625649385962016
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.2868813944130515
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/InternVL2_76B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/InternVL2_8B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.25956581776451815,
|
| 7 |
+
"micro_mean_score": 0.2546984460483302
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1165,
|
| 12 |
+
"macro_mean_score": 0.3978571701460552,
|
| 13 |
+
"micro_mean_score": 0.4108583690987125
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.2773656948037259
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.2817247716997634
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.280559214034858
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2511,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.32020728060179815
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2469,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.325593535916075
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.24118253695139918
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.39684007367798446
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4700852130325815
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.27052668526005397
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2439,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.23189345356483618
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.08260405712900723
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.22800928556370195
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.2013779290163996
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.2804429603269583
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 700,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.34791358240562653
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.2942163420306113
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.3388056726588417
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.10933317885944857
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.250804626773504
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.2522493284864019
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.27414636444623874
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.22381302045502052
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1456,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.3537549824897016
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.30261189962428353
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.15434618291761149
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.19872104324302098
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.30088711082969344
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 700,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.34791358240562653
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.17725087609332119
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2532272454839157
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.29129840423784176
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.12166926715781588
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.24700310231619527
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2315,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.3214666523378005
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3995660275981844
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.24614711281861912
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.3393895915929317
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.22078333222564453
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/InternVL2_8B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Llama_3_2_11B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.15999641916771298,
|
| 7 |
+
"micro_mean_score": 0.15809331016967038
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.3173342406187366,
|
| 13 |
+
"micro_mean_score": 0.3487962166809973
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.1802478219287358
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.1907604552173455
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.14328677752263275
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.19646404502647707
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.22399113135844315
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.13303760019716085
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.323153603297999
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4260501253132832
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.1770852858056774
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.15366454315378308
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.06563884729522687
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.11886347847341794
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.11489351406848371
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.1693681214060816
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.2123769209846321
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.2520175802062012
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.2485354956932213
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.06418655520777307
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.12417283740525839
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.16374180545556977
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.1576236804437753
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.15014439824913947
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.3003142292328822
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.19270157739425633
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.1463246409674981
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.0732004839476103
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.1960107191983825
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.2123769209846321
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.1351857051327849
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.18586695387250338
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.17288724679416761
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.08100042975820579
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.0575426944971537
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.19899465185565898
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.254316961351997
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.162801811963855
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.28055776664538923
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.13937853323074623
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Llama_3_2_11B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Mammoth_VL/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.264052880412689,
|
| 7 |
+
"micro_mean_score": 0.2626894374387823
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.37992668750165337,
|
| 13 |
+
"micro_mean_score": 0.40120378331900275
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.27896733083008046
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.30194776127683565
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.2365295791606494
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.2993927028494267
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.3366347826116991
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.2408454736444444
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.37895522991264047
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.48003508771929826
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.27232427744946475
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.24522937191710698
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.11457024299726488
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.18941525254390731
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.1718334741390191
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.28108187023954245
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.3391119999611432
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.36434285930327387
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.36915384448504296
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.15940750469262005
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.2456942956200745
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.21586513216389874
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.29359048024032264
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.2646677074112521
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.34733130661096645
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.3286125236284589
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.16358654572940287
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.25463059203015115
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.2919119209789575
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.3391119999611432
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.20016011839130254
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2679179451692527
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.23600902063965679
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.15326915093278803
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.20668466311255687
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.33348955971237954
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3759170425350556
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.23894961766260706
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.351703435685048
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.26074348700688493
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Mammoth_VL/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/MiniCPM_v2.6/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.22955895202146906,
|
| 7 |
+
"micro_mean_score": 0.22560399396899078
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.41728623355613875,
|
| 13 |
+
"micro_mean_score": 0.43452278589853827
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.2537218694467236
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.2604967101191775
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.2500331562865158
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.3003169369011028
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.31808748114668184
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.18281637763548025
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.40732197204308807
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.48798245614035085
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.23723675736151562
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.1968926733821904
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.08735883237069725
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.21195711598986072
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.18639148159043903
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.21578309681746147
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.3527537836840162
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.3096882575625531
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.3176880312524649
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.0755920550038197
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.23506388020592064
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.1781127776443048
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.2551275278138797
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.20833171754655547
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.36473950920880716
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.293386806641223
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.13955971277399848
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.23596215721092323
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.26319603880798287
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.3527537836840162
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.17888270664238365
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.22288558250834017
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.2666989364424082
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.11693267119342445
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.15342045420318667
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.29243044121840894
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3777897246686755
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.25714862989687987
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.33187729423141027
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.16493399805627715
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/MiniCPM_v2.6/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/NVLM/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.21589726765847422,
|
| 7 |
+
"micro_mean_score": 0.21406043849932396
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.3478114310231307,
|
| 13 |
+
"micro_mean_score": 0.3947549441100602
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.23287631838857856
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.21591473223174515
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.27426258729618225
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.284874072963892
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.2134087963800149
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.2525993645909815
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.4029543142569604
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4317142857142857
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.2442484196551863
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.1424318574406695
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.046798309600525674
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.19655048708297065
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.18621338396242557
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.2922667531642391
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.0
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.3447361496776569
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.29674507895195534
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.09716389574493003
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.19684666506287793
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.2199792859352912
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.25164831125437204
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.2396831363622878
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.3215948035793096
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.1853526865291571
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.0
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.0
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.3352056263801705
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.0
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.038244047619047615
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2100484481849172
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.15704252277801936
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.06688589450465973
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.2292747206409446
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.2689383226748064
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.18857142857142856
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.23682040748983965
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.3656649917873737
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.26866914106442213
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/NVLM/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Phi-3.5-vision/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.22995297916629392,
|
| 7 |
+
"micro_mean_score": 0.22708502951025372
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.3947914647737769,
|
| 13 |
+
"micro_mean_score": 0.42459157351676696
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.2511698139474551
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.2550326045763433
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.24395249720074527
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.2858236369733704
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.29876274710122536
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.21972896566746963
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.37513466171380355
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4713934837092732
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.25475240046465697
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.20386233377001492
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.06657701969095552
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.16556787388989183
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.17989790940001513
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.2671646581690049
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.24920333780186898
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.3057560384411286
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.3341992361416253
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.12884156381685322
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.20494682188374266
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.21180084406324556
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.2609992615064841
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.2149689274645855
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.365192668303297
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.2593652357274648
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.10107709750566891
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.11861055655587921
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.2824151476986241
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.24920333780186898
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.1980440594073205
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2636292373854696
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.20747122167273002
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.08602953103518936
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.20136893467064246
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.30979039348232706
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.3495072422622861
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.25858403958844717
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.3357218088688187
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.21140555087788399
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Phi-3.5-vision/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Pixtral_12B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.31362045151669854,
|
| 7 |
+
"micro_mean_score": 0.3100986209078182
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.4566234428542061,
|
| 13 |
+
"micro_mean_score": 0.4870593293207223
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.33202677713439754
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.34184129499032456
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.37667712211439836
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.37896441862738645
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.37077191302051077
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.2843861774995234
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.4098150360139686
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.533077694235589
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.3372902862054838
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.25372282838901716
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.09524894246403817
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.2972619996610934
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.28304049684103855
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.33523333364720703
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.3988260865341648
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.39117521970978353
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.35583482417594536
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.21897822147396953
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.3436473210057542
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.28979044279399635
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.33530850344530555
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.30160980000905374
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.4166613092238044
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.30796171250186904
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.22871315192743763
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.21669652626580332
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.36087312117067055
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.3988260865341648
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.24616927284658197
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.2900329121369093
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.42652313209316933
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.1209559708312353
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.25678368121442124
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.37605128363484847
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.4576088857728113
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.3464929909487855
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.3858431845580602
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.2549787156825223
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Pixtral_12B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Qwen2_VL_2B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.20877163406364055,
|
| 7 |
+
"micro_mean_score": 0.20561526268932287
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.3154302566225611,
|
| 13 |
+
"micro_mean_score": 0.33856405846947557
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.22249997162072932
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.22236161923122505
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.23701014663017753
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.25669221785292334
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.26526414975225454
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.17623548305581763
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.31250702198481506
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.4140676691729323
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.20802820480076603
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.17320633068307653
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.06209506566980099
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.190837839372028
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.16287824421269087
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.19640906475019812
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.2520741776922928
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.24883076673424442
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.2877316297453947
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.13398525561847363
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.1624451002757208
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.20960092816529263
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.19986806708136184
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.2201024015934558
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.30248748033122763
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.256631742010999
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.07681405895691609
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.10526691703628158
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.25018977062352593
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.2520741776922928
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.17435940889565366
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.21286783416184518
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.2521972668785968
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.06967138760493456
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.16996250112948405
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.27603334911345223
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.31002436092347696
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.21061929716065056
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.2656728023444808
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.16356158787929762
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Qwen2_VL_2B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Qwen2_VL_72B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.4542376574527161,
|
| 7 |
+
"micro_mean_score": 0.4501201906164793
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1163,
|
| 12 |
+
"macro_mean_score": 0.5639771804231668,
|
| 13 |
+
"micro_mean_score": 0.5835339638865004
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.4683625465479226
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.48669152179713876
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.5291932917937967
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2509,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.53654503409075
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2467,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.4931554892760308
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.3908023665629473
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.5668846347262286
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.6121127819548872
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.4493794346300551
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2437,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.33622171962424363
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.21642754068858566
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.5263730250833892
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.42759570727857965
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.4228561177227288
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 698,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.4780253686541936
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.5070774860945021
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.4807292191169126
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.38847545874852984
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.4359156358804688
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.43781407268698613
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.49080138099759946
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.42481004254128113
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1454,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.5132810622684265
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.5062248706593999
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.3063303099017385
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.523959576707116
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.4879791577413812
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 698,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.4780253686541936
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.34846161336322395
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.44101149919132854
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.5663587858366833
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.3067825586087303
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.4121566368482877
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2313,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.5176521211872086
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.5030444649397028
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.45616267568458396
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.5047683071464567
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.3553838743540432
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Qwen2_VL_72B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/Qwen2_VL_7B/summary_results.json
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_summary": {
|
| 3 |
+
"core": {
|
| 4 |
+
"num_eval_tasks": 440,
|
| 5 |
+
"num_eval_samples": 6539,
|
| 6 |
+
"macro_mean_score": 0.3293449599230247,
|
| 7 |
+
"micro_mean_score": 0.325331493515679
|
| 8 |
+
},
|
| 9 |
+
"open": {
|
| 10 |
+
"num_eval_tasks": 65,
|
| 11 |
+
"num_eval_samples": 1170,
|
| 12 |
+
"macro_mean_score": 0.43955105763038577,
|
| 13 |
+
"micro_mean_score": 0.45508547008546996
|
| 14 |
+
},
|
| 15 |
+
"overall_score": 0.34352990319228904
|
| 16 |
+
},
|
| 17 |
+
"keyword_stats": {
|
| 18 |
+
"skills": {
|
| 19 |
+
"Object Recognition and Classification": {
|
| 20 |
+
"count": 303,
|
| 21 |
+
"num_samples": 4755,
|
| 22 |
+
"tasks": [],
|
| 23 |
+
"average_score": 0.3506773570484231
|
| 24 |
+
},
|
| 25 |
+
"Text Recognition (OCR)": {
|
| 26 |
+
"count": 137,
|
| 27 |
+
"num_samples": 2239,
|
| 28 |
+
"tasks": [],
|
| 29 |
+
"average_score": 0.38363163370919123
|
| 30 |
+
},
|
| 31 |
+
"Language Understanding and Generation": {
|
| 32 |
+
"count": 154,
|
| 33 |
+
"num_samples": 2511,
|
| 34 |
+
"tasks": [],
|
| 35 |
+
"average_score": 0.3882785389756705
|
| 36 |
+
},
|
| 37 |
+
"Scene and Event Understanding": {
|
| 38 |
+
"count": 154,
|
| 39 |
+
"num_samples": 2469,
|
| 40 |
+
"tasks": [],
|
| 41 |
+
"average_score": 0.38292659892379843
|
| 42 |
+
},
|
| 43 |
+
"Mathematical and Logical Reasoning": {
|
| 44 |
+
"count": 109,
|
| 45 |
+
"num_samples": 1910,
|
| 46 |
+
"tasks": [],
|
| 47 |
+
"average_score": 0.2730765188348748
|
| 48 |
+
},
|
| 49 |
+
"Commonsense and Social Reasoning": {
|
| 50 |
+
"count": 51,
|
| 51 |
+
"num_samples": 855,
|
| 52 |
+
"tasks": [],
|
| 53 |
+
"average_score": 0.4625711182912848
|
| 54 |
+
},
|
| 55 |
+
"Ethical and Safety Reasoning": {
|
| 56 |
+
"count": 15,
|
| 57 |
+
"num_samples": 245,
|
| 58 |
+
"tasks": [],
|
| 59 |
+
"average_score": 0.5287318295739348
|
| 60 |
+
},
|
| 61 |
+
"Domain-Specific Knowledge and Skills": {
|
| 62 |
+
"count": 77,
|
| 63 |
+
"num_samples": 1386,
|
| 64 |
+
"tasks": [],
|
| 65 |
+
"average_score": 0.32297080808954215
|
| 66 |
+
},
|
| 67 |
+
"Spatial and Temporal Reasoning": {
|
| 68 |
+
"count": 152,
|
| 69 |
+
"num_samples": 2439,
|
| 70 |
+
"tasks": [],
|
| 71 |
+
"average_score": 0.2561357336105554
|
| 72 |
+
},
|
| 73 |
+
"Planning and Decision Making": {
|
| 74 |
+
"count": 37,
|
| 75 |
+
"num_samples": 577,
|
| 76 |
+
"tasks": [],
|
| 77 |
+
"average_score": 0.12651411144309255
|
| 78 |
+
}
|
| 79 |
+
},
|
| 80 |
+
"input_format": {
|
| 81 |
+
"User Interface Screenshots": {
|
| 82 |
+
"count": 93,
|
| 83 |
+
"num_samples": 1517,
|
| 84 |
+
"tasks": [],
|
| 85 |
+
"average_score": 0.35229497847636093
|
| 86 |
+
},
|
| 87 |
+
"Text-Based Images and Documents": {
|
| 88 |
+
"count": 82,
|
| 89 |
+
"num_samples": 1294,
|
| 90 |
+
"tasks": [],
|
| 91 |
+
"average_score": 0.2881996369284258
|
| 92 |
+
},
|
| 93 |
+
"Diagrams and Data Visualizations": {
|
| 94 |
+
"count": 101,
|
| 95 |
+
"num_samples": 1718,
|
| 96 |
+
"tasks": [],
|
| 97 |
+
"average_score": 0.3162917354476226
|
| 98 |
+
},
|
| 99 |
+
"Videos": {
|
| 100 |
+
"count": 43,
|
| 101 |
+
"num_samples": 700,
|
| 102 |
+
"tasks": [],
|
| 103 |
+
"average_score": 0.3555910609857979
|
| 104 |
+
},
|
| 105 |
+
"Artistic and Creative Content": {
|
| 106 |
+
"count": 32,
|
| 107 |
+
"num_samples": 541,
|
| 108 |
+
"tasks": [],
|
| 109 |
+
"average_score": 0.3513518594470202
|
| 110 |
+
},
|
| 111 |
+
"Photographs": {
|
| 112 |
+
"count": 143,
|
| 113 |
+
"num_samples": 2248,
|
| 114 |
+
"tasks": [],
|
| 115 |
+
"average_score": 0.39509504888372243
|
| 116 |
+
},
|
| 117 |
+
"3D Models and Aerial Imagery": {
|
| 118 |
+
"count": 11,
|
| 119 |
+
"num_samples": 169,
|
| 120 |
+
"tasks": [],
|
| 121 |
+
"average_score": 0.19173322639974366
|
| 122 |
+
}
|
| 123 |
+
},
|
| 124 |
+
"output_format": {
|
| 125 |
+
"contextual_formatted_text": {
|
| 126 |
+
"count": 98,
|
| 127 |
+
"num_samples": 1514,
|
| 128 |
+
"tasks": [],
|
| 129 |
+
"average_score": 0.3118818521697947
|
| 130 |
+
},
|
| 131 |
+
"structured_output": {
|
| 132 |
+
"count": 110,
|
| 133 |
+
"num_samples": 1714,
|
| 134 |
+
"tasks": [],
|
| 135 |
+
"average_score": 0.3323478338046426
|
| 136 |
+
},
|
| 137 |
+
"exact_text": {
|
| 138 |
+
"count": 83,
|
| 139 |
+
"num_samples": 1278,
|
| 140 |
+
"tasks": [],
|
| 141 |
+
"average_score": 0.31975345327634014
|
| 142 |
+
},
|
| 143 |
+
"numerical_data": {
|
| 144 |
+
"count": 49,
|
| 145 |
+
"num_samples": 862,
|
| 146 |
+
"tasks": [],
|
| 147 |
+
"average_score": 0.3207400992620562
|
| 148 |
+
},
|
| 149 |
+
"open_ended_output": {
|
| 150 |
+
"count": 80,
|
| 151 |
+
"num_samples": 1456,
|
| 152 |
+
"tasks": [],
|
| 153 |
+
"average_score": 0.39680785337230745
|
| 154 |
+
},
|
| 155 |
+
"multiple_choice": {
|
| 156 |
+
"count": 85,
|
| 157 |
+
"num_samples": 1363,
|
| 158 |
+
"tasks": [],
|
| 159 |
+
"average_score": 0.38069986029874947
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"input_num": {
|
| 163 |
+
"6-8 images": {
|
| 164 |
+
"count": 21,
|
| 165 |
+
"num_samples": 314,
|
| 166 |
+
"tasks": [],
|
| 167 |
+
"average_score": 0.21448412698412703
|
| 168 |
+
},
|
| 169 |
+
"9-image or more": {
|
| 170 |
+
"count": 41,
|
| 171 |
+
"num_samples": 623,
|
| 172 |
+
"tasks": [],
|
| 173 |
+
"average_score": 0.34991843422677277
|
| 174 |
+
},
|
| 175 |
+
"1-image": {
|
| 176 |
+
"count": 315,
|
| 177 |
+
"num_samples": 5228,
|
| 178 |
+
"tasks": [],
|
| 179 |
+
"average_score": 0.36487656334089386
|
| 180 |
+
},
|
| 181 |
+
"video": {
|
| 182 |
+
"count": 43,
|
| 183 |
+
"num_samples": 700,
|
| 184 |
+
"tasks": [],
|
| 185 |
+
"average_score": 0.3555910609857979
|
| 186 |
+
},
|
| 187 |
+
"4-5 images": {
|
| 188 |
+
"count": 34,
|
| 189 |
+
"num_samples": 520,
|
| 190 |
+
"tasks": [],
|
| 191 |
+
"average_score": 0.23950364354876252
|
| 192 |
+
},
|
| 193 |
+
"2-3 images": {
|
| 194 |
+
"count": 51,
|
| 195 |
+
"num_samples": 802,
|
| 196 |
+
"tasks": [],
|
| 197 |
+
"average_score": 0.31886513111201115
|
| 198 |
+
}
|
| 199 |
+
},
|
| 200 |
+
"app": {
|
| 201 |
+
"Information_Extraction": {
|
| 202 |
+
"count": 72,
|
| 203 |
+
"num_samples": 1124,
|
| 204 |
+
"tasks": [],
|
| 205 |
+
"average_score": 0.3972495309304478
|
| 206 |
+
},
|
| 207 |
+
"Planning": {
|
| 208 |
+
"count": 78,
|
| 209 |
+
"num_samples": 1239,
|
| 210 |
+
"tasks": [],
|
| 211 |
+
"average_score": 0.18098305857595157
|
| 212 |
+
},
|
| 213 |
+
"Coding": {
|
| 214 |
+
"count": 31,
|
| 215 |
+
"num_samples": 474,
|
| 216 |
+
"tasks": [],
|
| 217 |
+
"average_score": 0.30887234822244314
|
| 218 |
+
},
|
| 219 |
+
"Perception": {
|
| 220 |
+
"count": 145,
|
| 221 |
+
"num_samples": 2315,
|
| 222 |
+
"tasks": [],
|
| 223 |
+
"average_score": 0.39256038521661607
|
| 224 |
+
},
|
| 225 |
+
"Metrics": {
|
| 226 |
+
"count": 20,
|
| 227 |
+
"num_samples": 309,
|
| 228 |
+
"tasks": [],
|
| 229 |
+
"average_score": 0.44924313486983725
|
| 230 |
+
},
|
| 231 |
+
"Science": {
|
| 232 |
+
"count": 29,
|
| 233 |
+
"num_samples": 574,
|
| 234 |
+
"tasks": [],
|
| 235 |
+
"average_score": 0.2880278656037017
|
| 236 |
+
},
|
| 237 |
+
"Knowledge": {
|
| 238 |
+
"count": 97,
|
| 239 |
+
"num_samples": 1605,
|
| 240 |
+
"tasks": [],
|
| 241 |
+
"average_score": 0.4015531477048036
|
| 242 |
+
},
|
| 243 |
+
"Mathematics": {
|
| 244 |
+
"count": 33,
|
| 245 |
+
"num_samples": 547,
|
| 246 |
+
"tasks": [],
|
| 247 |
+
"average_score": 0.24179792538224956
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
}
|
static/eval_results/Default/Qwen2_VL_7B/task_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/all_model_keywords_stats.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/eval_results/Default/all_summary.json
DELETED
|
@@ -1,525 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"GPT_4o": {
|
| 3 |
-
"core_noncot": {
|
| 4 |
-
"num_eval_tasks": 440,
|
| 5 |
-
"num_eval_samples": 6539,
|
| 6 |
-
"num_not_eval_samples": 0,
|
| 7 |
-
"macro_mean_score": 0.5203440930873326,
|
| 8 |
-
"micro_mean_score": 0.514302640282204
|
| 9 |
-
},
|
| 10 |
-
"core_cot": {
|
| 11 |
-
"num_eval_tasks": 440,
|
| 12 |
-
"num_eval_samples": 6539,
|
| 13 |
-
"num_not_eval_samples": 0,
|
| 14 |
-
"macro_mean_score": 0.5265030595065238,
|
| 15 |
-
"micro_mean_score": 0.5236338521693411
|
| 16 |
-
},
|
| 17 |
-
"open": {
|
| 18 |
-
"num_eval_tasks": 65,
|
| 19 |
-
"num_eval_samples": 1163,
|
| 20 |
-
"macro_mean_score": 0.6478225794744895,
|
| 21 |
-
"micro_mean_score": 0.665391229578676
|
| 22 |
-
},
|
| 23 |
-
"overall_score": 0.5421184432647768
|
| 24 |
-
},
|
| 25 |
-
"Gemini_1.5_pro_002": {
|
| 26 |
-
"core_noncot": {
|
| 27 |
-
"num_eval_tasks": 440,
|
| 28 |
-
"num_eval_samples": 6539,
|
| 29 |
-
"num_not_eval_samples": 0,
|
| 30 |
-
"macro_mean_score": 0.4699992918320008,
|
| 31 |
-
"micro_mean_score": 0.4651116133689296
|
| 32 |
-
},
|
| 33 |
-
"core_cot": {
|
| 34 |
-
"num_eval_tasks": 440,
|
| 35 |
-
"num_eval_samples": 6539,
|
| 36 |
-
"num_not_eval_samples": 0,
|
| 37 |
-
"macro_mean_score": 0.4822473962867704,
|
| 38 |
-
"micro_mean_score": 0.4764805563057179
|
| 39 |
-
},
|
| 40 |
-
"open": {
|
| 41 |
-
"num_eval_tasks": 65,
|
| 42 |
-
"num_eval_samples": 1163,
|
| 43 |
-
"macro_mean_score": 0.5858190649927173,
|
| 44 |
-
"micro_mean_score": 0.6104901117798793
|
| 45 |
-
},
|
| 46 |
-
"overall_score": 0.4955784031499121
|
| 47 |
-
},
|
| 48 |
-
"Gemini_1.5_flash_002": {
|
| 49 |
-
"core_noncot": {
|
| 50 |
-
"num_eval_tasks": 440,
|
| 51 |
-
"num_eval_samples": 6539,
|
| 52 |
-
"num_not_eval_samples": 0,
|
| 53 |
-
"macro_mean_score": 0.41898948981774853,
|
| 54 |
-
"micro_mean_score": 0.4127376993779598
|
| 55 |
-
},
|
| 56 |
-
"core_cot": {
|
| 57 |
-
"num_eval_tasks": 440,
|
| 58 |
-
"num_eval_samples": 6539,
|
| 59 |
-
"num_not_eval_samples": 0,
|
| 60 |
-
"macro_mean_score": 0.4189319021967416,
|
| 61 |
-
"micro_mean_score": 0.41567515414375245
|
| 62 |
-
},
|
| 63 |
-
"open": {
|
| 64 |
-
"num_eval_tasks": 65,
|
| 65 |
-
"num_eval_samples": 1163,
|
| 66 |
-
"macro_mean_score": 0.5691365176285039,
|
| 67 |
-
"micro_mean_score": 0.5987532244196045
|
| 68 |
-
},
|
| 69 |
-
"overall_score": 0.43831534488249924
|
| 70 |
-
},
|
| 71 |
-
"Claude_3.5": {
|
| 72 |
-
"core_noncot": {
|
| 73 |
-
"num_eval_tasks": 440,
|
| 74 |
-
"num_eval_samples": 6539,
|
| 75 |
-
"num_not_eval_samples": 0,
|
| 76 |
-
"macro_mean_score": 0.48800427486796155,
|
| 77 |
-
"micro_mean_score": 0.4814327812005499
|
| 78 |
-
},
|
| 79 |
-
"core_cot": {
|
| 80 |
-
"num_eval_tasks": 440,
|
| 81 |
-
"num_eval_samples": 6539,
|
| 82 |
-
"num_not_eval_samples": 0,
|
| 83 |
-
"macro_mean_score": 0.5040975742801586,
|
| 84 |
-
"micro_mean_score": 0.5002259116666758
|
| 85 |
-
},
|
| 86 |
-
"open": {
|
| 87 |
-
"num_eval_tasks": 65,
|
| 88 |
-
"num_eval_samples": 1163,
|
| 89 |
-
"macro_mean_score": 0.6373907158949892,
|
| 90 |
-
"micro_mean_score": 0.6569647463456579
|
| 91 |
-
},
|
| 92 |
-
"overall_score": 0.5212541172602853
|
| 93 |
-
},
|
| 94 |
-
"Claude_3.5_new": {
|
| 95 |
-
"core_noncot": {
|
| 96 |
-
"num_eval_tasks": 440,
|
| 97 |
-
"num_eval_samples": 6539,
|
| 98 |
-
"num_not_eval_samples": 0,
|
| 99 |
-
"macro_mean_score": 0.4919657684484185,
|
| 100 |
-
"micro_mean_score": 0.4874520567007144
|
| 101 |
-
},
|
| 102 |
-
"core_cot": {
|
| 103 |
-
"num_eval_tasks": 440,
|
| 104 |
-
"num_eval_samples": 6539,
|
| 105 |
-
"num_not_eval_samples": 0,
|
| 106 |
-
"macro_mean_score": 0.5259191914020757,
|
| 107 |
-
"micro_mean_score": 0.5230785894131227
|
| 108 |
-
},
|
| 109 |
-
"open": {
|
| 110 |
-
"num_eval_tasks": 65,
|
| 111 |
-
"num_eval_samples": 1163,
|
| 112 |
-
"macro_mean_score": 0.6563419761104125,
|
| 113 |
-
"micro_mean_score": 0.6724419604471196
|
| 114 |
-
},
|
| 115 |
-
"overall_score": 0.5427062825031487
|
| 116 |
-
},
|
| 117 |
-
"GPT_4o_mini": {
|
| 118 |
-
"core_noncot": {
|
| 119 |
-
"num_eval_tasks": 440,
|
| 120 |
-
"num_eval_samples": 6539,
|
| 121 |
-
"num_not_eval_samples": 0,
|
| 122 |
-
"macro_mean_score": 0.39854757130003565,
|
| 123 |
-
"micro_mean_score": 0.3936551517403452
|
| 124 |
-
},
|
| 125 |
-
"core_cot": {
|
| 126 |
-
"num_eval_tasks": 440,
|
| 127 |
-
"num_eval_samples": 6539,
|
| 128 |
-
"num_not_eval_samples": 0,
|
| 129 |
-
"macro_mean_score": 0.40767494558789397,
|
| 130 |
-
"micro_mean_score": 0.40431644154143376
|
| 131 |
-
},
|
| 132 |
-
"open": {
|
| 133 |
-
"num_eval_tasks": 65,
|
| 134 |
-
"num_eval_samples": 1163,
|
| 135 |
-
"macro_mean_score": 0.586537827213665,
|
| 136 |
-
"micro_mean_score": 0.6133276010318144
|
| 137 |
-
},
|
| 138 |
-
"overall_score": 0.43069690064863675
|
| 139 |
-
},
|
| 140 |
-
"Qwen2_VL_72B": {
|
| 141 |
-
"core_noncot": {
|
| 142 |
-
"num_eval_tasks": 440,
|
| 143 |
-
"num_eval_samples": 6539,
|
| 144 |
-
"num_not_eval_samples": 0,
|
| 145 |
-
"macro_mean_score": 0.46406654108789214,
|
| 146 |
-
"micro_mean_score": 0.4584702152011697
|
| 147 |
-
},
|
| 148 |
-
"core_cot": {
|
| 149 |
-
"num_eval_tasks": 440,
|
| 150 |
-
"num_eval_samples": 6539,
|
| 151 |
-
"num_not_eval_samples": 0,
|
| 152 |
-
"macro_mean_score": 0.4542376574527161,
|
| 153 |
-
"micro_mean_score": 0.4501201906164793
|
| 154 |
-
},
|
| 155 |
-
"open": {
|
| 156 |
-
"num_eval_tasks": 65,
|
| 157 |
-
"num_eval_samples": 1163,
|
| 158 |
-
"macro_mean_score": 0.5639771804231668,
|
| 159 |
-
"micro_mean_score": 0.5835339638865004
|
| 160 |
-
},
|
| 161 |
-
"overall_score": 0.4769263263488681
|
| 162 |
-
},
|
| 163 |
-
"Qwen2_VL_7B": {
|
| 164 |
-
"core_noncot": {
|
| 165 |
-
"num_eval_tasks": 440,
|
| 166 |
-
"num_eval_samples": 6539,
|
| 167 |
-
"num_not_eval_samples": 0,
|
| 168 |
-
"macro_mean_score": 0.3480020832611913,
|
| 169 |
-
"micro_mean_score": 0.3441858958345098
|
| 170 |
-
},
|
| 171 |
-
"core_cot": {
|
| 172 |
-
"num_eval_tasks": 440,
|
| 173 |
-
"num_eval_samples": 6539,
|
| 174 |
-
"num_not_eval_samples": 0,
|
| 175 |
-
"macro_mean_score": 0.3293449599230247,
|
| 176 |
-
"micro_mean_score": 0.325331493515679
|
| 177 |
-
},
|
| 178 |
-
"open": {
|
| 179 |
-
"num_eval_tasks": 65,
|
| 180 |
-
"num_eval_samples": 1170,
|
| 181 |
-
"macro_mean_score": 0.43955105763038577,
|
| 182 |
-
"micro_mean_score": 0.45508547008546996
|
| 183 |
-
},
|
| 184 |
-
"overall_score": 0.3597856146156421
|
| 185 |
-
},
|
| 186 |
-
"llava_onevision_72B": {
|
| 187 |
-
"core_noncot": {
|
| 188 |
-
"num_eval_tasks": 440,
|
| 189 |
-
"num_eval_samples": 6539,
|
| 190 |
-
"num_not_eval_samples": 0,
|
| 191 |
-
"macro_mean_score": 0.3199332158220174,
|
| 192 |
-
"micro_mean_score": 0.31770770553892647
|
| 193 |
-
},
|
| 194 |
-
"core_cot": {
|
| 195 |
-
"num_eval_tasks": 440,
|
| 196 |
-
"num_eval_samples": 6539,
|
| 197 |
-
"num_not_eval_samples": 0,
|
| 198 |
-
"macro_mean_score": 0.2974368415462532,
|
| 199 |
-
"micro_mean_score": 0.2956217833156672
|
| 200 |
-
},
|
| 201 |
-
"open": {
|
| 202 |
-
"num_eval_tasks": 65,
|
| 203 |
-
"num_eval_samples": 1163,
|
| 204 |
-
"macro_mean_score": 0.4599484231632498,
|
| 205 |
-
"micro_mean_score": 0.4850386930352536
|
| 206 |
-
},
|
| 207 |
-
"overall_score": 0.33795497518277007
|
| 208 |
-
},
|
| 209 |
-
"llava_onevision_7B": {
|
| 210 |
-
"core_noncot": {
|
| 211 |
-
"num_eval_tasks": 440,
|
| 212 |
-
"num_eval_samples": 6539,
|
| 213 |
-
"num_not_eval_samples": 0,
|
| 214 |
-
"macro_mean_score": 0.22409531510496777,
|
| 215 |
-
"micro_mean_score": 0.22238854298563537
|
| 216 |
-
},
|
| 217 |
-
"core_cot": {
|
| 218 |
-
"num_eval_tasks": 440,
|
| 219 |
-
"num_eval_samples": 6539,
|
| 220 |
-
"num_not_eval_samples": 0,
|
| 221 |
-
"macro_mean_score": 0.21362697219149712,
|
| 222 |
-
"micro_mean_score": 0.21073910058505504
|
| 223 |
-
},
|
| 224 |
-
"open": {
|
| 225 |
-
"num_eval_tasks": 65,
|
| 226 |
-
"num_eval_samples": 1163,
|
| 227 |
-
"macro_mean_score": 0.33979975321921935,
|
| 228 |
-
"micro_mean_score": 0.36474634565778147
|
| 229 |
-
},
|
| 230 |
-
"overall_score": 0.23898796555531696
|
| 231 |
-
},
|
| 232 |
-
"InternVL2_76B": {
|
| 233 |
-
"core_noncot": {
|
| 234 |
-
"num_eval_tasks": 440,
|
| 235 |
-
"num_eval_samples": 6539,
|
| 236 |
-
"num_not_eval_samples": 0,
|
| 237 |
-
"macro_mean_score": 0.3502244283768534,
|
| 238 |
-
"micro_mean_score": 0.3456783051732046
|
| 239 |
-
},
|
| 240 |
-
"core_cot": {
|
| 241 |
-
"num_eval_tasks": 440,
|
| 242 |
-
"num_eval_samples": 6539,
|
| 243 |
-
"num_not_eval_samples": 0,
|
| 244 |
-
"macro_mean_score": 0.3562710424410931,
|
| 245 |
-
"micro_mean_score": 0.35129859801162616
|
| 246 |
-
},
|
| 247 |
-
"open": {
|
| 248 |
-
"num_eval_tasks": 65,
|
| 249 |
-
"num_eval_samples": 1163,
|
| 250 |
-
"macro_mean_score": 0.5192997443033639,
|
| 251 |
-
"micro_mean_score": 0.5421324161650903
|
| 252 |
-
},
|
| 253 |
-
"overall_score": 0.3772549347599992
|
| 254 |
-
},
|
| 255 |
-
"InternVL2_8B": {
|
| 256 |
-
"core_noncot": {
|
| 257 |
-
"num_eval_tasks": 440,
|
| 258 |
-
"num_eval_samples": 6539,
|
| 259 |
-
"num_not_eval_samples": 0,
|
| 260 |
-
"macro_mean_score": 0.25956581776451815,
|
| 261 |
-
"micro_mean_score": 0.2546984460483302
|
| 262 |
-
},
|
| 263 |
-
"core_cot": {
|
| 264 |
-
"num_eval_tasks": 440,
|
| 265 |
-
"num_eval_samples": 6539,
|
| 266 |
-
"num_not_eval_samples": 0,
|
| 267 |
-
"macro_mean_score": 0.24090301358258295,
|
| 268 |
-
"micro_mean_score": 0.23819084111520938
|
| 269 |
-
},
|
| 270 |
-
"open": {
|
| 271 |
-
"num_eval_tasks": 65,
|
| 272 |
-
"num_eval_samples": 1165,
|
| 273 |
-
"macro_mean_score": 0.3978571701460552,
|
| 274 |
-
"micro_mean_score": 0.4108583690987125
|
| 275 |
-
},
|
| 276 |
-
"overall_score": 0.2773656948037259
|
| 277 |
-
},
|
| 278 |
-
"MiniCPM_v2.6": {
|
| 279 |
-
"core_noncot": {
|
| 280 |
-
"num_eval_tasks": 440,
|
| 281 |
-
"num_eval_samples": 6539,
|
| 282 |
-
"num_not_eval_samples": 0,
|
| 283 |
-
"macro_mean_score": 0.2287645706203155,
|
| 284 |
-
"micro_mean_score": 0.2249087742955901
|
| 285 |
-
},
|
| 286 |
-
"core_cot": {
|
| 287 |
-
"num_eval_tasks": 440,
|
| 288 |
-
"num_eval_samples": 6539,
|
| 289 |
-
"num_not_eval_samples": 0,
|
| 290 |
-
"macro_mean_score": 0.22955895202146906,
|
| 291 |
-
"micro_mean_score": 0.22560399396899078
|
| 292 |
-
},
|
| 293 |
-
"open": {
|
| 294 |
-
"num_eval_tasks": 65,
|
| 295 |
-
"num_eval_samples": 1163,
|
| 296 |
-
"macro_mean_score": 0.41728623355613875,
|
| 297 |
-
"micro_mean_score": 0.43452278589853827
|
| 298 |
-
},
|
| 299 |
-
"overall_score": 0.2537218694467236
|
| 300 |
-
},
|
| 301 |
-
"Phi-3.5-vision": {
|
| 302 |
-
"core_noncot": {
|
| 303 |
-
"num_eval_tasks": 440,
|
| 304 |
-
"num_eval_samples": 6539,
|
| 305 |
-
"num_not_eval_samples": 0,
|
| 306 |
-
"macro_mean_score": 0.23271251159409778,
|
| 307 |
-
"micro_mean_score": 0.2296262323791101
|
| 308 |
-
},
|
| 309 |
-
"core_cot": {
|
| 310 |
-
"num_eval_tasks": 440,
|
| 311 |
-
"num_eval_samples": 6539,
|
| 312 |
-
"num_not_eval_samples": 0,
|
| 313 |
-
"macro_mean_score": 0.22995297916629392,
|
| 314 |
-
"micro_mean_score": 0.22708502951025372
|
| 315 |
-
},
|
| 316 |
-
"open": {
|
| 317 |
-
"num_eval_tasks": 65,
|
| 318 |
-
"num_eval_samples": 1163,
|
| 319 |
-
"macro_mean_score": 0.3947914647737769,
|
| 320 |
-
"micro_mean_score": 0.42459157351676696
|
| 321 |
-
},
|
| 322 |
-
"overall_score": 0.25357415903306635
|
| 323 |
-
},
|
| 324 |
-
"Pixtral_12B": {
|
| 325 |
-
"core_noncot": {
|
| 326 |
-
"num_eval_tasks": 440,
|
| 327 |
-
"num_eval_samples": 6539,
|
| 328 |
-
"num_not_eval_samples": 0,
|
| 329 |
-
"macro_mean_score": 0.31905695620134694,
|
| 330 |
-
"micro_mean_score": 0.31556607913724777
|
| 331 |
-
},
|
| 332 |
-
"core_cot": {
|
| 333 |
-
"num_eval_tasks": 440,
|
| 334 |
-
"num_eval_samples": 6539,
|
| 335 |
-
"num_not_eval_samples": 0,
|
| 336 |
-
"macro_mean_score": 0.31362045151669854,
|
| 337 |
-
"micro_mean_score": 0.3100986209078182
|
| 338 |
-
},
|
| 339 |
-
"open": {
|
| 340 |
-
"num_eval_tasks": 65,
|
| 341 |
-
"num_eval_samples": 1163,
|
| 342 |
-
"macro_mean_score": 0.4566234428542061,
|
| 343 |
-
"micro_mean_score": 0.4870593293207223
|
| 344 |
-
},
|
| 345 |
-
"overall_score": 0.33676353369131895
|
| 346 |
-
},
|
| 347 |
-
"Llama_3_2_11B": {
|
| 348 |
-
"core_noncot": {
|
| 349 |
-
"num_eval_tasks": 440,
|
| 350 |
-
"num_eval_samples": 6539,
|
| 351 |
-
"num_not_eval_samples": 0,
|
| 352 |
-
"macro_mean_score": 0.10044261716549671,
|
| 353 |
-
"micro_mean_score": 0.09980638766828835
|
| 354 |
-
},
|
| 355 |
-
"core_cot": {
|
| 356 |
-
"num_eval_tasks": 440,
|
| 357 |
-
"num_eval_samples": 6539,
|
| 358 |
-
"num_not_eval_samples": 0,
|
| 359 |
-
"macro_mean_score": 0.15999641916771298,
|
| 360 |
-
"micro_mean_score": 0.15809331016967038
|
| 361 |
-
},
|
| 362 |
-
"open": {
|
| 363 |
-
"num_eval_tasks": 65,
|
| 364 |
-
"num_eval_samples": 1163,
|
| 365 |
-
"macro_mean_score": 0.3173342406187366,
|
| 366 |
-
"micro_mean_score": 0.3487962166809973
|
| 367 |
-
},
|
| 368 |
-
"overall_score": 0.1802478219287358
|
| 369 |
-
},
|
| 370 |
-
"Idefics3": {
|
| 371 |
-
"core_noncot": {
|
| 372 |
-
"num_eval_tasks": 440,
|
| 373 |
-
"num_eval_samples": 6539,
|
| 374 |
-
"num_not_eval_samples": 0,
|
| 375 |
-
"macro_mean_score": 0.11118980301103833,
|
| 376 |
-
"micro_mean_score": 0.11201785633274061
|
| 377 |
-
},
|
| 378 |
-
"core_cot": {
|
| 379 |
-
"num_eval_tasks": 440,
|
| 380 |
-
"num_eval_samples": 6539,
|
| 381 |
-
"num_not_eval_samples": 0,
|
| 382 |
-
"macro_mean_score": 0.08956972487602757,
|
| 383 |
-
"micro_mean_score": 0.08982225274252693
|
| 384 |
-
},
|
| 385 |
-
"open": {
|
| 386 |
-
"num_eval_tasks": 65,
|
| 387 |
-
"num_eval_samples": 1163,
|
| 388 |
-
"macro_mean_score": 0.3210866162255635,
|
| 389 |
-
"micro_mean_score": 0.35649183147033553
|
| 390 |
-
},
|
| 391 |
-
"overall_score": 0.138206224513898
|
| 392 |
-
},
|
| 393 |
-
"Aria": {
|
| 394 |
-
"core_noncot": {
|
| 395 |
-
"num_eval_tasks": 440,
|
| 396 |
-
"num_eval_samples": 6539,
|
| 397 |
-
"num_not_eval_samples": 0,
|
| 398 |
-
"macro_mean_score": 0.30485930718699694,
|
| 399 |
-
"micro_mean_score": 0.3016713629035311
|
| 400 |
-
},
|
| 401 |
-
"core_cot": {
|
| 402 |
-
"num_eval_tasks": 440,
|
| 403 |
-
"num_eval_samples": 6539,
|
| 404 |
-
"num_not_eval_samples": 0,
|
| 405 |
-
"macro_mean_score": 0.289073788209904,
|
| 406 |
-
"micro_mean_score": 0.2859007507765791
|
| 407 |
-
},
|
| 408 |
-
"open": {
|
| 409 |
-
"num_eval_tasks": 65,
|
| 410 |
-
"num_eval_samples": 1163,
|
| 411 |
-
"macro_mean_score": 0.5103725263180767,
|
| 412 |
-
"micro_mean_score": 0.5349957007738607
|
| 413 |
-
},
|
| 414 |
-
"overall_score": 0.3313115037088191
|
| 415 |
-
},
|
| 416 |
-
"NVLM": {
|
| 417 |
-
"core_noncot": {
|
| 418 |
-
"num_eval_tasks": 440,
|
| 419 |
-
"num_eval_samples": 6539,
|
| 420 |
-
"num_not_eval_samples": 0,
|
| 421 |
-
"macro_mean_score": 0.2420528895703979,
|
| 422 |
-
"micro_mean_score": 0.23838419989257642
|
| 423 |
-
},
|
| 424 |
-
"core_cot": {
|
| 425 |
-
"num_eval_tasks": 440,
|
| 426 |
-
"num_eval_samples": 6539,
|
| 427 |
-
"num_not_eval_samples": 0,
|
| 428 |
-
"macro_mean_score": 0.21589726765847422,
|
| 429 |
-
"micro_mean_score": 0.21406043849932396
|
| 430 |
-
},
|
| 431 |
-
"open": {
|
| 432 |
-
"num_eval_tasks": 65,
|
| 433 |
-
"num_eval_samples": 1163,
|
| 434 |
-
"macro_mean_score": 0.3478114310231307,
|
| 435 |
-
"micro_mean_score": 0.3947549441100602
|
| 436 |
-
},
|
| 437 |
-
"overall_score": 0.25566537510391796
|
| 438 |
-
},
|
| 439 |
-
"InternVL2_2B": {
|
| 440 |
-
"core_noncot": {
|
| 441 |
-
"num_eval_tasks": 440,
|
| 442 |
-
"num_eval_samples": 6539,
|
| 443 |
-
"num_not_eval_samples": 0,
|
| 444 |
-
"macro_mean_score": 0.09089701489596874,
|
| 445 |
-
"micro_mean_score": 0.09036328295381871
|
| 446 |
-
},
|
| 447 |
-
"core_cot": {
|
| 448 |
-
"num_eval_tasks": 440,
|
| 449 |
-
"num_eval_samples": 6539,
|
| 450 |
-
"num_not_eval_samples": 0,
|
| 451 |
-
"macro_mean_score": 0.13141974398938763,
|
| 452 |
-
"micro_mean_score": 0.13063500716262516
|
| 453 |
-
},
|
| 454 |
-
"open": {
|
| 455 |
-
"num_eval_tasks": 65,
|
| 456 |
-
"num_eval_samples": 1163,
|
| 457 |
-
"macro_mean_score": 0.23864417043743646,
|
| 458 |
-
"micro_mean_score": 0.24901117798796224
|
| 459 |
-
},
|
| 460 |
-
"overall_score": 0.14522090778963154
|
| 461 |
-
},
|
| 462 |
-
"Qwen2_VL_2B": {
|
| 463 |
-
"core_noncot": {
|
| 464 |
-
"num_eval_tasks": 440,
|
| 465 |
-
"num_eval_samples": 6539,
|
| 466 |
-
"num_not_eval_samples": 0,
|
| 467 |
-
"macro_mean_score": 0.16448220309703876,
|
| 468 |
-
"micro_mean_score": 0.1610710186451323
|
| 469 |
-
},
|
| 470 |
-
"core_cot": {
|
| 471 |
-
"num_eval_tasks": 440,
|
| 472 |
-
"num_eval_samples": 6539,
|
| 473 |
-
"num_not_eval_samples": 0,
|
| 474 |
-
"macro_mean_score": 0.20877163406364055,
|
| 475 |
-
"micro_mean_score": 0.20561526268932287
|
| 476 |
-
},
|
| 477 |
-
"open": {
|
| 478 |
-
"num_eval_tasks": 65,
|
| 479 |
-
"num_eval_samples": 1163,
|
| 480 |
-
"macro_mean_score": 0.3154302566225611,
|
| 481 |
-
"micro_mean_score": 0.33856405846947557
|
| 482 |
-
},
|
| 483 |
-
"overall_score": 0.22249997162072932
|
| 484 |
-
},
|
| 485 |
-
"Aquila_VL_2B": {
|
| 486 |
-
"core_noncot": {
|
| 487 |
-
"num_eval_tasks": 440,
|
| 488 |
-
"num_eval_samples": 6539,
|
| 489 |
-
"num_not_eval_samples": 0,
|
| 490 |
-
"macro_mean_score": 0.16317824309838627,
|
| 491 |
-
"micro_mean_score": 0.16198837245148487
|
| 492 |
-
},
|
| 493 |
-
"core_cot": {
|
| 494 |
-
"num_eval_tasks": 440,
|
| 495 |
-
"num_eval_samples": 6539,
|
| 496 |
-
"num_not_eval_samples": 0,
|
| 497 |
-
"macro_mean_score": 0.159970161379836,
|
| 498 |
-
"micro_mean_score": 0.15844711671722148
|
| 499 |
-
},
|
| 500 |
-
"open": {
|
| 501 |
-
"num_eval_tasks": 65,
|
| 502 |
-
"num_eval_samples": 1163,
|
| 503 |
-
"macro_mean_score": 0.24567572098570653,
|
| 504 |
-
"micro_mean_score": 0.2704213241616509
|
| 505 |
-
},
|
| 506 |
-
"overall_score": 0.17379673035120966
|
| 507 |
-
},
|
| 508 |
-
"Mammoth_VL": {
|
| 509 |
-
"core_noncot": {
|
| 510 |
-
"num_eval_tasks": 440,
|
| 511 |
-
"num_eval_samples": 6539,
|
| 512 |
-
"num_not_eval_samples": 0,
|
| 513 |
-
"macro_mean_score": 0.264052880412689,
|
| 514 |
-
"micro_mean_score": 0.2626894374387823
|
| 515 |
-
},
|
| 516 |
-
"core_cot": null,
|
| 517 |
-
"open": {
|
| 518 |
-
"num_eval_tasks": 65,
|
| 519 |
-
"num_eval_samples": 1163,
|
| 520 |
-
"macro_mean_score": 0.37992668750165337,
|
| 521 |
-
"micro_mean_score": 0.40120378331900275
|
| 522 |
-
},
|
| 523 |
-
"overall_score": 0.27896733083008046
|
| 524 |
-
}
|
| 525 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|