MEGA-Bench

Running

cccjc committed on May 19

Commit

2b8200c

1 Parent(s): 0396eb4

update

Files changed (3) hide show

constants.py CHANGED Viewed

@@ -261,6 +261,9 @@ MODEL_URLS = {
     "InternVL3_14B": "https://huggingface.co/OpenGVLab/InternVL3-14B",
     "InternVL3_38B": "https://huggingface.co/OpenGVLab/InternVL3-38B",
     "InternVL3_78B": "https://huggingface.co/OpenGVLab/InternVL3-78B",
 }
 # Define the base MODEL_GROUPS structure

     "InternVL3_14B": "https://huggingface.co/OpenGVLab/InternVL3-14B",
     "InternVL3_38B": "https://huggingface.co/OpenGVLab/InternVL3-38B",
     "InternVL3_78B": "https://huggingface.co/OpenGVLab/InternVL3-78B",
+    "GPT-o1": "https://openai.com/o1/",
+    "GPT-o1-mini": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
+    "Seed1.5-VL": "https://github.com/ByteDance-Seed/Seed1.5-VL",
 }
 # Define the base MODEL_GROUPS structure

static/eval_results/Default/self_reported.json CHANGED Viewed

@@ -1,8 +1,25 @@
 {
-    "MiniMax-VL-01": 47.4,
-    "Qwen2.5-VL-72B": 51.3,
-    "Qwen2.5-VL-7B": 36.8,
-    "Qwen2.5-VL-3B": 28.9,
-    "GPT-o1": 58.0,
-    "GPT-o1-mini": 54.2
 }

 {
+    "MiniMax-VL-01": {
+        "overall": 47.4
+    },
+    "Qwen2.5-VL-72B": {
+        "overall": 51.3
+    },
+    "Qwen2.5-VL-7B": {
+        "overall": 36.8
+    },
+    "Qwen2.5-VL-3B": {
+        "overall": 28.9
+    },
+    "GPT-o1": {
+        "overall": 58.0
+    },
+    "GPT-o1-mini": {
+        "overall": 54.2
+    },
+    "Seed1.5-VL": {
+        "overall": 59.85,
+        "core": 58.58,
+        "open": 68.46
+    }
 }

utils.py CHANGED Viewed

@@ -106,11 +106,20 @@ class MEGABenchEvalDataLoader:
             # Add asterisk for self-reported results
             if model in self.SELF_REPORTED:
                 # Store numeric value for sorting but display with asterisk
-                row["Overall"] = self.SELF_REPORTED[model]
-                row["Overall_display"] = f"{self.SELF_REPORTED[model]:.2f}*"
-                row["Core"] = None
-                row["Open-ended"] = None
                 for display_name in self.SUPER_GROUPS[selected_super_group]:
                     row[display_name] = None
             else:

             # Add asterisk for self-reported results
             if model in self.SELF_REPORTED:
+                model_scores = self.SELF_REPORTED[model]
                 # Store numeric value for sorting but display with asterisk
+                row["Overall"] = model_scores["overall"]
+                row["Overall_display"] = f"{model_scores['overall']:.2f}*"
+                # Handle optional core and open scores
+                core_score = model_scores.get("core")
+                open_score = model_scores.get("open")
+                row["Core"] = core_score
+                row["Open-ended"] = open_score
+                # Add asterisk to core and open scores if they exist
+                if core_score is not None:
+                    row["Core"] = f"{core_score:.2f}*"
+                if open_score is not None:
+                    row["Open-ended"] = f"{open_score:.2f}*"
                 for display_name in self.SUPER_GROUPS[selected_super_group]:
                     row[display_name] = None
             else: