mshuaibi committed
Commit ed02609 · 1 Parent(s): b8a7697

speed up s2ef

Files changed (3)
  1. app.py +1 -1
  2. content.py +1 -2
  3. evaluator.py +77 -95
app.py CHANGED
@@ -324,7 +324,7 @@ def add_new_eval(
         return

     # Evaluate the submission
-    yield "⚙️ Evaluating your submission..."
+    yield "⚙️ Evaluating your submission...(do not close/refresh this page!)"
    metrics = evaluate(
        leaderboard_data.target_paths[eval_type],
        path_to_file,
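For context: `add_new_eval` is a generator handler, so each `yield` streams a status string to the page while the evaluation runs. A minimal sketch of the pattern, assuming a Gradio app (the handler name and wiring below are illustrative, not the Space's actual code):

import time
import gradio as gr

def add_new_eval_demo(file):
    # Yielding from a Gradio event handler streams interim status
    # text to the output component while the work proceeds.
    yield "⚙️ Evaluating your submission...(do not close/refresh this page!)"
    time.sleep(5)  # stands in for the real evaluate(...) call
    yield "✅ Success!"

demo = gr.Interface(fn=add_new_eval_demo, inputs=gr.File(), outputs=gr.Textbox())
demo.launch()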
content.py CHANGED
@@ -39,8 +39,7 @@ Users are limited to 5 successful submissions per month for each evaluation type
 - Ensure your prediction file format matches the expected format for the selected evaluation
 - Your email will be stored privately and only used for communication regarding your submission
 - Results will appear on the leaderboard after successful validation
-- Remain on the page until you see the "Success" message.
-- S2EF evaluations can take 10-20 minutes, the other evaluations happen in a few minutes. Please be patient.
+- Remain on the page until you see the "Success" message. Evaluations can take several minutes, please be patient.
 - If you wish to have your model removed from the leaderboard please reach out to mshuaibi@meta.com with the model name and submission date.

 This leaderboard is actively being developed and we are always open to feedback. If you run into any issues or have a question please
evaluator.py CHANGED
@@ -35,79 +35,6 @@ OMOL_DATA_ID_MAPPING = {
 }


-def npz_2_s2ef_input(npz_input_file: Path, subset: str) -> Dict[str, torch.tensor]:
-    with np.load(npz_input_file, allow_pickle=True) as data:
-        forces = data["forces"]
-        energy = data["energy"]
-        data_ids = np.array(data["data_ids"])
-
-    out_energy = []
-    out_forces = []
-    out_atoms = []
-
-    order = range(len(forces))
-    for x in order:
-        data_id = data_ids[x]
-        if subset == "all" or data_id in OMOL_DATA_ID_MAPPING.get(subset, []):
-            out_energy.append(energy[x])
-            force_array = forces[x]
-            out_forces.append(torch.tensor(force_array, dtype=torch.float32))
-            out_atoms.append(len(force_array))
-
-    energy = torch.tensor(out_energy)
-    out_forces = torch.cat(out_forces, dim=0)
-    out_dict = {
-        "energy": energy.float(),
-        "forces": out_forces,
-        "natoms": torch.tensor(out_atoms),
-    }
-
-    return out_dict
-
-
-def npz_2_s2ef_submission(
-    npz_input_file: Path, order: List[int], subset: str = "All"
-) -> Dict[str, torch.tensor]:
-    with np.load(npz_input_file) as data:
-        forces = data["forces"]
-        energy = data["energy"]
-        natoms = data["natoms"]
-        data_ids = data["data_ids"]
-    forces = np.split(forces, np.cumsum(natoms)[:-1])
-
-    # check for infs
-    if len(set(np.where(np.isinf(energy))[0])) != 0:
-        inf_energy_ids = list(set(np.where(np.isinf(energy))[0]))
-        raise Exception(
-            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3]}, ...)"
-        )
-
-    out_energy = []
-    out_forces = []
-    out_atoms = []
-
-    if order is None:
-        order = range(len(forces))
-
-    for x in order:
-        data_id = data_ids[x]
-        if subset == "all" or data_id in OMOL_DATA_ID_MAPPING.get(subset, []):
-            out_energy.append(energy[x])
-            force_array = forces[x]
-            out_forces.append(torch.tensor(force_array, dtype=torch.float32))
-            out_atoms.append(force_array.shape[0])
-
-    energy = torch.tensor(out_energy)
-    out_forces = torch.cat(out_forces, dim=0)
-    out_dict = {
-        "energy": energy.float().squeeze(),
-        "forces": out_forces,
-        "natoms": torch.tensor(out_atoms),
-    }
-
-    return out_dict
-
-
 def reorder(ref: np.ndarray, to_reorder: np.ndarray) -> np.ndarray:
     """
     Get the ordering so that `to_reorder[ordering]` == ref.
@@ -146,9 +73,13 @@ def get_order(path_submission: Path, path_annotations: Path):
     with np.load(path_annotations, allow_pickle=True) as data:
         annotations_ids = data["ids"]

-    if set(submission_ids) != set(annotations_ids):
-        missing_ids = set(annotations_ids) - set(submission_ids)
-        unexpected_ids = set(submission_ids) - set(annotations_ids)
+    # Use sets for faster comparison
+    submission_set = set(submission_ids)
+    annotations_set = set(annotations_ids)
+
+    if submission_set != annotations_set:
+        missing_ids = annotations_set - submission_set
+        unexpected_ids = submission_set - annotations_set

         details = (
             f"{len(missing_ids)} missing IDs: ({list(missing_ids)[:3]}, ...)\n"
@@ -159,39 +90,79 @@ def s2ef_metrics(
     return reorder(annotations_ids, submission_ids)


-def extract_and_align(
-    path_submission: Path,
-    path_annotations: Path,
-    subset: str,
-) -> Tuple[Dict[str, torch.tensor], Dict[str, torch.tensor]]:
-
-    order = get_order(path_submission, path_annotations)
-
-    submission_data = npz_2_s2ef_submission(path_submission, order, subset)
-    annotations_data = npz_2_s2ef_input(path_annotations, subset)
-
-    return submission_data, annotations_data
-
-
 def s2ef_metrics(
     annotations_path: Path,
     submission_filename: Path,
     subsets: list = ["all"],
 ) -> Dict[str, float]:
-    evaluator = Evaluator(task="s2ef")
+    eval_metrics = {
+        "energy": ["mae"],
+        "forces": ["mae"],
+    }
+    evaluator = Evaluator(eval_metrics=eval_metrics)
+
+    # Get order once for all subsets
+    order = get_order(submission_filename, annotations_path)
+
+    with np.load(submission_filename) as data:
+        forces = data["forces"]
+        energy = data["energy"][order]
+        natoms = data["natoms"]
+    forces = np.array(np.split(forces, np.cumsum(natoms)[:-1]), dtype=object)[order]
+    natoms = natoms[order]
+
+    if len(set(np.where(np.isinf(energy))[0])) != 0:
+        inf_energy_ids = list(set(np.where(np.isinf(energy))[0]))
+        raise Exception(
+            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3]}, ...)"
+        )
+
+    with np.load(annotations_path, allow_pickle=True) as data:
+        target_forces = data["forces"]
+        target_energy = data["energy"]
+        target_data_ids = data["data_ids"]

     metrics = {}
     for subset in subsets:
-        submission_data, annotations_data = extract_and_align(
-            submission_filename,
-            annotations_path,
-            subset,
+        if subset == "all":
+            subset_mask = np.ones(len(target_data_ids), dtype=bool)
+        else:
+            allowed_ids = set(OMOL_DATA_ID_MAPPING.get(subset, []))
+            subset_mask = np.array(
+                [data_id in allowed_ids for data_id in target_data_ids]
+            )
+
+        sub_energy = torch.from_numpy(energy[subset_mask])
+        sub_forces = torch.from_numpy(np.concatenate(forces[subset_mask]))
+        sub_natoms = torch.from_numpy(natoms[subset_mask])
+
+        submission_data = {
+            "energy": sub_energy,
+            "forces": sub_forces,
+            "natoms": sub_natoms,
+        }
+
+        target_energy_tensor = torch.from_numpy(target_energy[subset_mask])
+        target_force_tensors = torch.from_numpy(
+            np.concatenate(target_forces[subset_mask])
+        )
+        target_natoms_tensor = torch.tensor(
+            [force_array.shape[0] for force_array in target_forces[subset_mask]],
+            dtype=torch.long,
         )
+
+        annotations_data = {
+            "energy": target_energy_tensor,
+            "forces": target_force_tensors,
+            "natoms": target_natoms_tensor,
+        }
+
         subset_metrics = evaluator.eval(
             submission_data, annotations_data, prev_metrics={}
         )
         for key in ["energy_mae", "forces_mae"]:
             metrics[f"{subset}_{key}"] = subset_metrics[key]["metric"]
+
     return metrics
 
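The speedup replaces the per-sample Python loops of the deleted helpers with a single pass: load each .npz once, split the flat force array into per-structure blocks, and apply the ID ordering and subset selection with NumPy indexing. A minimal sketch of that pattern on toy data (shapes and IDs are invented):

import numpy as np

# Flat storage as in the submission .npz: forces stacked over all atoms,
# natoms giving each structure's atom count.
natoms = np.array([2, 3, 1])
forces = np.arange(18, dtype=np.float32).reshape(6, 3)  # 6 atoms total
energy = np.array([-1.0, -2.0, -3.0])
data_ids = np.array(["solvated", "metal", "solvated"])  # per-structure labels

# One split into per-structure blocks; cumsum gives the cut points.
blocks = np.split(forces, np.cumsum(natoms)[:-1])

# Ragged blocks go into an object array so a permutation (e.g. from
# get_order-style ID matching) applies with one fancy-indexing call.
order = np.array([2, 0, 1])
blocks = np.array(blocks, dtype=object)[order]
energy, natoms, data_ids = energy[order], natoms[order], data_ids[order]

# Each subset is then just a boolean mask over structures; no re-reading.
allowed = {"solvated"}  # stands in for OMOL_DATA_ID_MAPPING[subset]
mask = np.array([d in allowed for d in data_ids])
sub_energy = energy[mask]                  # shape (2,)
sub_forces = np.concatenate(blocks[mask])  # shape (3, 3): 1-atom + 2-atom blocks
sub_natoms = natoms[mask]                  # array([1, 2])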
@@ -204,6 +175,17 @@ def omol_evaluations(
         submission_data = json.load(f)
     with open(annotations_path) as f:
         annotations_data = json.load(f)
+
+    submission_entries = set(submission_data.keys())
+    annotation_entries = set(annotations_data.keys())
+    if submission_entries != annotation_entries:
+        missing = annotation_entries - submission_entries
+        unexpected = submission_entries - annotation_entries
+        raise ValueError(
+            f"Submission and annotations entries do not match.\n"
+            f"Missing entries in submission: {missing}\n"
+            f"Unexpected entries in submission: {unexpected}"
+        )
     eval_fn = OMOL_EVAL_FUNCTIONS.get(eval_type)
     metrics = eval_fn(annotations_data, submission_data)
     return metrics
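The added guard gives the JSON-based evaluations the same fail-fast behavior: mismatched keys raise immediately with both sides of the difference. A toy run (entries invented):

submission_data = {"sys_a": 1.0, "sys_b": 2.0}
annotations_data = {"sys_a": 0.9, "sys_c": 3.1}

submission_entries = set(submission_data.keys())
annotation_entries = set(annotations_data.keys())
if submission_entries != annotation_entries:
    missing = annotation_entries - submission_entries      # {'sys_c'}
    unexpected = submission_entries - annotation_entries   # {'sys_b'}
    raise ValueError(
        f"Submission and annotations entries do not match.\n"
        f"Missing entries in submission: {missing}\n"
        f"Unexpected entries in submission: {unexpected}"
    )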
 