Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,910 Bytes
925b37d ed02609 925b37d ed02609 5433f8c ed02609 925b37d ed02609 5433f8c ed02609 5433f8c ed02609 5433f8c ed02609 925b37d ed02609 925b37d 5433f8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
import logging
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import torch
import json
from fairchem.data.omol.modules.evaluator import (
ligand_pocket,
ligand_strain,
geom_conformers,
protonation_energies,
unoptimized_ie_ea,
distance_scaling,
unoptimized_spin_gap,
)
# Maps each evaluation-task name (the `eval_type` string passed to `evaluate`)
# to the fairchem evaluator callable that implements it. Each callable is
# invoked as fn(annotations_data, submission_data) and returns a metrics dict.
OMOL_EVAL_FUNCTIONS = {
    "Ligand pocket": ligand_pocket,
    "Ligand strain": ligand_strain,
    "Conformers": geom_conformers,
    "Protonation": protonation_energies,
    "IE_EA": unoptimized_ie_ea,
    "Distance scaling": distance_scaling,
    "Spin gap": unoptimized_spin_gap,
}
# Maps a public subset name (as used in `s2ef_metrics(subsets=...)`) to the
# internal `data_ids` values stored in the annotations file. A subset may
# aggregate several source datasets (e.g. "neutral_organics"); note that
# "electrolytes" is stored internally as "elytes".
OMOL_DATA_ID_MAPPING = {
    "metal_complexes": ["metal_complexes"],
    "electrolytes": ["elytes"],
    "biomolecules": ["biomolecules"],
    "neutral_organics": ["ani2x", "orbnet_denali", "geom_orca6", "trans1x", "rgd"],
}
def reorder(ref: np.ndarray, to_reorder: np.ndarray) -> np.ndarray:
    """
    Get the ordering so that ``to_reorder[ordering] == ref``.

    e.g.::

        ref = [c, a, b]
        to_reorder = [b, a, c]
        order = reorder(ref, to_reorder)  # [2, 1, 0]
        assert ref == to_reorder[order]

    Parameters
    ----------
    ref : np.ndarray
        Reference array. Must not contain duplicates.
    to_reorder : np.ndarray
        Array to re-order. Must not contain duplicates.
        Items must be the same as in `ref`.

    Returns
    -------
    np.ndarray
        The ordering to apply to `to_reorder`.

    Raises
    ------
    ValueError
        If either array contains duplicates, or the two arrays do not
        hold the same set of items.
    """
    # Explicit raises instead of `assert` so validation survives `python -O`.
    if len(ref) != len(set(ref)):
        raise ValueError("`ref` must not contain duplicates.")
    if len(to_reorder) != len(set(to_reorder)):
        raise ValueError("`to_reorder` must not contain duplicates.")
    if set(ref) != set(to_reorder):
        raise ValueError("`ref` and `to_reorder` must contain the same items.")
    item_to_idx = {item: idx for idx, item in enumerate(to_reorder)}
    return np.array([item_to_idx[item] for item in ref])
def get_order(path_submission: Path, path_annotations: Path) -> np.ndarray:
    """
    Compute the permutation that re-orders the submission entries into the
    annotations' ID order.

    Both files are ``.npz`` archives containing (at least) an ``ids`` array.

    Parameters
    ----------
    path_submission : Path
        Path to the submission ``.npz`` file.
    path_annotations : Path
        Path to the annotations ``.npz`` file (may contain pickled arrays).

    Returns
    -------
    np.ndarray
        Index array such that ``submission_ids[order] == annotations_ids``.

    Raises
    ------
    ValueError
        If the two files do not contain exactly the same set of IDs.
    """
    with np.load(path_submission) as data:
        submission_ids = data["ids"]
    with np.load(path_annotations, allow_pickle=True) as data:
        annotations_ids = data["ids"]
    # Sets make the symmetric-difference check O(n) instead of O(n^2).
    submission_set = set(submission_ids)
    annotations_set = set(annotations_ids)
    if submission_set != annotations_set:
        missing_ids = annotations_set - submission_set
        unexpected_ids = submission_set - annotations_set
        details = (
            f"{len(missing_ids)} missing IDs: ({list(missing_ids)[:3]}, ...)\n"
            f"{len(unexpected_ids)} unexpected IDs: ({list(unexpected_ids)[:3]}, ...)"
        )
        # ValueError (a subclass of Exception) keeps any existing
        # `except Exception` callers working while being a precise error type.
        raise ValueError(f"IDs don't match.\n{details}")
    return reorder(annotations_ids, submission_ids)
def s2ef_metrics(
    annotations_path: Path,
    submission_filename: Path,
    subsets: Tuple[str, ...] = ("all",),
) -> Dict[str, float]:
    """
    Compute per-subset energy and forces MAE for an S2EF submission.

    Parameters
    ----------
    annotations_path : Path
        ``.npz`` with ``ids``, ``energy``, ``forces`` (object array of
        per-structure arrays) and ``data_ids`` (dataset tag per structure).
    submission_filename : Path
        ``.npz`` with ``ids``, ``energy``, ``natoms`` and a flat ``forces``
        array — presumably shape (sum(natoms), 3); TODO confirm with the
        submission format spec.
    subsets : Tuple[str, ...], optional
        Subset names to report: "all" or keys of ``OMOL_DATA_ID_MAPPING``.
        (Immutable tuple default avoids the mutable-default-argument pitfall.)

    Returns
    -------
    Dict[str, float]
        ``{subset}_energy_mae`` and ``{subset}_forces_mae`` per subset.

    Raises
    ------
    ValueError
        If the submission energies contain inf values, or a subset name is
        unknown.
    """
    order = get_order(submission_filename, annotations_path)
    with np.load(submission_filename) as data:
        energy = data["energy"][order]
        # Split the flat per-atom forces into one array per structure, then
        # permute to match the annotation ordering.
        forces = np.array(
            np.split(data["forces"], np.cumsum(data["natoms"])[:-1]), dtype=object
        )[order]
    inf_idx = np.flatnonzero(np.isinf(energy))
    if inf_idx.size:
        inf_energy_ids = list(inf_idx)
        raise ValueError(
            f"Inf values found in `energy` for IDs: ({inf_energy_ids[:3]}, ...)"
        )
    with np.load(annotations_path, allow_pickle=True) as data:
        target_forces = data["forces"]
        target_energy = data["energy"]
        target_data_ids = data["data_ids"]
    metrics: Dict[str, float] = {}
    for subset in subsets:
        if subset == "all":
            subset_mask = np.ones(len(target_data_ids), dtype=bool)
        elif subset in OMOL_DATA_ID_MAPPING:
            allowed_ids = set(OMOL_DATA_ID_MAPPING[subset])
            subset_mask = np.array(
                [data_id in allowed_ids for data_id in target_data_ids]
            )
        else:
            # Fail loudly instead of producing an empty mask and a cryptic
            # ZeroDivisionError below.
            raise ValueError(f"Unknown subset: {subset}")
        metrics[f"{subset}_energy_mae"] = np.mean(
            np.abs(target_energy[subset_mask] - energy[subset_mask])
        )
        abs_err = 0.0
        natoms = 0
        for sub_forces, sub_target_forces in zip(
            forces[subset_mask], target_forces[subset_mask]
        ):
            abs_err += np.sum(np.abs(sub_target_forces - sub_forces))
            natoms += sub_forces.shape[0]
        # Mean absolute error over all 3 * natoms force components.
        metrics[f"{subset}_forces_mae"] = abs_err / (3 * natoms)
    return metrics
def omol_evaluations(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """
    Run one OMol task-specific evaluation on a JSON submission.

    Parameters
    ----------
    annotations_path : Path
        JSON file of ground-truth entries, keyed by entry ID.
    submission_filename : Path
        JSON file of submitted entries; must have exactly the same keys.
    eval_type : str
        Name of the evaluation; must be a key of ``OMOL_EVAL_FUNCTIONS``.

    Returns
    -------
    Dict[str, float]
        Metrics dict produced by the task's evaluator.

    Raises
    ------
    ValueError
        If the entry keys differ between the files, or `eval_type` is unknown.
    """
    with open(submission_filename) as f:
        submission_data = json.load(f)
    with open(annotations_path) as f:
        annotations_data = json.load(f)
    submission_entries = set(submission_data.keys())
    annotation_entries = set(annotations_data.keys())
    if submission_entries != annotation_entries:
        missing = annotation_entries - submission_entries
        unexpected = submission_entries - annotation_entries
        raise ValueError(
            f"Submission and annotations entries do not match.\n"
            f"Missing entries in submission: {missing}\n"
            f"Unexpected entries in submission: {unexpected}"
        )
    eval_fn = OMOL_EVAL_FUNCTIONS.get(eval_type)
    if eval_fn is None:
        # Guard against a cryptic "'NoneType' object is not callable".
        raise ValueError(f"Unknown eval_type: {eval_type}")
    return eval_fn(annotations_data, submission_data)
def evaluate(
    annotations_path: Path,
    submission_filename: Path,
    eval_type: str,
) -> Dict[str, float]:
    """
    Dispatch an evaluation to the right backend.

    "Validation"/"Test" run the S2EF energy+forces metrics over all subsets;
    any key of ``OMOL_EVAL_FUNCTIONS`` runs the matching task evaluation.

    Raises
    ------
    ValueError
        If `eval_type` is not recognized.
    """
    # NOTE(review): the original file ended with a stray "|" extraction
    # artifact after `return metrics`, which was a syntax error; removed.
    if eval_type in ("Validation", "Test"):
        return s2ef_metrics(
            annotations_path,
            submission_filename,
            subsets=[
                "all",
                "metal_complexes",
                "electrolytes",
                "biomolecules",
                "neutral_organics",
            ],
        )
    if eval_type in OMOL_EVAL_FUNCTIONS:
        return omol_evaluations(
            annotations_path,
            submission_filename,
            eval_type,
        )
    raise ValueError(f"Unknown eval_type: {eval_type}")