valence-labs · FNTwin · Jul 29, 2024 · Aug 2, 2024 · Aug 2, 2024 · Aug 2, 2024
@@ -31,6 +31,7 @@ def get_project_root():
     "COMP6": "openqdc.datasets.potential.comp6",
     "GDML": "openqdc.datasets.potential.gdml",
     "Molecule3D": "openqdc.datasets.potential.molecule3d",
+    "MACEOFF": "openqdc.datasets.potential.maceoff",
     "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
     "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
     "QM7X": "openqdc.datasets.potential.qm7x",
@@ -122,6 +123,7 @@ def __dir__():
     from .datasets.potential.gdml import GDML
     from .datasets.potential.geom import GEOM
     from .datasets.potential.iso_17 import ISO17
+    from .datasets.potential.maceoff import MACEOFF
     from .datasets.potential.md22 import MD22
     from .datasets.potential.molecule3d import Molecule3D
     from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2

@@ -5,6 +5,7 @@
 from .gdml import GDML
 from .geom import GEOM
 from .iso_17 import ISO17
+from .maceoff import MACEOFF
 from .md22 import MD22
 from .molecule3d import Molecule3D
 from .multixcqm9 import MultixcQM9, MultixcQM9_V2
@@ -38,6 +39,7 @@
     "GEOM": GEOM,
     "ISO17": ISO17,
     "Molecule3D": Molecule3D,
+    "MACEOFF": MACEOFF,
     "NablaDFT": NablaDFT,
     "OrbnetDenali": OrbnetDenali,
     "PCQM_B3LYP": PCQM_B3LYP,

@@ -0,0 +1,133 @@
+import re
+from functools import partial
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.constants import ATOMIC_NUMBERS
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+
+def parse_mace_xyz(xyzpath):
+    """
+    Lazily parse the frames of a MACE-OFF extended XYZ file.
+
+    Each frame is read as an atom-count line, a comment line carrying
+    ``energy=...``, ``config_type=...`` and optionally ``smiles=...``,
+    followed by one line per atom made of exactly ten whitespace-separated
+    columns: element symbol, x, y, z, fx, fy, fz and three trailing columns
+    that are ignored here.
+
+    Blank lines found where an atom count is expected (for instance a trailing
+    newline at the end of the file) are skipped. A final frame that is cut
+    short by the end of the file is not yielded.
+
+    Parameters:
+        xyzpath: path of the xyz file to read.
+
+    Yields:
+        Tuples ``(energy, numbers, positions, forces, smiles, subset)`` where
+        ``energy`` is a float, ``numbers`` holds the ``ATOMIC_NUMBERS`` entry of
+        every element symbol, ``positions`` and ``forces`` are lists of
+        three-float lists, and ``smiles`` / ``subset`` are strings with double
+        quotes removed (``smiles`` is ``""`` when the comment line has none).
+
+    Raises:
+        AttributeError: if a comment line lacks ``energy=`` or ``config_type=``.
+        ValueError: if a count, energy or atom line cannot be parsed.
+    """
+    energy_re = re.compile(r"energy=(\S+)")
+    smiles_re = re.compile(r"smiles=(\S+)")
+    subset_re = re.compile(r"config_type=(\S+)")
+    with open(xyzpath, "r") as f:
+        n_atoms = None
+        counter = 0
+        positions = []
+        numbers = []
+        forces = []
+        energy = None
+        for line in f:
+            if n_atoms is None:
+                if not line.strip():
+                    # tolerate blank separators / trailing newline instead of
+                    # failing on int("")
+                    continue
+                # first line of a frame: atom count, start fresh accumulators
+                n_atoms = int(line)
+                positions = []
+                numbers = []
+                forces = []
+                energy = None
+                counter = 1
+                continue
+            if counter == 1:
+                # second line of a frame: key=value properties
+                props = line
+                energy = float(energy_re.search(props).group(1))
+                # quotes are stripped once per frame rather than once per atom
+                subset = subset_re.search(props).group(1).replace('"', "")
+                smiles_match = smiles_re.search(props)
+                # water and qmugs subsets do not have smiles
+                smiles = smiles_match.group(1).replace('"', "") if smiles_match else ""
+                counter = 2
+                continue
+            # atom line: the last three columns are not used
+            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
+            numbers.append(ATOMIC_NUMBERS[el])
+            positions.append([float(x), float(y), float(z)])
+            forces.append([float(fx), float(fy), float(fz)])
+            counter += 1
+            if counter == n_atoms + 2:
+                # count line + comment line + n_atoms atom lines consumed
+                n_atoms = None
+                yield energy, numbers, positions, forces, smiles, subset
+
+
+def build_data_object(data, split):
+    """
+    Turn one parsed frame into the dictionary of arrays stored for a conformer.
+
+    Parameters:
+        data: tuple ``(energy, numbers, positions, forces, smiles, subset)``
+            as yielded by ``parse_mace_xyz``.
+        split: label stored under the ``"split"`` key.
+
+    Returns:
+        dict with keys ``name``, ``subset``, ``energies``, ``forces``,
+        ``atomic_inputs``, ``n_atoms`` and ``split``; ``atomic_inputs`` has
+        five columns (two per-atom feature columns followed by x, y, z).
+    """
+    energy, numbers, positions, forces, smiles, subset = data
+    if smiles != "":
+        atom_features = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
+    else:
+        # no SMILES available: take the atomic numbers read from the file and
+        # pair each with a zero in the second (presumably charge) column
+        number_column = np.array(numbers)[:, None]
+        zero_column = np.zeros((len(numbers), 1))
+        atom_features = np.concatenate((number_column, zero_column), axis=-1)
+    return {
+        "name": np.array([smiles]),
+        "subset": np.array([subset]),
+        "energies": np.array([[energy]], dtype=np.float64),
+        "forces": np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
+        "atomic_inputs": np.concatenate(
+            (atom_features, np.array(positions)), axis=-1, dtype=np.float32
+        ).reshape(-1, 5),
+        "n_atoms": np.array([atom_features.shape[0]], dtype=np.int32),
+        "split": np.array([split]),
+    }
+
+
+class MACEOFF(BaseDataset):
+    """
+    The core of the MACEOFF dataset is the Spice V1 dataset.
+    95% of the data is used for training and validation under the "train" split,
+    and the remaining 5% for testing. The dataset uses the Spice level of theory,
+    ωB97M-D3(BJ)/def2-TZVPPD, as implemented in the PSI4 software.
+    MACEOFF keeps the subset of SPICE whose molecules contain only the ten chemical
+    elements H, C, N, O, F, P, S, Cl, Br, and I and have a neutral formal charge;
+    it does not contain ion pairs. To facilitate the learning of intramolecular
+    non-bonded interactions, MACEOFF also contains larger 50–90 atom molecules
+    randomly selected from the QMugs dataset.
+    In addition, MACEOFF contains a number of water clusters carved out of molecular
+    dynamics simulations of liquid water, with sizes of up to 50 water molecules,
+    and part of the COMP6 tripeptide geometry dataset.
+
+    Usage:
+    ```python
+    from openqdc.datasets import MACEOFF
+    dataset = MACEOFF()
+    ```
+
+    Species:
+        [H, C, N, O, F, P, S, Cl, Br, I]
+
+    References:
+        https://arxiv.org/pdf/2312.15211\n
+        https://doi.org/10.17863/CAM.107498
+    """
+
+    __name__ = "maceoff"
+
+    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
+    __force_mask__ = [True]
+    __energy_unit__ = "ev"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
+
+    energy_target_names = ["dft_total_energy"]
+    force_target_names = ["dft_total_gradient"]
+
+    __links__ = {
+        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
+        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
+    }
+
+    def read_raw_entries(self):
+        """
+        Parse the xyz file matching every archive listed in ``__links__``.
+
+        The xyz file name is the archive name up to its first dot, and the
+        split label is the part of that name before the first underscore.
+
+        Returns:
+            list with one dictionary per structure, as produced by
+            ``build_data_object``.
+        """
+        records = []
+        for archive_name in self.__links__:
+            stem = archive_name.split(".")[0]
+            xyz_file = p_join(self.root, f"{stem}.xyz")
+            split_label = stem.split("_")[0]
+            frames = parse_mace_xyz(xyz_file)
+            to_record = partial(build_data_object, split=split_label)
+            records.extend(dm.utils.parallelized(to_record, frames))
+        return records
+
+    def __getitem__(self, idx):
+        """
+        Return the base-class item for ``idx`` with an extra ``split`` attribute.
+        """
+        item = super().__getitem__(idx)
+        # attach the stored split value for this index to the returned item
+        split_value = self._convert_array(self.data["split"][idx])
+        item.__setattr__("split", split_value)
+        return item
diff --git a/openqdc/methods/enums.py b/openqdc/methods/enums.py
@@ -224,7 +224,6 @@ def atom_energies_dict(self):
         raise NotImplementedError()
 
 
-@unique
 class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1
     B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP
     B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ
@@ -501,7 +500,6 @@ def atom_energies_dict(self):
         return energies
 
 
-@unique
 class InteractionMethod(QmMethod):
     CCSD_T_NN = Functional.CCSDT, BasisSet.NN
     CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS