From 09c75a7e19275ca66ab2dc6f84023c5de18ad81f Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 29 Jul 2024 09:47:40 -0600 Subject: [PATCH 1/8] MaceOff dataset --- openqdc/__init__.py | 2 + openqdc/datasets/potential/__init__.py | 2 + openqdc/datasets/potential/maceoff.py | 95 ++++++++++++++++++++++++++ openqdc/methods/enums.py | 2 - 4 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 openqdc/datasets/potential/maceoff.py diff --git a/openqdc/__init__.py b/openqdc/__init__.py index c6be72d4..051aeefb 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -31,6 +31,7 @@ def get_project_root(): "COMP6": "openqdc.datasets.potential.comp6", "GDML": "openqdc.datasets.potential.gdml", "Molecule3D": "openqdc.datasets.potential.molecule3d", + "MACEOFF": "openqdc.datasets.potential.maceoff", "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali", "SN2RXN": "openqdc.datasets.potential.sn2_rxn", "QM7X": "openqdc.datasets.potential.qm7x", @@ -122,6 +123,7 @@ def __dir__(): from .datasets.potential.gdml import GDML from .datasets.potential.geom import GEOM from .datasets.potential.iso_17 import ISO17 + from .datasets.potential.maceoff import MACEOFF from .datasets.potential.md22 import MD22 from .datasets.potential.molecule3d import Molecule3D from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2 diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index 35721dde..e0207b7f 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -5,6 +5,7 @@ from .gdml import GDML from .geom import GEOM from .iso_17 import ISO17 +from .maceoff import MACEOFF from .md22 import MD22 from .molecule3d import Molecule3D from .multixcqm9 import MultixcQM9, MultixcQM9_V2 @@ -38,6 +39,7 @@ "GEOM": GEOM, "ISO17": ISO17, "Molecule3D": Molecule3D, + "MACEOFF": MACEOFF, "NablaDFT": NablaDFT, "OrbnetDenali": OrbnetDenali, "PCQM_B3LYP": PCQM_B3LYP, diff --git 
a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py new file mode 100644 index 00000000..c8fb06b0 --- /dev/null +++ b/openqdc/datasets/potential/maceoff.py @@ -0,0 +1,95 @@ +import re +from os.path import join as p_join + +import datamol as dm +import numpy as np + +from openqdc.datasets.base import BaseDataset +from openqdc.methods import PotentialMethod +from openqdc.utils.constants import ATOMIC_NUMBERS +from openqdc.utils.molecule import get_atomic_number_and_charge + + +def parse_mace_xyz(xyzpath): + energy_re = re.compile(r"energy=(\S+)") + smiles_re = re.compile(r"smiles=(\S+)") + subset_re = re.compile(r"config_type=(\S+)") + with open(xyzpath, "r") as f: + n_atoms = None + counter = 0 + positions = [] + numbers = [] + forces = [] + energy = None + for line in f: + if n_atoms is None: + n_atoms = int(line) + positions = [] + numbers = [] + forces = [] + energy = None + counter = 1 + continue + if counter == 1: + props = line + energy = float(energy_re.search(props).group(1)) + subset = subset_re.search(props).group(1) + try: + smiles = smiles_re.search(props).group(1) + except AttributeError: # water and qmugs subsets do not have smiles + smiles = "" + counter = 2 + continue + el, x, y, z, fx, fy, fz, _, _, _ = line.split() + numbers.append(ATOMIC_NUMBERS[el]) + positions.append([float(x), float(y), float(z)]) + forces.append([float(fx), float(fy), float(fz)]) + smiles = smiles.replace('"', "") + subset = subset.replace('"', "") + counter += 1 + if counter == n_atoms + 2: + n_atoms = None + yield energy, numbers, positions, forces, smiles, subset + + +def build_data_object(data): + energy, numbers, positions, forces, smiles, subset = data + if smiles == "": + x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1) + else: + x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True)) + res = dict( + name=np.array([smiles]), + subset=np.array([subset]), + 
energies=np.array([[energy]], dtype=np.float64), + forces=np.array(forces, dtype=np.float32).reshape( + -1, 3, 1 + ), # forces -ve of energy gradient but the -1.0 is done in the convert_forces method + atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5), + n_atoms=np.array([x.shape[0]], dtype=np.int32), + ) + return res + + +class MACEOFF(BaseDataset): + __name__ = "maceoff" + + __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD] + __force_mask__ = [True] + __energy_unit__ = "ev" + __distance_unit__ = "ang" + __forces_unit__ = "ev/ang" + + energy_target_names = ["dft_total_energy"] + force_target_names = ["dft_total_gradient"] + + __links__ = { + "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content" # noqa: E501 + } + + def read_raw_entries(self): + filepath = p_join(self.root, "train_large_neut_no_bad_clean.xyz") + xyzpath = p_join(self.root, filepath) + structure_iterator = parse_mace_xyz(xyzpath) + res = dm.utils.parallelized(build_data_object, structure_iterator) + return res diff --git a/openqdc/methods/enums.py b/openqdc/methods/enums.py index 9dff4a15..a4b958ba 100644 --- a/openqdc/methods/enums.py +++ b/openqdc/methods/enums.py @@ -224,7 +224,6 @@ def atom_energies_dict(self): raise NotImplementedError() -@unique class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ @@ -501,7 +500,6 @@ def atom_energies_dict(self): return energies -@unique class InteractionMethod(QmMethod): CCSD_T_NN = Functional.CCSDT, BasisSet.NN CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS From b16a410c9b51f0d1ec8cc42f5b93d336314e59a1 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Fri, 2 Aug 2024 07:00:53 -0600 Subject: [PATCH 2/8] initial scaffolding for BPA dataset --- openqdc/__init__.py | 2 ++ 
openqdc/datasets/potential/__init__.py | 2 ++ openqdc/datasets/potential/bpa.py | 26 ++++++++++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 openqdc/datasets/potential/bpa.py diff --git a/openqdc/__init__.py b/openqdc/__init__.py index c6be72d4..63d5558b 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -21,6 +21,7 @@ def get_project_root(): "ANI1CCX_V2": "openqdc.datasets.potential.ani", "ANI1X": "openqdc.datasets.potential.ani", "ANI2X": "openqdc.datasets.potential.ani", + "BPA": "openqdc.datasets.potential.bpa", "Spice": "openqdc.datasets.potential.spice", "SpiceV2": "openqdc.datasets.potential.spice", "SpiceVL2": "openqdc.datasets.potential.spice", @@ -117,6 +118,7 @@ def __dir__(): # POTENTIAL from .datasets.potential.alchemy import Alchemy from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X + from .datasets.potential.bpa import BPA from .datasets.potential.comp6 import COMP6 from .datasets.potential.dummy import Dummy, PredefinedDataset from .datasets.potential.gdml import GDML diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index 35721dde..86671792 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -1,5 +1,6 @@ from .alchemy import Alchemy from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X +from .bpa import BPA from .comp6 import COMP6 from .dummy import Dummy, PredefinedDataset from .gdml import GDML @@ -33,6 +34,7 @@ "ANI1CCX_V2": ANI1CCX_V2, "ANI1X": ANI1X, "ANI2X": ANI2X, + "BPA": BPA, "COMP6": COMP6, "GDML": GDML, "GEOM": GEOM, diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py new file mode 100644 index 00000000..a6aa5df1 --- /dev/null +++ b/openqdc/datasets/potential/bpa.py @@ -0,0 +1,26 @@ +from openqdc import BaseDataset +from openqdc.methods import PotentialMethod + + +class BPA(BaseDataset): + """ + _summary_ + + + Usage: + ```python + from openqdc.datasets 
import BPA + dataset = BPA() + ``` + + + References: + https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647 + """ + + __name__ = "BPA" + __energy_unit__ = "ev" + __forces_unit__ = "eV/ang" + __distance_unit__ = "ang" + __energy_methods__ = ([PotentialMethod.WB97X_6_31G_D],) + __links__ = {"BPA.zip": "https://pubs.acs.org/doi/suppl/10.1021/acs.jctc.1c00647/suppl_file/ct1c00647_si_002.zip"} From b613fb018c6b1e0d98dbf03bedeced8a8da7f67a Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Fri, 2 Aug 2024 10:33:04 -0600 Subject: [PATCH 3/8] fix download and parsing --- openqdc/datasets/potential/bpa.py | 41 ++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py index a6aa5df1..464c2759 100644 --- a/openqdc/datasets/potential/bpa.py +++ b/openqdc/datasets/potential/bpa.py @@ -1,7 +1,23 @@ +from typing import Any, Dict, List + +import numpy as np +from ase.atoms import Atoms + from openqdc import BaseDataset from openqdc.methods import PotentialMethod +def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]: + return dict( + name=np.array([str(atoms.symbols)]), + subset=subset, + energies=np.array([atoms.get_potential_energy()], dtype=np.float32), + forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32), + atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32), + n_atoms=np.array([len(atoms)], dtype=np.int32), + ) + + class BPA(BaseDataset): """ _summary_ @@ -20,7 +36,26 @@ class BPA(BaseDataset): __name__ = "BPA" __energy_unit__ = "ev" - __forces_unit__ = "eV/ang" + __forces_unit__ = "ev/ang" __distance_unit__ = "ang" - __energy_methods__ = ([PotentialMethod.WB97X_6_31G_D],) - __links__ = {"BPA.zip": "https://pubs.acs.org/doi/suppl/10.1021/acs.jctc.1c00647/suppl_file/ct1c00647_si_002.zip"} + __force_mask__ = [True] + __energy_methods__ = (PotentialMethod.WB97X_6_31G_D,) + __links__ = {"BPA.zip": 
"https://figshare.com/ndownloader/files/31325990"} + + def read_raw_entries(self) -> List[Dict]: + import os.path as osp + from glob import glob + + from ase.io import iread + + files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz")) + files = [f for f in files if "iso_atoms.xyz" not in f] + all_records = [] + + for file in files: + subset = np.array([osp.basename(file).split(".")[0]]) + + for atoms in iread(file, format="extxyz"): + all_records.append(read_bpa_record(subset, atoms)) + + return all_records From d63cb552b1c12262effd6fc7e471efae52c6a5b6 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 2 Aug 2024 12:03:31 -0600 Subject: [PATCH 4/8] Splits in MACEOFF --- openqdc/datasets/potential/maceoff.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py index c8fb06b0..2552970e 100644 --- a/openqdc/datasets/potential/maceoff.py +++ b/openqdc/datasets/potential/maceoff.py @@ -1,4 +1,5 @@ import re +from functools import partial from os.path import join as p_join import datamol as dm @@ -52,7 +53,7 @@ def parse_mace_xyz(xyzpath): yield energy, numbers, positions, forces, smiles, subset -def build_data_object(data): +def build_data_object(data, split): energy, numbers, positions, forces, smiles, subset = data if smiles == "": x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1) @@ -67,6 +68,7 @@ def build_data_object(data): ), # forces -ve of energy gradient but the -1.0 is done in the convert_forces method atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5), n_atoms=np.array([x.shape[0]], dtype=np.int32), + split=np.array([split]), ) return res @@ -84,12 +86,22 @@ class MACEOFF(BaseDataset): force_target_names = ["dft_total_gradient"] __links__ = { - "train_large_neut_no_bad_clean.tar.gz": 
"https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content" # noqa: E501 + "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content", # noqa: E501 + "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content", # noqa: E501 } def read_raw_entries(self): - filepath = p_join(self.root, "train_large_neut_no_bad_clean.xyz") - xyzpath = p_join(self.root, filepath) - structure_iterator = parse_mace_xyz(xyzpath) - res = dm.utils.parallelized(build_data_object, structure_iterator) - return res + entries = [] + for filename in self.__links__: + filename = filename.split(".")[0] + xyzpath = p_join(self.root, f"{filename}.xyz") + split = filename.split("_")[0] + structure_iterator = parse_mace_xyz(xyzpath) + func = partial(build_data_object, split=split) + entries.extend(dm.utils.parallelized(func, structure_iterator)) + return entries + + def __getitem__(self, idx): + data = super().__getitem__(idx) + data.__setattr__("split", self._convert_array(self.data["split"][idx])) + return data From 737b81e14f778214f905f4876bbf8c7c6b32c875 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 2 Aug 2024 12:51:14 -0600 Subject: [PATCH 5/8] WIP --- openqdc/datasets/potential/bpa.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py index 464c2759..78a00ad5 100644 --- a/openqdc/datasets/potential/bpa.py +++ b/openqdc/datasets/potential/bpa.py @@ -11,10 +11,11 @@ def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]: return dict( name=np.array([str(atoms.symbols)]), subset=subset, - energies=np.array([atoms.get_potential_energy()], dtype=np.float32), + energies=np.array([atoms.get_potential_energy()], dtype=np.float64), forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32), 
atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32), n_atoms=np.array([len(atoms)], dtype=np.int32), + split = np.array([subset.item().split("_")[0]]) ) @@ -59,3 +60,8 @@ def read_raw_entries(self) -> List[Dict]: all_records.append(read_bpa_record(subset, atoms)) return all_records + + def __getitem__(self, idx): + data = super().__getitem__(idx) + data.__setattr__("split", self._convert_array(self.data["split"][idx])) + return data From a1061a8f75264164d2b54a0c277da01eb92e209b Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 2 Aug 2024 13:02:48 -0600 Subject: [PATCH 6/8] MACEOFF docstrings --- openqdc/datasets/potential/maceoff.py | 32 ++++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py index 2552970e..ed184eb6 100644 --- a/openqdc/datasets/potential/maceoff.py +++ b/openqdc/datasets/potential/maceoff.py @@ -63,9 +63,7 @@ def build_data_object(data, split): name=np.array([smiles]), subset=np.array([subset]), energies=np.array([[energy]], dtype=np.float64), - forces=np.array(forces, dtype=np.float32).reshape( - -1, 3, 1 - ), # forces -ve of energy gradient but the -1.0 is done in the convert_forces method + forces=np.array(forces, dtype=np.float32).reshape(-1, 3, 1), atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5), n_atoms=np.array([x.shape[0]], dtype=np.int32), split=np.array([split]), @@ -74,6 +72,34 @@ def build_data_object(data, split): class MACEOFF(BaseDataset): + """ + The core of the MACEOFF dataset consists of the Spice V1 dataset. + 95% of the data are used for training and validation under the "train" split, + and 5% for testing. The dataset uses the Spice level of theory + ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software. 
+ MACEOFF uses a subset of SPICE that contains the ten chemical elements + H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. + MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular + non-bonded interactions, MACEOFF dataset contains larger 50–90 atom molecules + randomly selected from the QMugs dataset. + MACEOFF contains a number of water clusters carved out of molecular dynamics simulations + of liquid water, with sizes of up to 50 water molecules and part of the + COMP6 tripeptide geometry dataset. + + Usage: + ```python + from openqdc.datasets import MACEOFF + dataset = MACEOFF() + ``` + + Species: + [H, C, N, O, F, P, S, Cl, Br, I] + + References: + https://arxiv.org/pdf/2312.15211\n + https://doi.org/10.17863/CAM.107498 + """ + __name__ = "maceoff" __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD] From dab04efae3d6c10f91e9279704337a9001193988 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 8 Aug 2024 14:08:19 -0600 Subject: [PATCH 7/8] Correct regex parsing + binary strings dec --- openqdc/datasets/potential/ani.py | 7 ++----- openqdc/datasets/potential/maceoff.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py index aac35635..81f106f9 100644 --- a/openqdc/datasets/potential/ani.py +++ b/openqdc/datasets/potential/ani.py @@ -154,7 +154,7 @@ def convert_forces(self, x): return super().convert_forces(x) * 0.529177249 # correct the Dataset error def __smiles_converter__(self, x): - return x + return "-".join(x.decode("ascii").split("-")[:-1]) class ANI1CCX(ANI1): @@ -195,10 +195,7 @@ class ANI1CCX(ANI1): __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"} def __smiles_converter__(self, x): - """util function to convert string to smiles: useful if the smiles is - encoded in a different format than its display format - """ - return x + return x.decode("ascii") class 
ANI1CCX_V2(ANI1CCX): diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py index ed184eb6..f90a3c4f 100644 --- a/openqdc/datasets/potential/maceoff.py +++ b/openqdc/datasets/potential/maceoff.py @@ -14,7 +14,7 @@ def parse_mace_xyz(xyzpath): energy_re = re.compile(r"energy=(\S+)") smiles_re = re.compile(r"smiles=(\S+)") - subset_re = re.compile(r"config_type=(\S+)") + subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy") with open(xyzpath, "r") as f: n_atoms = None counter = 0 From 3aa2796feb146f9e249a18ca09b49e041c9938cc Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 30 Aug 2024 09:38:30 -0600 Subject: [PATCH 8/8] BPA docstrings --- openqdc/datasets/potential/bpa.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py index 78a00ad5..16817105 100644 --- a/openqdc/datasets/potential/bpa.py +++ b/openqdc/datasets/potential/bpa.py @@ -15,14 +15,21 @@ def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]: forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32), atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32), n_atoms=np.array([len(atoms)], dtype=np.int32), - split = np.array([subset.item().split("_")[0]]) + split=np.array([subset.item().split("_")[0]]), ) class BPA(BaseDataset): """ - _summary_ - + BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike + molecule, 3-(benzyloxy)pyridin-2-amine. This dataset features + a complex dihedral potential energy surface with many local minima, + which can be challenging to approximate using classical or ML force fields. + The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to + perturb them toward lower potential energies. 
Furthermore, long 25 ps MD simulations were performed at + three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. + The final configurations were re-evaluated using ORCA at the DFT level of + theory using the ωB97X exchange-correlation functional and the 6-31G(d) basis set. Usage: ```python @@ -40,7 +47,7 @@ class BPA(BaseDataset): __forces_unit__ = "ev/ang" __distance_unit__ = "ang" __force_mask__ = [True] - __energy_methods__ = (PotentialMethod.WB97X_6_31G_D,) + __energy_methods__ = [PotentialMethod.WB97X_6_31G_D] __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"} def read_raw_entries(self) -> List[Dict]: @@ -60,7 +67,7 @@ def read_raw_entries(self) -> List[Dict]: all_records.append(read_bpa_record(subset, atoms)) return all_records - + def __getitem__(self, idx): data = super().__getitem__(idx) data.__setattr__("split", self._convert_array(self.data["split"][idx]))