Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MACEOFF dataset + extra PotentialMethods #109

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def get_project_root():
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"MACEOFF": "openqdc.datasets.potential.maceoff",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
Expand Down Expand Up @@ -122,6 +123,7 @@ def __dir__():
from .datasets.potential.gdml import GDML
from .datasets.potential.geom import GEOM
from .datasets.potential.iso_17 import ISO17
from .datasets.potential.maceoff import MACEOFF
from .datasets.potential.md22 import MD22
from .datasets.potential.molecule3d import Molecule3D
from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2
Expand Down
2 changes: 2 additions & 0 deletions openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .gdml import GDML
from .geom import GEOM
from .iso_17 import ISO17
from .maceoff import MACEOFF
from .md22 import MD22
from .molecule3d import Molecule3D
from .multixcqm9 import MultixcQM9, MultixcQM9_V2
Expand Down Expand Up @@ -38,6 +39,7 @@
"GEOM": GEOM,
"ISO17": ISO17,
"Molecule3D": Molecule3D,
"MACEOFF": MACEOFF,
"NablaDFT": NablaDFT,
"OrbnetDenali": OrbnetDenali,
"PCQM_B3LYP": PCQM_B3LYP,
Expand Down
133 changes: 133 additions & 0 deletions openqdc/datasets/potential/maceoff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
from functools import partial
from os.path import join as p_join

import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOMIC_NUMBERS
from openqdc.utils.molecule import get_atomic_number_and_charge


def parse_mace_xyz(xyzpath):
    """Lazily parse a MACE-OFF extended-XYZ file, yielding one tuple per frame.

    Each frame is read as three parts:

    1. a line holding the number of atoms,
    2. a property line that must contain ``energy=<float>`` and
       ``config_type=<label>`` and may contain ``smiles=<string>``,
    3. one line per atom with exactly ten whitespace-separated columns:
       element symbol, x, y, z, fx, fy, fz and three trailing columns
       that are ignored.

    Blank lines found where an atom count is expected (between frames or
    at the end of the file) are skipped instead of aborting the parse.
    A frame that is cut short by the end of the file is not yielded.

    Parameters:
        xyzpath: Path of the ``.xyz`` file to read.

    Yields:
        Tuples ``(energy, numbers, positions, forces, smiles, subset)``
        where ``energy`` is a float, ``numbers`` is the list of values
        looked up in ``ATOMIC_NUMBERS`` for each element symbol,
        ``positions`` and ``forces`` are lists of ``[x, y, z]`` floats,
        and ``smiles`` / ``subset`` are strings with double quotes
        removed (``smiles`` is ``""`` when the property line has none).

    Raises:
        ValueError: If an atom-count line is not an integer, a numeric
            field cannot be converted, or an atom line does not have
            exactly ten columns.
        AttributeError: If a property line lacks ``energy=`` or
            ``config_type=``.
    """
    energy_re = re.compile(r"energy=(\S+)")
    smiles_re = re.compile(r"smiles=(\S+)")
    subset_re = re.compile(r"config_type=(\S+)")
    with open(xyzpath, "r") as f:
        n_atoms = None  # None means "waiting for the next atom-count line"
        counter = 0  # 1 = property line expected, >= 2 = reading atom lines
        positions, numbers, forces = [], [], []
        energy = None
        smiles = subset = ""
        for line in f:
            if n_atoms is None:
                # Tolerate empty separator / trailing lines between frames.
                if not line.strip():
                    continue
                n_atoms = int(line)
                # Fresh containers so previously yielded frames are not mutated.
                positions, numbers, forces = [], [], []
                energy = None
                counter = 1
                continue
            if counter == 1:
                energy = float(energy_re.search(line).group(1))
                # Values may be wrapped in double quotes; strip them once here
                # rather than on every atom line.
                subset = subset_re.search(line).group(1).replace('"', "")
                smiles_match = smiles_re.search(line)
                # water and qmugs subsets do not have smiles
                smiles = smiles_match.group(1).replace('"', "") if smiles_match else ""
                counter = 2
                continue
            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
            numbers.append(ATOMIC_NUMBERS[el])
            positions.append([float(x), float(y), float(z)])
            forces.append([float(fx), float(fy), float(fz)])
            counter += 1
            # count line + property line + n_atoms atom lines have been consumed
            if counter == n_atoms + 2:
                n_atoms = None
                yield energy, numbers, positions, forces, smiles, subset


def build_data_object(data, split):
    """Convert one parsed frame into a dictionary of numpy arrays.

    Parameters:
        data: Tuple ``(energy, numbers, positions, forces, smiles, subset)``
            as yielded by ``parse_mace_xyz``.
        split: Label stored under the ``"split"`` key.

    Returns:
        dict with the keys ``name``, ``subset``, ``energies``, ``forces``,
        ``atomic_inputs``, ``n_atoms`` and ``split``. ``atomic_inputs`` is a
        float32 array reshaped to five columns: the two columns of ``x``
        followed by the three position coordinates.
    """
    energy, numbers, positions, forces, smiles, subset = data

    if smiles != "":
        # Per-atom columns are derived from the molecule built from the SMILES.
        x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
    else:
        # No SMILES available: use the parsed atomic numbers and a zero
        # second column (presumably the charge column -- confirm against
        # get_atomic_number_and_charge).
        atomic_number_col = np.array(numbers)[:, None]
        zero_col = np.zeros((len(numbers), 1))
        x = np.concatenate((atomic_number_col, zero_col), axis=-1)

    coordinates = np.array(positions)
    atomic_inputs = np.concatenate((x, coordinates), axis=-1, dtype=np.float32).reshape(-1, 5)

    return {
        "name": np.array([smiles]),
        "subset": np.array([subset]),
        "energies": np.array([[energy]], dtype=np.float64),
        "forces": np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
        "atomic_inputs": atomic_inputs,
        "n_atoms": np.array([x.shape[0]], dtype=np.int32),
        "split": np.array([split]),
    }


class MACEOFF(BaseDataset):
    """
    The core of the MACEOFF dataset consists of the Spice V1 dataset.
    95% of the data are used for training and validation under the "train" split,
    and 5% for testing. The dataset uses the Spice level of theory
    ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.
    MACEOFF uses a subset of SPICE that contains the ten chemical elements
    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
    MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular
    non-bonded interactions, the MACEOFF dataset contains larger 50–90 atom molecules
    randomly selected from the QMugs dataset.
    MACEOFF also contains a number of water clusters carved out of molecular dynamics
    simulations of liquid water, with sizes of up to 50 water molecules, and part of the
    COMP6 tripeptide geometry dataset.

    Usage:
    ```python
    from openqdc.datasets import MACEOFF
    dataset = MACEOFF()
    ```

    Species:
        [H, C, N, O, F, P, S, Cl, Br, I]

    References:
        https://arxiv.org/pdf/2312.15211\n
        https://doi.org/10.17863/CAM.107498
    """

    __name__ = "maceoff"

    # Single level of theory, with forces available for it.
    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    __force_mask__ = [True]

    # Units of the raw data.
    __energy_unit__ = "ev"
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"

    energy_target_names = ["dft_total_energy"]
    force_target_names = ["dft_total_gradient"]

    # Archive file name -> download URL.
    __links__ = {
        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    }

    def read_raw_entries(self):
        """Parse every ``.xyz`` file named in ``__links__`` under ``self.root``.

        The text before the first ``.`` of each archive name is used as the
        ``.xyz`` file stem, and the text before the first ``_`` of that stem
        ("train" / "test") is recorded as the split of each entry.

        Returns:
            list of the dictionaries produced by ``build_data_object``.
        """
        records = []
        for archive_name in self.__links__:
            stem = archive_name.split(".")[0]
            split_label = stem.split("_")[0]
            frames = parse_mace_xyz(p_join(self.root, f"{stem}.xyz"))
            to_entry = partial(build_data_object, split=split_label)
            records.extend(dm.utils.parallelized(to_entry, frames))
        return records

    def __getitem__(self, idx):
        """Return the parent class's item for ``idx`` with a ``split`` attribute added."""
        item = super().__getitem__(idx)
        split_value = self._convert_array(self.data["split"][idx])
        setattr(item, "split", split_value)
        return item
2 changes: 0 additions & 2 deletions openqdc/methods/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,6 @@ def atom_energies_dict(self):
raise NotImplementedError()


@unique
class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1
B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP
B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ
Expand Down Expand Up @@ -501,7 +500,6 @@ def atom_energies_dict(self):
return energies


@unique
class InteractionMethod(QmMethod):
CCSD_T_NN = Functional.CCSDT, BasisSet.NN
CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS
Expand Down
Loading