diff --git a/main/404.html b/main/404.html index cbdf37d..f4fcbf0 100644 --- a/main/404.html +++ b/main/404.html @@ -931,6 +931,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -994,6 +1015,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/basedataset.html b/main/API/basedataset.html index 178b9f5..64cc3df 100644 --- a/main/API/basedataset.html +++ b/main/API/basedataset.html @@ -1200,6 +1200,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1263,6 +1284,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/3bpa.html b/main/API/datasets/3bpa.html new file mode 100644 index 0000000..db660a1 --- /dev/null +++ b/main/API/datasets/3bpa.html @@ -0,0 +1,2271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + 3BPA - OpenQDC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + Skip to content + + +
    +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    + + + + + + + +

    3BPA

    + +
    + + + + +
    + + + +
    + + + + + + + + +
    + + + +

    + BPA + + +

    + + +
    +

    + Bases: BaseDataset

    + + +

    The BPA (or 3BPA) dataset consists of a flexible druglike +molecule, 3-(benzyloxy)pyridin-2-amine. This dataset features +a complex dihedral potential energy surface with many local minima, +which can be challenging to approximate using classical or ML force fields. +The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to +perturb them toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at +three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. +The final configurations were re-evaluated using ORCA at the DFT level of +theory using the ωB97X exchange-correlation functional and the 6-31G(d) basis set.

    +

    Usage: +

    from openqdc.datasets import BPA
    +dataset = BPA()
    +

    + + +
    + References +

    https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647

    +
    +
    + Source code in openqdc/datasets/potential/bpa.py +
    22
    +23
    +24
    +25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    +39
    +40
    +41
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    +58
    +59
    +60
    +61
    +62
    +63
    +64
    +65
    +66
    +67
    +68
    +69
    +70
    +71
    +72
    +73
    +74
    class BPA(BaseDataset):
    +    """
    +    The BPA (or 3BPA) dataset consists of a flexible druglike
    +    molecule, 3-(benzyloxy)pyridin-2-amine. This dataset features
    +    a complex dihedral potential energy surface with many local minima,
    +    which can be challenging to approximate using classical or ML force fields.
    +    The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to
    +    perturb them toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at
    +    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.
    +    The final configurations were re-evaluated using ORCA at the DFT level of
    +    theory using the ωB97X exchange-correlation functional and the 6-31G(d) basis set.
    +
    +    Usage:
    +    ```python
    +    from openqdc.datasets import BPA
    +    dataset = BPA()
    +    ```
    +
    +
    +    References:
    +        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647
    +    """
    +
    +    __name__ = "BPA"
    +    __energy_unit__ = "ev"
    +    __forces_unit__ = "ev/ang"
    +    __distance_unit__ = "ang"
    +    __force_mask__ = [True]
    +    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    +    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}
    +
    +    def read_raw_entries(self) -> List[Dict]:
    +        import os.path as osp
    +        from glob import glob
    +
    +        from ase.io import iread
    +
    +        files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz"))
    +        files = [f for f in files if "iso_atoms.xyz" not in f]
    +        all_records = []
    +
    +        for file in files:
    +            subset = np.array([osp.basename(file).split(".")[0]])
    +
    +            for atoms in iread(file, format="extxyz"):
    +                all_records.append(read_bpa_record(subset, atoms))
    +
    +        return all_records
    +
    +    def __getitem__(self, idx):
    +        data = super().__getitem__(idx)
    +        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
    +        return data
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + + + + +
    + +
    + +
    + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/API/datasets/alchemy.html b/main/API/datasets/alchemy.html index afc962c..7f33abd 100644 --- a/main/API/datasets/alchemy.html +++ b/main/API/datasets/alchemy.html @@ -13,7 +13,7 @@ - + @@ -951,6 +951,27 @@ + + +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + @@ -1070,6 +1091,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/ani.html b/main/API/datasets/ani.html index 3460e7c..861c30f 100644 --- a/main/API/datasets/ani.html +++ b/main/API/datasets/ani.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1106,6 +1127,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/comp6.html b/main/API/datasets/comp6.html index 4810f4f..12048c5 100644 --- a/main/API/datasets/comp6.html +++ b/main/API/datasets/comp6.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/des.html b/main/API/datasets/des.html index 853ea36..2a5300d 100644 --- a/main/API/datasets/des.html +++ b/main/API/datasets/des.html @@ -951,6 +951,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/gdml.html b/main/API/datasets/gdml.html index 58e8232..f6a0b33 100644 --- a/main/API/datasets/gdml.html +++ b/main/API/datasets/gdml.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/geom.html b/main/API/datasets/geom.html index 6610a54..dd7b324 100644 --- a/main/API/datasets/geom.html +++ b/main/API/datasets/geom.html @@ -13,7 +13,7 @@ - + @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ + + +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + diff --git a/main/API/datasets/iso_17.html b/main/API/datasets/iso_17.html index 8a39159..7e4e37f 100644 --- a/main/API/datasets/iso_17.html +++ b/main/API/datasets/iso_17.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/l7.html b/main/API/datasets/l7.html index a1c0dcd..6c2a92c 100644 --- a/main/API/datasets/l7.html +++ b/main/API/datasets/l7.html @@ -951,6 +951,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/maceoff.html b/main/API/datasets/maceoff.html new file mode 100644 index 0000000..322920d --- /dev/null +++ b/main/API/datasets/maceoff.html @@ -0,0 +1,2294 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + MaceOFF - OpenQDC + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + Skip to content + + +
    +
    + +
    + + + + + + +
    + + +
    + +
    + + + + + + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    + + + + + + + +

    MaceOFF

    + +
    + + + + +
    + + + +
    + + + + + + + + +
    + + + +

    + MACEOFF + + +

    + + +
    +

    + Bases: BaseDataset

    + + +

    The core of the MACEOFF dataset consists of the SPICE V1 dataset. +95% of the data are used for training and validation under the "train" split, +and 5% for testing. The dataset uses the SPICE level of theory, +ωB97M-D3(BJ)/def2-TZVPPD, as implemented in the PSI4 software. +MACEOFF uses a subset of SPICE that contains the ten chemical elements +H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. +MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular +non-bonded interactions, the MACEOFF dataset contains larger 50–90 atom molecules +randomly selected from the QMugs dataset. +MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations +of liquid water, with sizes of up to 50 water molecules, as well as part of the +COMP6 tripeptide geometry dataset.

    +

    Usage: +

    from openqdc.datasets import MACEOFF
    +dataset = MACEOFF()
    +

    + + +
    + Species +

    [H, C, N, O, F, P, S, Cl, Br, I]

    +
    + +
    + References +

    https://arxiv.org/pdf/2312.15211

    +

    https://doi.org/10.17863/CAM.107498

    +
    +
    + Source code in openqdc/datasets/potential/maceoff.py +
     74
    + 75
    + 76
    + 77
    + 78
    + 79
    + 80
    + 81
    + 82
    + 83
    + 84
    + 85
    + 86
    + 87
    + 88
    + 89
    + 90
    + 91
    + 92
    + 93
    + 94
    + 95
    + 96
    + 97
    + 98
    + 99
    +100
    +101
    +102
    +103
    +104
    +105
    +106
    +107
    +108
    +109
    +110
    +111
    +112
    +113
    +114
    +115
    +116
    +117
    +118
    +119
    +120
    +121
    +122
    +123
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    +131
    +132
    +133
    class MACEOFF(BaseDataset):
    +    """
    +    The core of the MACEOFF dataset consists of the SPICE V1 dataset.
    +    95% of the data are used for training and validation under the "train" split,
    +    and 5% for testing. The dataset uses the SPICE level of theory,
    +    ωB97M-D3(BJ)/def2-TZVPPD, as implemented in the PSI4 software.
    +    MACEOFF uses a subset of SPICE that contains the ten chemical elements
    +    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
    +    MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular
    +    non-bonded interactions, the MACEOFF dataset contains larger 50–90 atom molecules
    +    randomly selected from the QMugs dataset.
    +    MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations
    +    of liquid water, with sizes of up to 50 water molecules, as well as part of the
    +    COMP6 tripeptide geometry dataset.
    +
    +    Usage:
    +    ```python
    +    from openqdc.datasets import MACEOFF
    +    dataset = MACEOFF()
    +    ```
    +
    +    Species:
    +        [H, C, N, O, F, P, S, Cl, Br, I]
    +
    +    References:
    +        https://arxiv.org/pdf/2312.15211\n
    +        https://doi.org/10.17863/CAM.107498
    +    """
    +
    +    __name__ = "maceoff"
    +
    +    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    +    __force_mask__ = [True]
    +    __energy_unit__ = "ev"
    +    __distance_unit__ = "ang"
    +    __forces_unit__ = "ev/ang"
    +
    +    energy_target_names = ["dft_total_energy"]
    +    force_target_names = ["dft_total_gradient"]
    +
    +    __links__ = {
    +        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
    +        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    +    }
    +
    +    def read_raw_entries(self):
    +        entries = []
    +        for filename in self.__links__:
    +            filename = filename.split(".")[0]
    +            xyzpath = p_join(self.root, f"{filename}.xyz")
    +            split = filename.split("_")[0]
    +            structure_iterator = parse_mace_xyz(xyzpath)
    +            func = partial(build_data_object, split=split)
    +            entries.extend(dm.utils.parallelized(func, structure_iterator))
    +        return entries
    +
    +    def __getitem__(self, idx):
    +        data = super().__getitem__(idx)
    +        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
    +        return data
    +
    +
    + + + +
    + + + + + + + + + + + +
    + +
    + +
    + + + + +
    + +
    + +
    + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/API/datasets/md22.html b/main/API/datasets/md22.html index 65426e6..108327e 100644 --- a/main/API/datasets/md22.html +++ b/main/API/datasets/md22.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/metcalf.html b/main/API/datasets/metcalf.html index f4c4092..2a4b562 100644 --- a/main/API/datasets/metcalf.html +++ b/main/API/datasets/metcalf.html @@ -951,6 +951,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/molecule3d.html b/main/API/datasets/molecule3d.html index 3a484e9..51cab42 100644 --- a/main/API/datasets/molecule3d.html +++ b/main/API/datasets/molecule3d.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/multixcqm9.html b/main/API/datasets/multixcqm9.html index b6f022c..c086d71 100644 --- a/main/API/datasets/multixcqm9.html +++ b/main/API/datasets/multixcqm9.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/nabladft.html b/main/API/datasets/nabladft.html index d20fcc6..757a6be 100644 --- a/main/API/datasets/nabladft.html +++ b/main/API/datasets/nabladft.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/orbnet_denali.html b/main/API/datasets/orbnet_denali.html index a7cfb09..8e6755f 100644 --- a/main/API/datasets/orbnet_denali.html +++ b/main/API/datasets/orbnet_denali.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/pcqm.html b/main/API/datasets/pcqm.html index cb2e83d..6a3212f 100644 --- a/main/API/datasets/pcqm.html +++ b/main/API/datasets/pcqm.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/proteinfragments.html b/main/API/datasets/proteinfragments.html index 56e623e..ab03c1a 100644 --- a/main/API/datasets/proteinfragments.html +++ b/main/API/datasets/proteinfragments.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/qm1b.html b/main/API/datasets/qm1b.html index dcf9d91..e5de521 100644 --- a/main/API/datasets/qm1b.html +++ b/main/API/datasets/qm1b.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/qm7x.html b/main/API/datasets/qm7x.html index 0030918..2ae17ff 100644 --- a/main/API/datasets/qm7x.html +++ b/main/API/datasets/qm7x.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/qmugs.html b/main/API/datasets/qmugs.html index 4321a94..05582a7 100644 --- a/main/API/datasets/qmugs.html +++ b/main/API/datasets/qmugs.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/qmx.html b/main/API/datasets/qmx.html index 95d18da..b8dbd56 100644 --- a/main/API/datasets/qmx.html +++ b/main/API/datasets/qmx.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/revmd17.html b/main/API/datasets/revmd17.html index 09689b4..bcb1c60 100644 --- a/main/API/datasets/revmd17.html +++ b/main/API/datasets/revmd17.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/sn2_rxn.html b/main/API/datasets/sn2_rxn.html index 06ce956..e6508eb 100644 --- a/main/API/datasets/sn2_rxn.html +++ b/main/API/datasets/sn2_rxn.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/solvated_peptides.html b/main/API/datasets/solvated_peptides.html index c6f4bae..cc4a2a3 100644 --- a/main/API/datasets/solvated_peptides.html +++ b/main/API/datasets/solvated_peptides.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/spice.html b/main/API/datasets/spice.html index eefce69..be7c17b 100644 --- a/main/API/datasets/spice.html +++ b/main/API/datasets/spice.html @@ -16,7 +16,7 @@ - + @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1097,6 +1118,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/splinter.html b/main/API/datasets/splinter.html index ab2cdad..0366cde 100644 --- a/main/API/datasets/splinter.html +++ b/main/API/datasets/splinter.html @@ -951,6 +951,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/tmqm.html b/main/API/datasets/tmqm.html index d8e6467..abfaf01 100644 --- a/main/API/datasets/tmqm.html +++ b/main/API/datasets/tmqm.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/transition1x.html b/main/API/datasets/transition1x.html index 45ae301..2408e22 100644 --- a/main/API/datasets/transition1x.html +++ b/main/API/datasets/transition1x.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/vqm24.html b/main/API/datasets/vqm24.html index 5e347ab..eacc43c 100644 --- a/main/API/datasets/vqm24.html +++ b/main/API/datasets/vqm24.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/waterclusters.html b/main/API/datasets/waterclusters.html index 63b6093..808179e 100644 --- a/main/API/datasets/waterclusters.html +++ b/main/API/datasets/waterclusters.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/waterclusters3_30.html b/main/API/datasets/waterclusters3_30.html index b65f18b..08b24c5 100644 --- a/main/API/datasets/waterclusters3_30.html +++ b/main/API/datasets/waterclusters3_30.html @@ -953,6 +953,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1016,6 +1037,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/datasets/x40.html b/main/API/datasets/x40.html index e44c928..cced3e4 100644 --- a/main/API/datasets/x40.html +++ b/main/API/datasets/x40.html @@ -951,6 +951,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1014,6 +1035,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/e0_dispatcher.html b/main/API/e0_dispatcher.html index deffe74..7343e17 100644 --- a/main/API/e0_dispatcher.html +++ b/main/API/e0_dispatcher.html @@ -1182,6 +1182,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1245,6 +1266,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/formats.html b/main/API/formats.html index c5ed64f..f14865d 100644 --- a/main/API/formats.html +++ b/main/API/formats.html @@ -16,7 +16,7 @@ - + @@ -1090,6 +1090,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1153,6 +1174,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/methods.html b/main/API/methods.html index 9e36361..805f508 100644 --- a/main/API/methods.html +++ b/main/API/methods.html @@ -1079,6 +1079,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1142,6 +1163,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/properties.html b/main/API/properties.html index e4ce818..0cb6e9b 100644 --- a/main/API/properties.html +++ b/main/API/properties.html @@ -1060,6 +1060,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1123,6 +1144,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/regressor.html b/main/API/regressor.html index 0b59fce..47c7c01 100644 --- a/main/API/regressor.html +++ b/main/API/regressor.html @@ -1096,6 +1096,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1159,6 +1180,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/statistics.html b/main/API/statistics.html index 8942845..62b8690 100644 --- a/main/API/statistics.html +++ b/main/API/statistics.html @@ -1266,6 +1266,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1329,6 +1350,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/units.html b/main/API/units.html index cfd6d9d..3dccb6e 100644 --- a/main/API/units.html +++ b/main/API/units.html @@ -949,6 +949,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1012,6 +1033,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/API/utils.html b/main/API/utils.html index a6c54bd..585ec61 100644 --- a/main/API/utils.html +++ b/main/API/utils.html @@ -949,6 +949,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1012,6 +1033,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/cli.html b/main/cli.html index f156fd8..2131610 100644 --- a/main/cli.html +++ b/main/cli.html @@ -1052,6 +1052,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1115,6 +1136,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/contribute.html b/main/contribute.html index b985cbc..0e70051 100644 --- a/main/contribute.html +++ b/main/contribute.html @@ -944,6 +944,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1007,6 +1028,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/data_storage.html b/main/data_storage.html index 4d8579d..971ed62 100644 --- a/main/data_storage.html +++ b/main/data_storage.html @@ -998,6 +998,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1061,6 +1082,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/dataset_upload.html b/main/dataset_upload.html index 415e98b..bff6a4a 100644 --- a/main/dataset_upload.html +++ b/main/dataset_upload.html @@ -944,6 +944,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1007,6 +1028,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/datasets.html b/main/datasets.html index e51f169..c71bf43 100644 --- a/main/datasets.html +++ b/main/datasets.html @@ -954,6 +954,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1017,6 +1038,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/index.html b/main/index.html index b19c2ee..609bd4a 100644 --- a/main/index.html +++ b/main/index.html @@ -1018,6 +1018,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1081,6 +1102,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/licensing.html b/main/licensing.html index 14095f7..8fbceee 100644 --- a/main/licensing.html +++ b/main/licensing.html @@ -937,6 +937,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1000,6 +1021,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/normalization_e0s.html b/main/normalization_e0s.html index fc489fb..faaecc8 100644 --- a/main/normalization_e0s.html +++ b/main/normalization_e0s.html @@ -1026,6 +1026,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1089,6 +1110,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/objects.inv b/main/objects.inv index 88e5672..1d90554 100644 Binary files a/main/objects.inv and b/main/objects.inv differ diff --git a/main/search/search_index.json b/main/search/search_index.json index fc51cd9..263320c 100644 --- a/main/search/search_index.json +++ b/main/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"

    OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.

    Visit our website at https://openqdc.io .

    "},{"location":"index.html#installation","title":"Installation","text":"

    Use conda:

    conda install -c conda-forge openqdc\n

    Tips: You can replace conda by mamba.

    Note: We highly recommend using a Conda Python distribution to install OpenQDC. The package is also pip installable if you need it: pip install openqdc.

    "},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"
    from openqdc.datasets import Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n
    "},{"location":"index.html#how-to-cite","title":"How to cite","text":"

    Please cite OpenQDC if you use it in your research: .

    "},{"location":"index.html#compatibilities","title":"Compatibilities","text":"

    OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.

    "},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"

    You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).

    "},{"location":"cli.html#datasets","title":"Datasets","text":"

    Print a formatted table of the available openQDC datasets and some information about them.

    Usage:

    openqdc datasets [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#cache","title":"Cache","text":"

    Get the current local cache path of openQDC

    Usage:

    openqdc cache [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#download","title":"Download","text":"

    Download preprocessed ml-ready datasets from the main openQDC hub.

    Usage:

    openqdc download DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]\n

    Example:

    openqdc download Spice\n
    "},{"location":"cli.html#fetch","title":"Fetch","text":"

    Download the raw datasets files from the main openQDC hub

    Note:

    Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n

    Usage:

    openqdc fetch DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n

    Example:

    openqdc fetch Spice\n
    "},{"location":"cli.html#preprocess","title":"Preprocess","text":"

    Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.

    Usage:

    openqdc preprocess DATASETS... [OPTIONS]\n

    Options:

    --help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]\n

    Example:

    openqdc preprocess Spice QMugs\n
    "},{"location":"cli.html#upload","title":"Upload","text":"

    Upload a preprocessed dataset to the remote storage

    Usage:

    openqdc upload DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n

    Example:

    openqdc upload Spice --overwrite\n
    "},{"location":"cli.html#convert","title":"Convert","text":"

    Convert a preprocessed dataset from a memmap dataset to a zarr dataset.

    Usage:

    openqdc convert DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. [default: no-download]\n
    "},{"location":"contribute.html","title":"Contribute","text":"

    The below documents the development lifecycle of OpenQDC.

    "},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"
    mamba env create -n openqdc -f env.yml\nmamba activate openqdc\npip install -e .\n
    "},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"
    pre-commit install\npre-commit run --all-files\n
    "},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"

    OpenQDC uses Github Actions to:

    "},{"location":"contribute.html#run-tests","title":"Run tests","text":"
    pytest\n
    "},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"

    You can build and serve the documentation locally with:

    # Build and serve the doc\nmike serve\n

    or with

    mkdocs serve\n
    "},{"location":"contribute.html#multi-versionning","title":"Multi-versioning","text":"

    The documentation is built for each push to main and for every git tag using mike. Everything is automated using GitHub Actions. Please refer to the official mike documentation for the details.

    "},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"

    For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:

    The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.

    "},{"location":"data_storage.html#formats","title":"Formats","text":"

    We currently support the following formats:

    1) Zarr : https://zarr.readthedocs.io/en/stable/index.html

    2) Memmap : https://numpy.org/doc/stable/index.html

    "},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"

    Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:

    1. Opening a PR to add a new dataset
    2. Request a new dataset through Google Form
    "},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"

    Implement your dataset in the OpenQDC repository by following the guidelines below:

    "},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":""},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"

    Try to run the openQDC CLI pipeline with the dataset you implemented.

    Run the following command to download the dataset:

    If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.

    Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.

    "},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"

    Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. You can fill out the Google Form here

    As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.

    "},{"location":"datasets.html","title":"Overview of Datasets","text":"

    We provide support for the following publicly available QM Datasets.

    Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"
    Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. 
More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. 
Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. 
Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. 
Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. 
Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. 
In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. 
If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. 
This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. 
upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. 
Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n
    "},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"

    OpenQDC provides support for 250+ QM Methods and provides a way to standardize and categorize the usage of the different levels of theory used for Quantum Mechanics Single Point Calculations to add value and information to the datasets.

    "},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"

    To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. OpenQDC provides the computed isolated atom energies e0 for each QM method.

    "},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"

    We support the normalization of energies through \"physical\" and \"regression\" normalization to conserve the size extensivity of chemical systems. Through this normalization, OpenQDC provides a way to transform the potential energy into the atomization energy by subtracting the isolated atom energies e0, a physically interpretable and extensivity-conserving normalization method. Alternatively, we pre-compute the average contribution of each atom species to the potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.

    "},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"

    e0 energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.

    "},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"

    e0 energies are calculated for each atom in the dataset by fitting a regression model to the potential energy. The e0 energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. The resulting formation energy is centered at 0.

    "},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"

    OpenQDC has been designed to be used with a single import:

    import openqdc as qdc\ndataset = qdc.QM9()\n

    All openQDC functions are available under qdc. Or if you want to directly import a specific dataset:

    from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n

    Or if you prefer handling ase.Atoms objects:

    dataset.get_ase_atoms(0)\n
    "},{"location":"usage.html#iterators","title":"Iterators","text":"

    OpenQDC provides a simple way to get the data as iterators:

    for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n

    or if you want to just iterate over the data:

    for data in dataset:\n    print(data) # dict of arrays\n    break\n
    "},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"

    OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during import openqdc as qdc. In case of trouble you can always disable lazy loading by setting the environment variable OPENQDC_DISABLE_LAZY_LOADING to 1.

    "},{"location":"API/basedataset.html","title":"BaseDataset","text":"

    The BaseDataset defining shared functionality between all datasets.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"BaseDataset","text":"

    Bases: DatasetPropertyMixIn

    Base class for datasets in the openQDC package.

    Source code in openqdc/datasets/base.py
    class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -> None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. 
Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return 
self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -> None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -> None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. 
To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -> List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self) -> AtomEnergies:\n        \"\"\"\n        Property to get the object that dispatched the isolated atom energies of the QM methods.\n\n        Returns:\n            Object wrapping the isolated atom energies of the QM methods.\n        \"\"\"\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) > 2:\n            units = [\"/\".join(units[:2]), 
units[-1]]\n        return ForceTypeConversion(tuple(units))  # < 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n        en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n  
          logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -> Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        
Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. 
Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -> bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n            
copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -> bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. 
Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 < n_samples < 1:\n           
     n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -> Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = get_descriptor(descriptor_name.lower())(\n          
  species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -> Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": np.array([0.0]),\n                        \"std\": np.array([0.0]),\n    
                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        
formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"__force_methods__ property","text":"

    For backward compatibility. To be removed in the future.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.e0s_dispatcher","title":"e0s_dispatcher: AtomEnergies property","text":"

    Property to get the object that dispatches the isolated atom energies of the QM methods.

    Returns:

    Type Description AtomEnergies

    Object wrapping the isolated atom energies of the QM methods.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"energy_methods: List[str] property","text":"

    Return the string version of the energy methods

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 'linear', 'sub_sample': None, 'stride': 1})","text":"

    Parameters:

    Name Type Description Default energy_unit Optional[str]

    Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    None distance_unit Optional[str]

    Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]

    None array_format str

    Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]

    'numpy' energy_type Optional[str]

    Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]

    'formation' overwrite_local_cache bool

    Whether to overwrite the locally cached dataset.

    False cache_dir Optional[str]

    Cache directory location. Defaults to \"~/.cache/openqdc\"

    None recompute_statistics bool

    Whether to recompute the statistics of the dataset.

    False transform Optional[Callable]

    Transformation to apply to the __getitem__ calls

    None regressor_kwargs Dict

    Dictionary of keyword arguments to pass to the regressor. Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]

    {'solver_type': 'linear', 'sub_sample': None, 'stride': 1} Source code in openqdc/datasets/base.py
    def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -> None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    Utility function to convert a string to SMILES: useful if the SMILES is encoded in a different format than its display format

    Source code in openqdc/datasets/base.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"as_iter(atoms=False, energy_method=0)","text":"

    Return the dataset as an iterator.

    Parameters:

    Name Type Description Default atoms bool

    Whether to return the items as ASE atoms object, by default False

    False energy_method int

    Index of the energy method to use

    0

    Returns:

    Type Description Iterable

    Iterator of the dataset

    Source code in openqdc/datasets/base.py
    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)","text":"

    Compute the descriptors for the dataset.

    Parameters:

    Name Type Description Default descriptor_name str

    Name of the descriptor to use. Supported descriptors are [\"soap\"]

    'soap' chemical_species Optional[List[str]]

    List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.

    None n_samples Optional[Union[List[int], int, float]]

    Number of samples to use for the computation, by default None. If None, the whole dataset is used. If a list of integers is provided, the descriptors are computed for each of the specified sample indices.

    None progress bool

    Whether to show a progress bar, by default True.

    True **descriptor_kwargs

    Dictionary of keyword arguments to pass to the descriptor instantiation of the model.

    {}

    Returns:

    Type Description Dict[str, ndarray]

    Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used

    Source code in openqdc/datasets/base.py
    @requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -> Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"collate_list(list_entries)","text":"

    Collate a list of entries into a single dictionary.

    Parameters:

    Name Type Description Default list_entries List[Dict]

    List of dictionaries containing the entries to collate.

    required

    Returns:

    Type Description Dict

    Dictionary containing the collated entries.

    Source code in openqdc/datasets/base.py
    def collate_list(self, list_entries: List[Dict]) -> Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"get_ase_atoms(idx, energy_method=0, ext=True)","text":"

    Get the ASE atoms object for the entry at index idx.

    Parameters:

    Name Type Description Default idx int

    Index of the entry.

    required energy_method int

    Index of the energy method to use

    0 ext bool

    Whether to include additional information

    True

    Returns:

    Type Description Atoms

    ASE atoms object

    Source code in openqdc/datasets/base.py
    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return at\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"get_statistics(return_none=True)","text":"

    Get the converted statistics of the dataset.

    Parameters:

    Name Type Description Default return_none

    Whether to return None if the statistics for the forces are not available, by default True. Otherwise, the statistics for the forces are set to 0.0.

    True

    Returns:

    Type Description Dict

    Dictionary containing the statistics of the dataset

    Source code in openqdc/datasets/base.py
    def get_statistics(self, return_none: bool = True) -> Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"is_cached()","text":"

    Check if the dataset is cached locally.

    Returns:

    Type Description bool

    True if the dataset is cached locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_cached(self) -> bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"is_preprocessed()","text":"

    Check if the dataset is preprocessed and available online or locally.

    Returns:

    Type Description bool

    True if the dataset is available remotely or locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_preprocessed(self) -> bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"no_init() classmethod","text":"

    Class method to avoid the __init__ method being called when the class is instantiated. Useful for debugging purposes or preprocessing data.

    Source code in openqdc/datasets/base.py
    @classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instanciated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"preprocess(upload=False, overwrite=True, as_zarr=True)","text":"

    Preprocess the dataset and save it.

    Parameters:

    Name Type Description Default upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True as_zarr bool

    Whether to save the data as zarr files

    True Source code in openqdc/datasets/base.py
    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            hether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"read_raw_entries()","text":"

    Preprocess the raw data (i.e. from the fetched source) into a list of dictionaries.

    Source code in openqdc/datasets/base.py
    def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.

    Parameters:

    Name Type Description Default data_dict Dict[str, ndarray]

    Dictionary containing the preprocessed data.

    required upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True Source code in openqdc/datasets/base.py
    def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"save_xyz(idx, energy_method=0, path=None, ext=True)","text":"

    Save a single entry at index idx as an extxyz file.

    Parameters:

    Name Type Description Default idx int

    Index of the entry

    required energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file. If None, the current working directory is used.

    None ext bool

    Whether to include additional information like forces and other metadata (extxyz format)

    True Source code in openqdc/datasets/base.py
    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional informations like forces and other metadatas (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"set_distance_unit(value)","text":"

    Set a new distance unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New distance unit to set.

    required Source code in openqdc/datasets/base.py
    def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"set_energy_unit(value)","text":"

    Set a new energy unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New energy unit to set.

    required Source code in openqdc/datasets/base.py
    def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"to_xyz(energy_method=0, path=None)","text":"

    Save dataset as single xyz file (extended xyz format).

    Parameters:

    Name Type Description Default energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file

    None Source code in openqdc/datasets/base.py
    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"upload(overwrite=False, as_zarr=False)","text":"

    Upload the preprocessed data to the remote storage. Must be called after preprocess and requires write privileges.

    Parameters:

    Name Type Description Default overwrite bool

    Whether to overwrite the remote data if it already exists

    False as_zarr bool

    Whether to upload the data as zarr files

    False Source code in openqdc/datasets/base.py
    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n
    "},{"location":"API/e0_dispatcher.html","title":"e0 Dispatcher","text":""},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies","title":"AtomEnergies","text":"

    Manager class that interfaces with the isolated atom energies classes and provides the general functions to retrieve the data

    Source code in openqdc/datasets/energies.py
    class AtomEnergies:\n    \"\"\"\n    Manager class for interface with the isolated atom energies classes\n    and providing the generals function to retrieve the data\n    \"\"\"\n\n    def __init__(self, data, **kwargs) -> None:\n        self.atom_energies = data.energy_type\n        self.factory = dispatch_factory(data, **kwargs)\n\n    @property\n    def e0s_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_matrix\n\n    @property\n    def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_dict\n\n    def __str__(self):\n        return f\"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}\"\n\n    def __repr__(self):\n        return str(self)\n\n    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n        \"\"\"\n        Retrieve a key from the isolated atom dictionary.\n        Item can be written as tuple(Symbol, charge),\n        tuple(Chemical number, charge). 
If no charge is passed,\n        it will be automatically set to 0.\n\n        Examples:\n            AtomEnergies[6], AtomEnergies[6,1], \\n\n            AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n            AtomEnergies[(\"C,1)]\n\n        Parameters:\n            item:\n                AtomSpecies object or tuple with the atom symbol and charge\n\n        Returns:\n            AtomEnergy object with the isolated atom energy\n        \"\"\"\n        try:\n            atom, charge = item[0], item[1]\n        except TypeError:\n            atom = item\n            charge = 0\n        except IndexError:\n            atom = item[0]\n            charge = 0\n        if not isinstance(atom, str):\n            atom = ATOM_SYMBOLS[atom]\n        return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_dict","title":"e0s_dict: Dict[AtomSpecies, AtomEnergy] property","text":"

    Return the isolated atom energies dictionary

    Returns:

    Type Description Dict[AtomSpecies, AtomEnergy]

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_matrix","title":"e0s_matrix: np.ndarray property","text":"

    Return the isolated atom energies matrix

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.__getitem__","title":"__getitem__(item)","text":"

    Retrieve a key from the isolated atom dictionary. Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0.

    Examples:

    AtomEnergies[6], AtomEnergies[6,1],

    AtomEnergies[\"C\",1], AtomEnergies[(6,1)],

    AtomEnergies[(\"C\",1)]

    Parameters:

    Name Type Description Default item AtomSpecies

    AtomSpecies object or tuple with the atom symbol and charge

    required

    Returns:

    Type Description AtomEnergy

    AtomEnergy object with the isolated atom energy

    Source code in openqdc/datasets/energies.py
    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n    \"\"\"\n    Retrieve a key from the isolated atom dictionary.\n    Item can be written as tuple(Symbol, charge),\n    tuple(Chemical number, charge). If no charge is passed,\n    it will be automatically set to 0.\n\n    Examples:\n        AtomEnergies[6], AtomEnergies[6,1], \\n\n        AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n        AtomEnergies[(\"C,1)]\n\n    Parameters:\n        item:\n            AtomSpecies object or tuple with the atom symbol and charge\n\n    Returns:\n        AtomEnergy object with the isolated atom energy\n    \"\"\"\n    try:\n        atom, charge = item[0], item[1]\n    except TypeError:\n        atom = item\n        charge = 0\n    except IndexError:\n        atom = item[0]\n        charge = 0\n    if not isinstance(atom, str):\n        atom = ATOM_SYMBOLS[atom]\n    return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy","title":"AtomEnergy dataclass","text":"

    Data structure to store isolated atom energies and the standard deviation associated with the value. By default the std will be 1 if no value was calculated or not available (formation energy case)

    Source code in openqdc/datasets/energies.py
    @dataclass\nclass AtomEnergy:\n    \"\"\"\n    Datastructure to store isolated atom energies\n    and the std deviation associated to the value.\n    By default the std will be 1 if no value was calculated\n    or not available (formation energy case)\n    \"\"\"\n\n    mean: np.array\n    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))\n\n    def __post_init__(self):\n        if not isinstance(self.mean, np.ndarray):\n            self.mean = np.array([self.mean], dtype=np.float32)\n\n    def append(self, other: \"AtomEnergy\"):\n        \"\"\"\n        Append the mean and std of another atom energy\n        \"\"\"\n        self.mean = np.append(self.mean, other.mean)\n        self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy.append","title":"append(other)","text":"

    Append the mean and std of another atom energy

    Source code in openqdc/datasets/energies.py
    def append(self, other: \"AtomEnergy\"):\n    \"\"\"\n    Append the mean and std of another atom energy\n    \"\"\"\n    self.mean = np.append(self.mean, other.mean)\n    self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomSpecies","title":"AtomSpecies dataclass","text":"

    Structure that defines a tuple of chemical species and charge and provides hashing and automatic conversion from atomic number to chemical symbol

    Source code in openqdc/datasets/energies.py
    @dataclass(frozen=False, eq=True)\nclass AtomSpecies:\n    \"\"\"\n    Structure that defines a tuple of chemical specie and charge\n    and provide hash and automatic conversion from atom number to\n    checmical symbol\n    \"\"\"\n\n    symbol: Union[str, int]\n    charge: int = 0\n\n    def __post_init__(self):\n        if not isinstance(self.symbol, str):\n            self.symbol = ATOM_SYMBOLS[self.symbol]\n        self.number = ATOMIC_NUMBERS[self.symbol]\n\n    def __hash__(self):\n        return hash((self.symbol, self.charge))\n\n    def __eq__(self, other):\n        if not isinstance(other, AtomSpecies):\n            symbol, charge = other[0], other[1]\n            other = AtomSpecies(symbol=symbol, charge=charge)\n        return (self.number, self.charge) == (other.number, other.charge)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface","title":"IsolatedEnergyInterface","text":"

    Bases: ABC

    Abstract class that defines the interface for the different implementations of an isolated atom energy value

    Source code in openqdc/datasets/energies.py
    class IsolatedEnergyInterface(ABC):\n    \"\"\"\n    Abstract class that defines the interface for the\n    different implementation of an isolated atom energy value\n    \"\"\"\n\n    def __init__(self, data, **kwargs):\n        \"\"\"\n        Parameters:\n            data : openqdc.datasets.Dataset\n                Dataset object that contains the information\n                about the isolated atom energies. Info will be passed\n                by references\n            kwargs : dict\n                Additional arguments that will be passed to the\n                selected energy class. Mostly used for regression\n                to pass the regressor_kwargs.\n        \"\"\"\n        self._e0_matrixs = []\n        self._e0_dict = None\n        self.kwargs = kwargs\n        self.data = data\n        self._post_init()\n\n    @property\n    def refit(self) -> bool:\n        return self.data.refit_e0s\n\n    @abstractmethod\n    def _post_init(self):\n        \"\"\"\n        Main method to fetch/compute/recomputed the isolated atom energies.\n        Need to be implemented in all child classes.\n        \"\"\"\n        pass\n\n    def __len__(self):\n        return len(self.data.energy_methods)\n\n    @property\n    def e0_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies matrixes\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return np.array(self._e0_matrixs)\n\n    @property\n    def e0_dict(self) -> Dict:\n        \"\"\"\n        Return the isolated atom energies dict\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n\n        return self._e0s_dict\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_dict","title":"e0_dict: Dict property","text":"

    Return the isolated atom energies dict

    Returns:

    Type Description Dict

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_matrix","title":"e0_matrix: np.ndarray property","text":"

    Return the isolated atom energies matrices

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.__init__","title":"__init__(data, **kwargs)","text":"

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {} Source code in openqdc/datasets/energies.py
    def __init__(self, data, **kwargs):\n    \"\"\"\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n    \"\"\"\n    self._e0_matrixs = []\n    self._e0_dict = None\n    self.kwargs = kwargs\n    self.data = data\n    self._post_init()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.NullEnergy","title":"NullEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns a null (zeros) matrix for the isolated atom energies in case no energies are available.

    Source code in openqdc/datasets/energies.py
    class NullEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a null (zeros) matrix for the isolated atom energies in case\n    of no energies are available.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for _ in self.data.__energy_methods__:\n            for key, values in PotentialMethod.NONE.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.PhysicalEnergy","title":"PhysicalEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns physical (SE, DFT, etc.) isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class PhysicalEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a physical (SE,DFT,etc) isolated atom energies.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for method in self.data.__energy_methods__:\n            for key, values in method.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy","title":"RegressionEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that computes and returns the regressed isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class RegressionEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that compute and returns the regressed isolated atom energies.\n    \"\"\"\n\n    def _post_init(self):\n        if not self.attempt_load() or self.refit:\n            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)\n            E0s, cov = self._compute_regression_e0s()\n            self._set_lin_atom_species_dict(E0s, cov)\n        self._set_linear_e0s()\n\n    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Try to compute the regressed isolated atom energies.\n        raise an error if the regression fails.\n        return the regressed isolated atom energies and the uncertainty values.\n\n        Returns:\n            Tuple with the regressed isolated atom energies and the uncertainty values of the regression\n            if available.\n        \"\"\"\n        try:\n            E0s, cov = self.regressor.solve()\n        except np.linalg.LinAlgError:\n            logger.warning(f\"Failed to compute E0s using {self.regressor.solver_type} regression.\")\n            raise np.linalg.LinAlgError\n        return E0s, cov\n\n    def _set_lin_atom_species_dict(self, E0s, covs) -> None:\n        \"\"\"\n        Set the regressed isolated atom energies in a dictionary format\n        and Save the values in a pickle file to easy loading.\n        \"\"\"\n        atomic_energies_dict = {}\n        for i, z in enumerate(self.regressor.numbers):\n            for charge in range(-10, 11):\n                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])\n            # atomic_energies_dict[z] = E0s[i]\n        self._e0s_dict = atomic_energies_dict\n        self.save_e0s()\n\n    def _set_linear_e0s(self) -> None:\n        \"\"\"\n        Transform the e0s dictionary into the correct e0s\n        matrix format.\n        \"\"\"\n        new_e0s = 
[np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]\n        for z, e0 in self._e0s_dict.items():\n            for i in range(len(self)):\n                # new_e0s[i][z, :] = e0[i]\n                new_e0s[i][z.number, z.charge] = e0.mean[i]\n            # for atom_sp, values in\n        self._e0_matrixs = new_e0s\n\n    def save_e0s(self) -> None:\n        \"\"\"\n        Save the regressed isolated atom energies in a pickle file.\n        \"\"\"\n        save_pkl(self._e0s_dict, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Try to load the regressed isolated atom energies from the\n        object pickle file and return the success of the operation.\n        \"\"\"\n        try:\n            self._e0s_dict = load_pkl(self.preprocess_path)\n            logger.info(f\"Found energy file for {str(self)}.\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Energy file for {str(self)} not found.\")\n            return False\n\n    @property\n    def preprocess_path(self):\n        \"\"\"\n        Return the path to the object pickle file.\n        \"\"\"\n        path = p_join(self.data.root, \"preprocessed\", str(self) + \".pkl\")\n        return path\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.preprocess_path","title":"preprocess_path property","text":"

    Return the path to the object pickle file.

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.attempt_load","title":"attempt_load()","text":"

    Try to load the regressed isolated atom energies from the object pickle file and return the success of the operation.

    Source code in openqdc/datasets/energies.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Try to load the regressed isolated atom energies from the\n    object pickle file and return the success of the operation.\n    \"\"\"\n    try:\n        self._e0s_dict = load_pkl(self.preprocess_path)\n        logger.info(f\"Found energy file for {str(self)}.\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Energy file for {str(self)} not found.\")\n        return False\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.save_e0s","title":"save_e0s()","text":"

    Save the regressed isolated atom energies in a pickle file.

    Source code in openqdc/datasets/energies.py
    def save_e0s(self) -> None:\n    \"\"\"\n    Save the regressed isolated atom energies in a pickle file.\n    \"\"\"\n    save_pkl(self._e0s_dict, self.preprocess_path)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.dispatch_factory","title":"dispatch_factory(data, **kwargs)","text":"

    Factory function that selects the correct energy class for the fetching/calculation of isolated atom energies.

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference.

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {}

    Returns:

    Type Description IsolatedEnergyInterface

    Initialized IsolatedEnergyInterface-like object

    Source code in openqdc/datasets/energies.py
    def dispatch_factory(data: Any, **kwargs: Dict) -> \"IsolatedEnergyInterface\":\n    \"\"\"\n    Factory function that select the correct\n    energy class for the fetching/calculation\n    of isolated atom energies.\n\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n\n    Returns:\n        Initialized IsolatedEnergyInterface-like object\n    \"\"\"\n    if data.energy_type == \"formation\":\n        return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"regression\":\n        try:\n            return RegressionEnergy(data, **kwargs)\n        except np.linalg.LinAlgError:\n            logger.warning(\"Error! Using physical energies instead.\")\n            return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"null\":\n        return NullEnergy(data, **kwargs)\n
    "},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"GeneralStructure","text":"

    Bases: ABC

    Abstract factory class for dataset types in the openQDC package.

    Source code in openqdc/datasets/structure.py
    class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -> Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -> str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -> List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n      
  overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n         
   data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -> any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"load_fn: Callable abstractmethod property","text":"

    Function to use for loading the data. Must be implemented by the child class.

    Returns:

    Type Description Callable

    the function to use for loading the data

    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"add_extension(filename)","text":"

    Add the correct extension to a filename

    Parameters:

    Name Type Description Default filename str

    the filename to add the extension to

    required

    Returns:

    Type Description str

    the filename with the extension

    Source code in openqdc/datasets/structure.py
    def add_extension(self, filename: str) -> str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"join_and_ext(path, filename)","text":"

    Join a path and a filename and add the correct extension.

    Parameters:

    Name Type Description Default path Union[str, PathLike]

    the path to join

    required filename str

    the filename to join

    required

    Returns:

    Type Description Union[str, PathLike]

    the joined path with the correct extension

    Source code in openqdc/datasets/structure.py
    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, self.add_extension(filename))\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)","text":"

    Main method to load the data from a filetype structure like memmap or zarr.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_types Dict[str, dtype]

    dictionary of data types for each key

    required data_shapes Dict[str, Tuple[int, int]]

    dictionary of shapes for each key

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite) abstractmethod","text":"

    Load extra files required to define other types of data. Must be implemented by the child class.

    Parameters:

    Name Type Description Default data Dict[str, ndarray]

    dictionary of data to load

    required preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required pkl_data_keys List[str]

    list of keys to load from the extra files

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) abstractmethod","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to save to the data files

    required data_dict Dict[str, ndarray]

    dictionary of data to save

    required extra_data_keys List[str]

    list of keys to save to the extra data file

    required extra_data_types Dict[str, type]

    dictionary of data types for each key

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -> List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"unpack(data)","text":"

    Unpack the data from the loaded file.

    Parameters:

    Name Type Description Default data any

    the data to unpack

    required

    Returns:

    Type Description any

    the unpacked data

    Source code in openqdc/datasets/structure.py
    def unpack(self, data: any) -> any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"MemMapDataset","text":"

    Bases: GeneralStructure

    Dataset structure for memory-mapped numpy arrays and props.pkl files.

    Source code in openqdc/datasets/structure.py
    class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n     
           else:\n                    data[key] = x\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"ZarrDataset","text":"

    Bases: GeneralStructure

    Dataset structure for zarr files.

    Source code in openqdc/datasets/structure.py
    class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n            if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        
local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n                data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n
    "},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"InteractionMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n
    "},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get an empty atomization energy dictionary, because interaction methods don't require one.

    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"PotentialMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, 
BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    
DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, 
BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, 
BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = 
Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    
TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n    XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], 
dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n
    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"QmMethod","text":"

    Bases: Enum

    Source code in openqdc/methods/enums.py
    class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"atom_energies_matrix property","text":"

    Get the atomization energy matrix

    "},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"to_e_matrix(atom_energies)","text":"

    Build the matrix of isolated atom energies from a dict of non-null energy values.

    Parameters:

    Name Type Description Default atom_energies Dict

    Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values

    required

    np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)

    Type Description ndarray

    Matrix containing the isolated atom energies for each atom and charge written in the form:

            |   | -2 | -1 | 0 | +1 | +2 | <- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n
    Source code in openqdc/methods/atom_energies.py
    def to_e_matrix(atom_energies: Dict) -> np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | <- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) > 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n
    "},{"location":"API/properties.html","title":"Defined properties for datasets","text":""},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn","title":"DatasetPropertyMixIn","text":"

    Mixin class for BaseDataset class to add properties that are common to all datasets.

    Source code in openqdc/datasets/properties.py
    class DatasetPropertyMixIn:\n    \"\"\"\n    Mixin class for BaseDataset class to add\n    properties that are common to all datasets.\n    \"\"\"\n\n    @property\n    def atoms_per_molecules(self):\n        try:\n            if hasattr(self, \"_n_atoms\"):\n                return self._n_atoms\n            self._n_atoms = self.data[\"n_atoms\"]\n            return self._n_atoms\n        except:  # noqa\n            return None\n\n    @property\n    def _stats(self):\n        return self.__stats__\n\n    def _compute_average_nb_atoms(self):\n        self.__average_nb_atoms__ = np.mean(self.data[\"n_atoms\"])\n\n    @property\n    def average_n_atoms(self) -> int:\n        \"\"\"\n        Average number of atoms in a molecule in the dataset.\n\n        Returns:\n            Average number of atoms in a molecule in the dataset.\n        \"\"\"\n        if self.__average_nb_atoms__ is None:\n            raise StatisticsNotAvailableError(self.__name__)\n        return self.__average_nb_atoms__\n\n    @property\n    def numbers(self) -> np.ndarray:\n        \"\"\"\n        Unique atomic numbers in the dataset\n\n        Returns:\n            Array of the unique atomic numbers in the dataset\n        \"\"\"\n        if hasattr(self, \"_numbers\"):\n            return self._numbers\n        self._numbers = pd.unique(self.data[\"atomic_inputs\"][..., 0]).astype(np.int32)\n        return self._numbers\n\n    @property\n    def charges(self) -> np.ndarray:\n        \"\"\"\n        Unique charges in the dataset\n\n        Returns:\n            Array of the unique charges in the dataset\n        \"\"\"\n        if hasattr(self, \"_charges\"):\n            return self._charges\n        self._charges = np.unique(self.data[\"atomic_inputs\"][..., :2], axis=0).astype(np.int32)\n        return self._charges\n\n    @property\n    def min_max_charges(self) -> Tuple[int, int]:\n        \"\"\"\n        Minimum and maximum charges in the dataset\n\n        Returns:\n            
(min_charge, max_charge)\n        \"\"\"\n        if hasattr(self, \"_min_max_charges\"):\n            return self._min_max_charges\n        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])\n        return self._min_max_charges\n\n    @property\n    def chemical_species(self) -> np.ndarray:\n        \"\"\"\n        Chemical symbols in the dataset\n\n        Returns:\n            Array of the chemical symbols in the dataset\n        \"\"\"\n        return np.array(ATOM_SYMBOLS)[self.numbers]\n
    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.average_n_atoms","title":"average_n_atoms: int property","text":"

    Average number of atoms in a molecule in the dataset.

    Returns:

    Type Description int

    Average number of atoms in a molecule in the dataset.

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.charges","title":"charges: np.ndarray property","text":"

    Unique charges in the dataset

    Returns:

    Type Description ndarray

    Array of the unique charges in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.chemical_species","title":"chemical_species: np.ndarray property","text":"

    Chemical symbols in the dataset

    Returns:

    Type Description ndarray

    Array of the chemical symbols in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.min_max_charges","title":"min_max_charges: Tuple[int, int] property","text":"

    Minimum and maximum charges in the dataset

    Returns:

    Type Description Tuple[int, int]

    (min_charge, max_charge)

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.numbers","title":"numbers: np.ndarray property","text":"

    Unique atomic numbers in the dataset

    Returns:

    Type Description ndarray

    Array of the unique atomic numbers in the dataset

    "},{"location":"API/regressor.html","title":"Normalization regressor","text":"

    Linear Atom Energies regression utilities.

    "},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"LinearSolver","text":"

    Bases: Solver

    Linear regression solver.

    Note

    No uncertainty is reported for this solver, as it is quite small.

    Source code in openqdc/utils/regressor.py
    class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"Regressor","text":"

    Regressor class for preparing and solving the regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:

    X = [n_samples, n_species] (number of atoms of each species per sample)

    Y = [n_samples, ] (energies)

    The regression problem is solved by solving the linear system X E0 = Y.

    Example

    For a system of 2 samples (H2O, CH4)

    n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -> X = [2, 1, 0]\n\nCH4 = 4H, 1C -> X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n

    Linear system to solve

    [[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n
    Source code in openqdc/utils/regressor.py
    class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -> X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -> X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If >1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n        \"\"\"\n        Initialize the regressor object from an openqdc dataset. 
This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample < 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing 
inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies.

    Parameters:

    Name Type Description Default energies ndarray

    numpy array of energies in the shape (n_samples, n_energy_methods)

    required atomic_numbers ndarray

    numpy array of atomic numbers in the shape (n_atoms,)

    required position_idx_range ndarray

    array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset

    required solver_type str

    Type of solver to use. [\"linear\", \"ridge\"]

    'linear' stride int

    Stride to use for the regression.

    1 subsample Optional[Union[float, int]]

    Subsample the dataset. If a float, it is interpreted as a fraction of the dataset to use. If >1 it is interpreted as the number of samples to use.

    None remove_nan bool

    Sanitize the dataset by removing energies samples with NaN values.

    True *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {} Source code in openqdc/utils/regressor.py
    def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If >1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, *args, **kwargs) classmethod","text":"

    Initialize the regressor object from an openqdc dataset. This is the default method. *args and **kwargs are passed to the __init__ method and depend on the specific regressor.

    Parameters:

    Name Type Description Default dataset any

    openqdc dataset object.

    required *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {}

    Returns:

    Type Description Regressor

    Instance of the regressor class.

    Source code in openqdc/utils/regressor.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"solve()","text":"

    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"RidgeSolver","text":"

    Bases: Solver

    Ridge regression solver.

    Source code in openqdc/utils/regressor.py
    class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li->i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"Solver","text":"

    Bases: ABC

    Abstract class for regression solvers.

    Source code in openqdc/utils/regressor.py
    class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n        Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"solve(X, Y) abstractmethod staticmethod","text":"

    Main method to solve the regression problem. Must be implemented in all the subclasses.

    Parameters:

    Name Type Description Default X ndarray

    Input features of shape (n_samples, n_species)

    required Y ndarray

    Target values of shape (n_samples,) (energy values for the regression)

    required

    Returns:

    Type Description Tuple[ndarray, Optional[ndarray]]

    Tuple of predicted values and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    @staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"atom_standardization(X, y)","text":"

    Standardize the energies and the atom counts. This will make the calculated uncertainty more meaningful.

    Source code in openqdc/utils/regressor.py
    def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"non_nan_idxs(array)","text":"

    Return non nan indices of an array.

    Source code in openqdc/utils/regressor.py
    def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n
    "},{"location":"API/statistics.html","title":"Statistics","text":""},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator","title":"AbstractStatsCalculator","text":"

    Bases: ABC

    Abstract class that defines the interface for all calculator objects and the methods to compute the statistics.

    Source code in openqdc/datasets/statistics.py
    class AbstractStatsCalculator(ABC):\n    \"\"\"\n    Abstract class that defines the interface for all\n    the calculators object and the methods to\n    compute the statistics.\n    \"\"\"\n\n    # State Dependencies of the calculator to skip part of the calculation\n    state_dependency = []\n    name = None\n\n    def __init__(\n        self,\n        name: str,\n        energy_type: Optional[str] = None,\n        force_recompute: bool = False,\n        energies: Optional[np.ndarray] = None,\n        n_atoms: Optional[np.ndarray] = None,\n        atom_species: Optional[np.ndarray] = None,\n        position_idx_range: Optional[np.ndarray] = None,\n        e0_matrix: Optional[np.ndarray] = None,\n        atom_charges: Optional[np.ndarray] = None,\n        forces: Optional[np.ndarray] = None,\n    ):\n        \"\"\"\n        Parameters:\n            name :\n                Name of the dataset for saving and loading.\n            energy_type :\n                Type of the energy for the computation of the statistics. 
Used for loading and saving.\n            force_recompute :\n                Flag to force the recomputation of the statistics\n            energies : n\n                Energies of the dataset\n            n_atoms :\n                Number of atoms in the dataset\n            atom_species :\n                Atomic species of the dataset\n            position_idx_range : n\n                Position index range of the dataset\n            e0_matrix :\n                Isolated atom energies matrix of the dataset\n            atom_charges :\n                Atomic charges of the dataset\n            forces :\n                Forces of the dataset\n        \"\"\"\n        self.name = name\n        self.energy_type = energy_type\n        self.force_recompute = force_recompute\n        self.energies = energies\n        self.forces = forces\n        self.position_idx_range = position_idx_range\n        self.e0_matrix = e0_matrix\n        self.n_atoms = n_atoms\n        self.atom_species_charges_tuple = (atom_species, atom_charges)\n        self._root = p_join(get_local_cache(), self.name)\n        if atom_species is not None and atom_charges is not None:\n            # by value not reference\n            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n\n    @property\n    def has_forces(self) -> bool:\n        return self.forces is not None\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"statistics\", self.name + f\"_{str(self)}\" + \".pkl\")\n        return path\n\n    @property\n    def root(self):\n        \"\"\"\n        Path to the dataset folder\n        \"\"\"\n        return self._root\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset, recompute: bool = False):\n        \"\"\"\n        Create a calculator object from a dataset object.\n        \"\"\"\n        obj = cls(\n            name=dataset.__name__,\n            force_recompute=recompute,\n            
energy_type=dataset.energy_type,\n            energies=dataset.data[\"energies\"],\n            forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n            n_atoms=dataset.data[\"n_atoms\"],\n            position_idx_range=dataset.data[\"position_idx_range\"],\n            atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n            atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n            e0_matrix=dataset.__isolated_atom_energies__,\n        )\n        obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n        return obj\n\n    @abstractmethod\n    def compute(self) -> StatisticsResults:\n        \"\"\"\n        Abstract method to compute the statistics.\n        Must return a StatisticsResults object and be implemented\n        in all the childs\n        \"\"\"\n        raise NotImplementedError\n\n    def save_statistics(self) -> None:\n        \"\"\"\n        Save statistics file to the dataset folder as a pkl file\n        \"\"\"\n        save_pkl(self.result, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Load precomputed statistics file and return the success of the operation\n        \"\"\"\n        try:\n            self.result = load_pkl(self.preprocess_path)\n            logger.info(f\"Statistics for {str(self)} loaded successfully\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n            return False\n\n    def _setup_deps(self, state: Dict) -> None:\n        \"\"\"\n        Check if the dependencies of calculators are satisfied\n        from the state object and set the attributes of the calculator\n        to skip part of the calculation\n        \"\"\"\n        self.state = state\n        self.deps_satisfied = all([dep in state for dep in self.state_dependency])\n        if self.deps_satisfied:\n            for dep in self.state_dependency:\n                setattr(self, dep, state[dep])\n\n    def write_state(self, update: Dict) -> None:\n        \"\"\"\n        Write/update the state dictionary with the update dictionary\n\n        update:\n            dictionary containing the update to the state\n        \"\"\"\n        self.state.update(update)\n\n    def run(self, state: Dict) -> None:\n        \"\"\"\n        Main method to run the calculator.\n        Setup the dependencies from the state dictionary\n        Check if the statistics are already computed and load them or\n        recompute them\n        Save the statistics in the correct folder\n\n        state:\n            dictionary containing the state of the calculator\n        \"\"\"\n        self._setup_deps(state)\n        if self.force_recompute or not self.attempt_load():\n            self.result = self.compute()\n            self.save_statistics()\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.root","title":"root property","text":"

    Path to the dataset folder

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.__init__","title":"__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, forces=None)","text":"

    Parameters:

    Name Type Description Default name

    Name of the dataset for saving and loading.

    required energy_type

    Type of the energy for the computation of the statistics. Used for loading and saving.

    None force_recompute

    Flag to force the recomputation of the statistics

    False energies

    n Energies of the dataset

    None n_atoms

    Number of atoms in the dataset

    None atom_species

    Atomic species of the dataset

    None position_idx_range

    n Position index range of the dataset

    None e0_matrix

    Isolated atom energies matrix of the dataset

    None atom_charges

    Atomic charges of the dataset

    None forces

    Forces of the dataset

    None Source code in openqdc/datasets/statistics.py
    def __init__(\n    self,\n    name: str,\n    energy_type: Optional[str] = None,\n    force_recompute: bool = False,\n    energies: Optional[np.ndarray] = None,\n    n_atoms: Optional[np.ndarray] = None,\n    atom_species: Optional[np.ndarray] = None,\n    position_idx_range: Optional[np.ndarray] = None,\n    e0_matrix: Optional[np.ndarray] = None,\n    atom_charges: Optional[np.ndarray] = None,\n    forces: Optional[np.ndarray] = None,\n):\n    \"\"\"\n    Parameters:\n        name :\n            Name of the dataset for saving and loading.\n        energy_type :\n            Type of the energy for the computation of the statistics. Used for loading and saving.\n        force_recompute :\n            Flag to force the recomputation of the statistics\n        energies : n\n            Energies of the dataset\n        n_atoms :\n            Number of atoms in the dataset\n        atom_species :\n            Atomic species of the dataset\n        position_idx_range : n\n            Position index range of the dataset\n        e0_matrix :\n            Isolated atom energies matrix of the dataset\n        atom_charges :\n            Atomic charges of the dataset\n        forces :\n            Forces of the dataset\n    \"\"\"\n    self.name = name\n    self.energy_type = energy_type\n    self.force_recompute = force_recompute\n    self.energies = energies\n    self.forces = forces\n    self.position_idx_range = position_idx_range\n    self.e0_matrix = e0_matrix\n    self.n_atoms = n_atoms\n    self.atom_species_charges_tuple = (atom_species, atom_charges)\n    self._root = p_join(get_local_cache(), self.name)\n    if atom_species is not None and atom_charges is not None:\n        # by value not reference\n        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.attempt_load","title":"attempt_load()","text":"

    Load precomputed statistics file and return the success of the operation

    Source code in openqdc/datasets/statistics.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Load precomputed statistics file and return the success of the operation\n    \"\"\"\n    try:\n        self.result = load_pkl(self.preprocess_path)\n        logger.info(f\"Statistics for {str(self)} loaded successfully\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Statistics for {str(self)} not found. Computing...\")\n        return False\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.compute","title":"compute() abstractmethod","text":"

    Abstract method to compute the statistics. Must return a StatisticsResults object and be implemented in all child classes.

    Source code in openqdc/datasets/statistics.py
    @abstractmethod\ndef compute(self) -> StatisticsResults:\n    \"\"\"\n    Abstract method to compute the statistics.\n    Must return a StatisticsResults object and be implemented\n    in all the childs\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, recompute=False) classmethod","text":"

    Create a calculator object from a dataset object.

    Source code in openqdc/datasets/statistics.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset, recompute: bool = False):\n    \"\"\"\n    Create a calculator object from a dataset object.\n    \"\"\"\n    obj = cls(\n        name=dataset.__name__,\n        force_recompute=recompute,\n        energy_type=dataset.energy_type,\n        energies=dataset.data[\"energies\"],\n        forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n        n_atoms=dataset.data[\"n_atoms\"],\n        position_idx_range=dataset.data[\"position_idx_range\"],\n        atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n        atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n        e0_matrix=dataset.__isolated_atom_energies__,\n    )\n    obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n    return obj\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.run","title":"run(state)","text":"

    Main method to run the calculator: set up the dependencies from the state dictionary, check whether the statistics are already computed and either load them or recompute them, and save the statistics in the correct folder.

    state

    dictionary containing the state of the calculator

    Source code in openqdc/datasets/statistics.py
    def run(self, state: Dict) -> None:\n    \"\"\"\n    Main method to run the calculator.\n    Setup the dependencies from the state dictionary\n    Check if the statistics are already computed and load them or\n    recompute them\n    Save the statistics in the correct folder\n\n    state:\n        dictionary containing the state of the calculator\n    \"\"\"\n    self._setup_deps(state)\n    if self.force_recompute or not self.attempt_load():\n        self.result = self.compute()\n        self.save_statistics()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.save_statistics","title":"save_statistics()","text":"

    Save statistics file to the dataset folder as a pkl file

    Source code in openqdc/datasets/statistics.py
    def save_statistics(self) -> None:\n    \"\"\"\n    Save statistics file to the dataset folder as a pkl file\n    \"\"\"\n    save_pkl(self.result, self.preprocess_path)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.write_state","title":"write_state(update)","text":"

    Write/update the state dictionary with the update dictionary

    update

    dictionary containing the update to the state

    Source code in openqdc/datasets/statistics.py
    def write_state(self, update: Dict) -> None:\n    \"\"\"\n    Write/update the state dictionary with the update dictionary\n\n    update:\n        dictionary containing the update to the state\n    \"\"\"\n    self.state.update(update)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.EnergyStatistics","title":"EnergyStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for energy related statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass EnergyStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for energy related statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForceStatistics","title":"ForceStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for force statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass ForceStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for force statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n    component_mean: Optional[np.ndarray]\n    component_std: Optional[np.ndarray]\n    component_rms: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForcesCalculatorStats","title":"ForcesCalculatorStats","text":"

    Bases: AbstractStatsCalculator

    Forces statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class ForcesCalculatorStats(AbstractStatsCalculator):\n    \"\"\"\n    Forces statistics calculator class\n    \"\"\"\n\n    def compute(self) -> ForceStatistics:\n        if not self.has_forces:\n            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)\n        converted_force_data = self.forces\n        num_methods = converted_force_data.shape[2]\n        mean = np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)\n        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)\n        component_mean = np.nanmean(converted_force_data, axis=0)\n        component_std = np.nanstd(converted_force_data, axis=0)\n        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))\n        return ForceStatistics(\n            mean=np.atleast_2d(mean),\n            std=np.atleast_2d(std),\n            component_mean=np.atleast_2d(component_mean),\n            component_std=np.atleast_2d(component_std),\n            component_rms=np.atleast_2d(component_rms),\n        )\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyInterface","title":"FormationEnergyInterface","text":"

    Bases: AbstractStatsCalculator, ABC

    Formation Energy interface calculator class. Define the use of the dependency formation_energy in the compute method

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyInterface(AbstractStatsCalculator, ABC):\n    \"\"\"\n    Formation Energy interface calculator class.\n    Define the use of the dependency formation_energy in the\n    compute method\n    \"\"\"\n\n    state_dependency = [\"formation_energy\"]\n\n    def compute(self) -> EnergyStatistics:\n        # if the state has not the dependency satisfied\n        if not self.deps_satisfied:\n            # run the main computation\n            from openqdc.utils.constants import MAX_CHARGE\n\n            splits_idx = self.position_idx_range[:, 1]\n            s = np.array(self.atom_species_charges_tuple, dtype=int)\n            s[:, 1] += MAX_CHARGE\n            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]\n            converted_energy_data = self.energies\n            E = []\n            for i, matrix in enumerate(matrixs):\n                c = np.cumsum(np.append([0], matrix))[splits_idx]\n                c[1:] = c[1:] - c[:-1]\n                E.append(converted_energy_data[:, i] - c)\n        else:\n            # if the dependency is satisfied get the dependency\n            E = getattr(self, self.state_dependency[0])\n        self.write_state({self.state_dependency[0]: E})\n        E = np.array(E).T\n        return self._compute(E)\n\n    @abstractmethod\n    def _compute(self, energy) -> EnergyStatistics:\n        raise NotImplementedError\n\n    def __str__(self) -> str:\n        # override the __str__ method to add the energy type to the name\n        # to differentiate between formation and regression type\n        return f\"{self.__class__.__name__.lower()}_{self.energy_type.lower()}\"\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyStats","title":"FormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        formation_E_mean = np.nanmean(energy, axis=0)\n        formation_E_std = np.nanstd(energy, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.PerAtomFormationEnergyStats","title":"PerAtomFormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Per atom Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class PerAtomFormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Per atom Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)\n        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager","title":"StatisticManager","text":"

    Manager class that automatically handles the shared state between the statistic calculators

    Source code in openqdc/datasets/statistics.py
    class StatisticManager:\n    \"\"\"\n    Manager class that automatically handle the shared state between\n    the statistic calculators\n    \"\"\"\n\n    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n        \"\"\"\n        Parameters:\n            dataset : openqdc.datasets.base.BaseDataset\n                The dataset object to compute the statistics\n            recompute:\n                Flag to recompute the statistics\n            *statistic_calculators:\n                List of statistic calculators to run\n        \"\"\"\n        self._state = {}\n        self._results = {}\n        self._statistic_calculators = [\n            statistic_calculators.from_openqdc_dataset(dataset, recompute)\n            for statistic_calculators in statistic_calculators\n        ]\n\n    @property\n    def state(self) -> Dict:\n        \"\"\"\n        Return the dictionary state of the manager\n\n        Returns:\n            State of the StatisticManager\n        \"\"\"\n        return self._state\n\n    def reset_state(self):\n        \"\"\"\n        Reset the state dictionary\n        \"\"\"\n        self._state = {}\n\n    def reset_results(self):\n        \"\"\"\n        Reset the results dictionary\n        \"\"\"\n        self._results = {}\n\n    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n        \"\"\"\n        Return the value of the key in the state dictionary\n\n        Parameters:\n            key: str, default = None\n        Returns:\n            the value of the key in the state dictionary\n            or the whole state dictionary if key is None\n        \"\"\"\n        if key is None:\n            return self._state\n        return self._state.get(key, None)\n\n    def has_state(self, key: str) -> bool:\n        \"\"\"\n        Check is state has key\n\n        Parameters:\n            key:\n                Key to check in the state dictionary\n\n        
Returns:\n            True if the key is in the state dictionary\n        \"\"\"\n        return key in self._state\n\n    def get_results(self, as_dict: bool = False):\n        \"\"\"\n        Aggregate results from all the calculators\n\n        Parameters:\n            as_dict:\n                Flag to return the results as a dictionary\n        \"\"\"\n        results = deepcopy(self._results)\n        if as_dict:\n            return {k: v.as_dict() for k, v in results.items()}\n        return {k: v for k, v in self._results.items()}\n\n    def run_calculators(self):\n        \"\"\"\n        Run the saved calculators and save the results in the manager\n        \"\"\"\n        logger.info(\"Processing dataset statistics\")\n        for calculator in self._statistic_calculators:\n            calculator.run(self.state)\n            self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.state","title":"state: Dict property","text":"

    Return the dictionary state of the manager

    Returns:

    Type Description Dict

    State of the StatisticManager

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.__init__","title":"__init__(dataset, recompute=False, *statistic_calculators)","text":"

    Parameters:

    Name Type Description Default dataset

    openqdc.datasets.base.BaseDataset The dataset object to compute the statistics

    required recompute bool

    Flag to recompute the statistics

    False *statistic_calculators AbstractStatsCalculator

    List of statistic calculators to run

    () Source code in openqdc/datasets/statistics.py
    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n    \"\"\"\n    Parameters:\n        dataset : openqdc.datasets.base.BaseDataset\n            The dataset object to compute the statistics\n        recompute:\n            Flag to recompute the statistics\n        *statistic_calculators:\n            List of statistic calculators to run\n    \"\"\"\n    self._state = {}\n    self._results = {}\n    self._statistic_calculators = [\n        statistic_calculators.from_openqdc_dataset(dataset, recompute)\n        for statistic_calculators in statistic_calculators\n    ]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_results","title":"get_results(as_dict=False)","text":"

    Aggregate results from all the calculators

    Parameters:

    Name Type Description Default as_dict bool

    Flag to return the results as a dictionary

    False Source code in openqdc/datasets/statistics.py
    def get_results(self, as_dict: bool = False):\n    \"\"\"\n    Aggregate results from all the calculators\n\n    Parameters:\n        as_dict:\n            Flag to return the results as a dictionary\n    \"\"\"\n    results = deepcopy(self._results)\n    if as_dict:\n        return {k: v.as_dict() for k, v in results.items()}\n    return {k: v for k, v in self._results.items()}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_state","title":"get_state(key=None)","text":"

    Return the value of the key in the state dictionary

    Parameters:

    Name Type Description Default key Optional[str]

    str, default = None

    None

    Returns: the value of the key in the state dictionary or the whole state dictionary if key is None

    Source code in openqdc/datasets/statistics.py
    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n    \"\"\"\n    Return the value of the key in the state dictionary\n\n    Parameters:\n        key: str, default = None\n    Returns:\n        the value of the key in the state dictionary\n        or the whole state dictionary if key is None\n    \"\"\"\n    if key is None:\n        return self._state\n    return self._state.get(key, None)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.has_state","title":"has_state(key)","text":"

    Check if the state has the key

    Parameters:

    Name Type Description Default key str

    Key to check in the state dictionary

    required

    Returns:

    Type Description bool

    True if the key is in the state dictionary

    Source code in openqdc/datasets/statistics.py
    def has_state(self, key: str) -> bool:\n    \"\"\"\n    Check is state has key\n\n    Parameters:\n        key:\n            Key to check in the state dictionary\n\n    Returns:\n        True if the key is in the state dictionary\n    \"\"\"\n    return key in self._state\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_results","title":"reset_results()","text":"

    Reset the results dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_results(self):\n    \"\"\"\n    Reset the results dictionary\n    \"\"\"\n    self._results = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_state","title":"reset_state()","text":"

    Reset the state dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_state(self):\n    \"\"\"\n    Reset the state dictionary\n    \"\"\"\n    self._state = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.run_calculators","title":"run_calculators()","text":"

    Run the saved calculators and save the results in the manager

    Source code in openqdc/datasets/statistics.py
    def run_calculators(self):\n    \"\"\"\n    Run the saved calculators and save the results in the manager\n    \"\"\"\n    logger.info(\"Processing dataset statistics\")\n    for calculator in self._statistic_calculators:\n        calculator.run(self.state)\n        self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults","title":"StatisticsResults","text":"

    Parent class to statistics results to provide general methods.

    Source code in openqdc/datasets/statistics.py
    class StatisticsResults:\n    \"\"\"\n    Parent class to statistics results\n    to provide general methods.\n    \"\"\"\n\n    def to_dict(self) -> Dict:\n        \"\"\"\n        Convert the class to a dictionary\n\n        Returns:\n            Dictionary representation of the class\n        \"\"\"\n        return asdict(self)\n\n    def transform(self, func: Callable):\n        \"\"\"\n        Apply a function to all the attributes of the class\n\n        Parameters:\n            func:\n                Function to apply to the attributes\n        \"\"\"\n        for k, v in self.to_dict().items():\n            if v is not None:\n                setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.to_dict","title":"to_dict()","text":"

    Convert the class to a dictionary

    Returns:

    Type Description Dict

    Dictionary representation of the class

    Source code in openqdc/datasets/statistics.py
    def to_dict(self) -> Dict:\n    \"\"\"\n    Convert the class to a dictionary\n\n    Returns:\n        Dictionary representation of the class\n    \"\"\"\n    return asdict(self)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.transform","title":"transform(func)","text":"

    Apply a function to all the attributes of the class

    Parameters:

    Name Type Description Default func Callable

    Function to apply to the attributes

    required Source code in openqdc/datasets/statistics.py
    def transform(self, func: Callable):\n    \"\"\"\n    Apply a function to all the attributes of the class\n\n    Parameters:\n        func:\n            Function to apply to the attributes\n    \"\"\"\n    for k, v in self.to_dict().items():\n        if v is not None:\n            setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.TotalEnergyStats","title":"TotalEnergyStats","text":"

    Bases: AbstractStatsCalculator

    Total Energy statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class TotalEnergyStats(AbstractStatsCalculator):\n    \"\"\"\n    Total Energy statistics calculator class\n    \"\"\"\n\n    def compute(self) -> EnergyStatistics:\n        converted_energy_data = self.energies\n        total_E_mean = np.nanmean(converted_energy_data, axis=0)\n        total_E_std = np.nanstd(converted_energy_data, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))\n
    "},{"location":"API/units.html","title":"UNITS","text":"

    Units conversion utilities module.

    Available Energy units

    [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\", \"mev\", \"ryd\"]

    Available Distance units

    [\"ang\", \"nm\", \"bohr\"]

    Available Force units

    Combinations between Energy and Distance units

    "},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"Conversion","text":"

    Conversion from one unit system to another defined by a name and a callable

    Source code in openqdc/utils/units.py
    class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n
    "},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"__init__(in_unit, out_unit, func)","text":"

    Parameters:

    Name Type Description Default in_unit str

    String defining the units of the current values

    required out_unit str

    String defining the target units

    required func Callable[[float], float]

    The callable to compute the conversion

    required Source code in openqdc/utils/units.py
    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"DistanceTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible distance units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"to(distance, fraction=False)","text":"

    Get the conversion function to convert the distance to the desired units.

    Parameters:

    Name Type Description Default distance DistanceTypeConversion

    distance unit to convert to

    required fraction bool

    whether it is distance^1 or distance^-1

    False

    Returns:

    Type Description Callable[[float], float]

    callable to convert the distance to the desired units

    Source code in openqdc/utils/units.py
    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"EnergyTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible energy units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"to(energy)","text":"

    Get the conversion function to convert the energy to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    Callable to convert the energy to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"ForceTypeConversion","text":"

    Bases: ConversionEnum

    Define the possible force units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit 
to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"to(energy, distance)","text":"

    Get the conversion function to convert the force to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required distance DistanceTypeConversion

    distance unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    callable to convert the force to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"check_file(path)","text":"

    Checks whether the file is present on the local filesystem

    Source code in openqdc/utils/io.py
    def check_file(path) -> bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n
    "},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"create_hdf5_file(hdf5_file_path)","text":"

    Creates hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n
    "},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"get_local_cache()","text":"

    Returns the local cache directory. It creates it if it does not exist.

    Returns:

    Name Type Description str str

    path to the local cache directory

    Source code in openqdc/utils/io.py
    def get_local_cache() -> str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n
    "},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"get_remote_cache(write_access=False)","text":"

    Returns the entry point based on the write access.

    Source code in openqdc/utils/io.py
    def get_remote_cache(write_access=False) -> str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n
    "},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"load_hdf5_file(hdf5_file_path)","text":"

    Loads hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n
    "},{"location":"API/utils.html#openqdc.utils.load_json","title":"load_json(path)","text":"

    Loads json file

    Source code in openqdc/utils/io.py
    def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"load_pkl(path, check=True)","text":"

    Load pkl file

    Source code in openqdc/utils/io.py
    def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.makedirs","title":"makedirs(path, exist_ok=True)","text":"

    Creates directory

    Source code in openqdc/utils/io.py
    def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n
    "},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)","text":"

    Extracts data from the HDF5 archive file.

    Source code in openqdc/utils/io.py
    def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -> List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n
    "},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"save_pkl(file, path)","text":"

    Saves pkl file

    Source code in openqdc/utils/io.py
    def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n
    "},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"set_cache_dir(d)","text":"

    Optionally set the _OPENQDC_CACHE_DIR directory.

    Parameters:

    Name Type Description Default d str

    path to a local folder.

    required Source code in openqdc/utils/io.py
    def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n
    "},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"Alchemy","text":"

    Bases: BaseDataset

    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.

    Usage:

    from openqdc.datasets import Alchemy\ndataset = Alchemy()\n

    Reference

    https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/

    Source code in openqdc/datasets/potential/alchemy.py
    class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", 
f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n
    "},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"ANI1","text":"

    Bases: BaseDataset

    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct elements: C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. For generating structures, SMILES strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized to an energy minimum using the chosen DFT level.

    Usage:

    from openqdc.datasets import ANI1\ndataset = ANI1()\n

    References

    https://www.nature.com/articles/sdata2017193

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, 
self.force_target_names)\n        return samples\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"ANI1CCX","text":"

    Bases: ANI1

    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.

    Usage:

    from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        return x.decode(\"ascii\")\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"ANI1CCX_V2","text":"

    Bases: ANI1CCX

    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each conformation.

    Usage:

    from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"ANI1X","text":"

    Bases: ANI1

    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the following techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.

    Usage:

    from openqdc.datasets import ANI1X\ndataset = ANI1X()\n

    References

    https://doi.org/10.1063/1.5023802

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the 
Dataset error\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"ANI2X","text":"

    Bases: ANI1

    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as in ANI-1X are used for generating geometries.

    Usage:

    from openqdc.datasets import ANI2X\ndataset = ANI2X()\n

    References

    https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  
# noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n            raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n
    "},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"COMP6","text":"

    Bases: BaseDataset

    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and molecular dipoles.

    Details of the benchmark sets are as follows

    S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and

    mixed influence interactions.

    ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n

    dynamics with a 0.25 fs time step at 300 K using the Langevin thermostat for 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1 ns trajectory is selected, and reference DFT single-point calculations are performed to calculate energies and forces.

    GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n

    The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally, diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.

    GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n

    and 1000 molecules per 12 and 13 heavy atoms are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.

    Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n

    Structures are optimized similar to GDB7to9.

    Usage:

    from openqdc.datasets import COMP6\ndataset = COMP6()\n

    References

    https://aip.scitation.org/doi/abs/10.1063/1.5023802

    https://github.com/isayev/COMP6

    S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d

    GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/

    GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/

    DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h

    Source code in openqdc/datasets/potential/comp6.py
    class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/comp6.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"DES370K","text":"

    Bases: BaseInteractionDataset, IDES

    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at the CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. Dimer geometries are generated using QM-based optimization at the DF-LMP2/aVDZ level of theory and MD-based sampling from condensed-phase MD simulations.

    Usage:

    from openqdc.datasets import DES370K\ndataset = DES370K()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        
InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -> List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return data\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"DES5M","text":"

    Bases: DES370K

    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to DES370K, using QM-based optimization and MD simulations.

    Usage:

    from openqdc.datasets import DES5M\ndataset = DES5M()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n 
       \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"DESS66","text":"

    Bases: DES370K

    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66\ndataset = DESS66()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    S66: https://pubs.acs.org/doi/10.1021/ct2002946

    Source code in openqdc/datasets/interaction/des.py
    class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"DESS66x8","text":"

    Bases: DESS66

    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n
    "},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"GDML","text":"

    Bases: BaseDataset

    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene (627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin (211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method.

    The dataset consists of the following trajectories

    Benzene: 627000 samples

    Uracil: 133000 samples

    Naphthalene: 326000 samples

    Aspirin: 211000 samples

    Salicylic Acid: 320000 samples

    Malonaldehyde: 993000 samples

    Ethanol: 555000 samples

    Toluene: 100000 samples

    Usage:

    from openqdc.datasets import GDML\ndataset = GDML()\n

    References

    https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets

    Source code in openqdc/datasets/potential/gdml.py
    class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n 
   __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        \"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/geom.html","title":"GEOM","text":"

    Bases: BaseDataset

    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.

    Usage:

    from openqdc.datasets import GEOM\ndataset = GEOM()\n

    References

    https://www.nature.com/articles/s41597-022-01288-4

    https://github.com/learningmatter-mit/geom

    CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d

    Source code in openqdc/datasets/potential/geom.py
    class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n
    "},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"ISO17","text":"

    Bases: BaseDataset

    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.

    Usage:

    from openqdc.datasets import ISO17\ndataset = ISO17()\n

    References

    https://arxiv.org/abs/1706.08566

    https://arxiv.org/abs/1609.08259

    https://www.nature.com/articles/sdata201422

    https://pubmed.ncbi.nlm.nih.gov/10062328/

    https://pubmed.ncbi.nlm.nih.gov/19257665/

    Source code in openqdc/datasets/potential/iso_17.py
    class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n     
   return samples\n
    "},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/iso_17.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"L7","text":"

    Bases: YamlDataset

    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.

    Usage:

    from openqdc.datasets import L7\ndataset = L7()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct400036b

    Source code in openqdc/datasets/interaction/l7.py
    class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n
    "},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"MD22","text":"

    Bases: RevMD17

    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD level of theory.

    Usage:

    from openqdc.datasets import MD22\ndataset = MD22()\n

    Reference

    https://arxiv.org/abs/2209.14865

    Source code in openqdc/datasets/potential/md22.py
    class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"Metcalf","text":"

    Bases: BaseInteractionDataset

    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various components.

    Usage:

    from openqdc.datasets import Metcalf\ndataset = Metcalf()\n

    Reference

    https://doi.org/10.1063/1.5142636

    Source code in openqdc/datasets/interaction/metcalf.py
    class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -> List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n
    "},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"Molecule3D","text":"

    Bases: BaseDataset

    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.

    Usage:

    from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n

    References

    https://arxiv.org/abs/2110.01717

    https://github.com/divelab/MoleculeX

    Source code in openqdc/datasets/potential/molecule3d.py
    class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"read_mol(mol, energy)","text":"

    Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"

    mol (Chem.rdchem.Mol): RDKit molecule. energy (float): Energy of the molecule.

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"

    res (dict): Dictionary containing the following keys: - name: np.ndarray of shape (N,) containing the SMILES of the molecule - atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions - energies: np.ndarray of shape (1,) containing the energy of the conformer - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer - subset: np.ndarray of shape (1) containing \"molecule3d\"

    Source code in openqdc/datasets/potential/molecule3d.py
    def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"MultixcQM9","text":"

    Bases: BaseDataset

    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods. It consists of 133K QM9 molecule geometries evaluated with 76 different DFT functionals and three different basis sets, resulting in 228 energy values for each molecule, along with energies from the semi-empirical method GFN2-xTB. The molecular geometries are taken directly from Kim et al., who use the G4MP2 method.

    Usage:

    from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n

    References

    https://www.nature.com/articles/s41597-023-02690-2

    https://github.com/chemsurajit/largeDFTdata

    https://www.nature.com/articles/s41597-019-0121-7

    Source code in openqdc/datasets/potential/multixcqm9.py
    class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n      
  PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        
PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n        PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        
PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n        PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n        PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        
PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        \"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        
\"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n        \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        \"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        
\"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        \"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        \"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        
\"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        \"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": \"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": 
None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n
    "},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"NablaDFT","text":"

    Bases: BaseDataset

    NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations is generated using RDKit. Second, the Butina clustering method is applied to these conformations: clusters that cover 95% of the conformations are selected, and the centroids of those clusters form the final set. This results in 1-62 conformations per molecule. To generate quantum properties, the Kohn-Sham method at the wB97X-D/def2-SVP level of theory is used to compute the energy.

    Usage:

    from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n

    References

    https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D

    https://github.com/AIRI-Institute/nablaDFT

    Source code in openqdc/datasets/potential/nabladft.py
    class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = 
df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, [])\n
    "},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"OrbnetDenali","text":"

    Bases: BaseDataset

    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, starting from the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K, both at the GFN1-xTB level of theory. Energies are calculated with the DFT method wB97X-D3/def2-TZVP and the semi-empirical method GFN1-xTB.

    Usage:

    from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n

    References

    https://arxiv.org/abs/2107.00299

    https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867

    Source code in openqdc/datasets/potential/orbnet_denali.py
    class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: 
group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"PCQM_B3LYP","text":"

    Bases: PCQM_PM6

    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to biomolecules. The geometries of the molecules are optimized using PM6. Using the optimized geometry, the electronic structure and properties are calculated using the B3LYP/6-31G* method.

    Usage:

    from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n

    References

    https://arxiv.org/abs/2305.18454

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n
    "},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"PCQM_PM6","text":"

    Bases: BaseDataset

    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.

    Usage:

    from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n

    References

    https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) > 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, 
n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in 
[\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n
    "},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"MDDataset","text":"

    Bases: ProteinFragments

    MDDataset is a subset of the proteinfragments dataset that was generated from molecular dynamics simulations with their model. The sampling was done with Molecular Dynamics at room temperature (300 K) in various solvent phases:

    Subsets

    Polyalanine: All the polyalanine peptides are sampled in the gas phase. AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group and a protonated lysine residue at the C-terminus; AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group and a C-terminal N-methyl amide group.

    Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)

    Usage:

    from openqdc.datasets import MDDataset\ndataset = MDDataset()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    }\n
    "},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"ProteinFragments","text":"

    Bases: BaseDataset

    ProteinFragments is a dataset constructed from a subset of the data that was generated using a top-down and a bottom-up approach:

    Top-down

    Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method from conventional FF at room temperature.

    Bottom-up

    Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragment was done with MD simulations at high temperatures or normal mode sampling.

    Usage:

    from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n      
  for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n
    "},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"QM1B","text":"

    Bases: BaseDataset

    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.

    Usage:

    from openqdc.datasets import QM1B\ndataset = QM1B()\n

    References

    https://arxiv.org/pdf/2311.01135

    https://github.com/graphcore-research/qm1b-dataset/

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = 
partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n
    "},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"QM1B_SMALL","text":"

    Bases: QM1B

    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

    Usage:

    from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n
    "},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"QM7X","text":"

    Bases: BaseDataset

    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed with the PBE0-MBD and DFTB3-MBD methods.

    Usage:

    from openqdc.datasets import QM7X\ndataset = QM7X()\n

    References

    https://arxiv.org/abs/2006.15139

    https://zenodo.org/records/4288677

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                
read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n
    "},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"QM7X_V2","text":"

    Bases: QM7X

    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.

    Usage:

    from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n    __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n
    "},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"QMugs","text":"

    Bases: BaseDataset

    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometry, the atomic and molecular properties are calculated using both a semi-empirical method (GFN2-xTB) and a DFT method (\u03c9B97X-D/def2-SVP).

    Usage:

    from openqdc.datasets import QMugs\ndataset = QMugs()\n

    References

    https://arxiv.org/abs/2107.00367

    https://www.nature.com/articles/s41597-022-01390-7#ethics

    https://www.research-collection.ethz.ch/handle/20.500.11850/482129

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n
    "},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"QMugs_V2","text":"

    Bases: QMugs

    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.

    Usage:

    from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n
    "},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"QM7","text":"

    Bases: QMX

    QM7 is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, H]

    Usage:

    from openqdc.datasets import QM7\ndataset = QM7()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        \"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        
\"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"QM7b","text":"

    Bases: QMX

    QM7b is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, Cl, H]

    Usage:

    from openqdc.datasets import QM7b\ndataset = QM7b()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = \"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        
\"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"QM8","text":"

    Bases: QMX

    QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (an increase in energy from the ground state) of small molecules up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theories (TDDFT) and second-order approximate coupled-cluster (CC2). The molecular conformations are relaxed geometries computed using the DFT B3LYP with basis set 6-31G(2df,p). For more information about the sampling, check QM9 dataset.

    Usage:

    from openqdc.datasets import QM8\ndataset = QM8()\n

    References

    https://arxiv.org/pdf/1504.01966

    Source code in openqdc/datasets/potential/qmx.py
    class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n        
            energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"QM9","text":"

    Bases: QMX

    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database, containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with the quantum mechanical method B3LYP.

    Usage:

    from openqdc.datasets import QM9\ndataset = QM9()\n

    Reference

    https://www.nature.com/articles/sdata201422

    Source code in openqdc/datasets/potential/qmx.py
    class QM9(QMX):\n    \"\"\"\n    QM7b is a dataset constructed containing 134k molecules from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LUP/6-31G(2df,p)\n    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"QMX","text":"

    Bases: ABC, BaseDataset

    QMX dataset base abstract class

    Source code in openqdc/datasets/potential/qmx.py
    class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n
    "},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"RevMD17","text":"

    Bases: BaseDataset

    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT integration grid. The dataset contains the following molecules: Benzene: 627000 samples

    Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n

    Usage:

    from openqdc.datasets import RevMD17\ndataset = RevMD17()\n

    References

    https://arxiv.org/abs/2007.09593

    Source code in openqdc/datasets/potential/revmd17.py
    class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tigh SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"SN2RXN","text":"

    Bases: BaseDataset

    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.

    Usage:

    from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605341

    Source code in openqdc/datasets/potential/sn2_rxn.py
    class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"SolvatedPeptides","text":"

    Bases: BaseDataset

    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.

    Usage:

    from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605372

    Source code in openqdc/datasets/potential/solvated_peptides.py
    class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    Utility function to convert a string to SMILES; useful when the SMILES is encoded in a different format than its display format.

    Source code in openqdc/datasets/potential/solvated_peptides.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"Spice","text":"

    Bases: BaseDataset

    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

    Usage:

    from openqdc.datasets import Spice\ndataset = Spice()\n

    References

    https://arxiv.org/abs/2209.10702

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": \"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset 
v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"SpiceV2","text":"

    Bases: Spice

    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules. The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.

    Usage:

    from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem 
Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"SpiceVL2","text":"

    Bases: SpiceV2

    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.

    Usage:

    from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"read_record(r, obj)","text":"

    Read a record from an HDF5 file. Parameters: r is the HDF5 record; obj is the Spice class object used to look up the subset mapping and the energy/force target names.

    Source code in openqdc/datasets/potential/spice.py
    def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"Splinter","text":"

    Bases: BaseInteractionDataset

    Splinter consists of 30,416 dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and the various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.

    Usage:

    from openqdc.datasets import Splinter\ndataset = Splinter()\n

    Reference

    https://doi.org/10.1038/s41597-023-02443-1

    Source code in openqdc/datasets/interaction/splinter.py
    class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # 
\"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": \"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -> List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n           
                 theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n               
         n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n
    "},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"TMQM","text":"

    Bases: BaseDataset

    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SVP DFT level of theory. Structures are first extracted from the Cambridge Structural Database and then optimized in the gas phase with the extended tight-binding GFN2-xTB method.

    Usage:

    from openqdc.datasets import TMQM\ndataset = TMQM()\n

    References

    https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041

    https://github.com/bbskjelstad/tmqm

    Source code in openqdc/datasets/potential/tmqm.py
    class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n            data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"Transition1X","text":"

    Bases: BaseDataset

    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) with DFT.

    Usage:

    from openqdc.datasets import Transition1X\ndataset = Transition1X()\n

    References: - https://www.nature.com/articles/s41597-022-01870-w

    Source code in openqdc/datasets/potential/transition1x.py
    class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n
    "},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"VQM24","text":"

    Bases: BaseDataset

    The Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at the wB97x-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB and relaxed with the DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with the same DFT method, wB97x-D3/cc-pVDZ.

    Usage:

    from openqdc.datasets import VQM24\ndataset = VQM24()\n

    Reference

    https://arxiv.org/abs/2405.05961

    Source code in openqdc/datasets/potential/vqm24.py
    class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n
    "},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"SCANWaterClusters","text":"

    Bases: BaseDataset

    The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics simulations using AMBER 9 and optimized; the lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.

    Chemical Species

    [H, O, Li, Na, K, F, Cl, Br]

    Usage:

    from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n

    References

    https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec

    https://github.com/esoteric-ephemera/water_cluster_density_errors

    Source code in openqdc/datasets/potential/waterclusters.py
    class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 
level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n
    "},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"WaterClusters","text":"

    Bases: BaseDataset

    The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 million structures. Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.

    Chemical Species

    [\"H\", \"O\"]

    Usage:

    from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n

    References

    https://doi.org/10.1063/1.5128378

    https://sites.uw.edu/wdbase/database-of-water-clusters/

    Source code in openqdc/datasets/potential/waterclusters3_30.py
    class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"X40","text":"

    Bases: YamlDataset

    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules where the halogens participate in various interaction types such as electrostatic interactions, London dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic molecules. For each complex, 10 geometries are generated, resulting in 400 geometries in the dataset. The geometries are optimized using the MP2 level of theory with the cc-pVTZ basis set, whereas the interaction energies are computed with the CCSD(T)/CBS level of theory.

    Usage:

    from openqdc.datasets import X40\ndataset = X40()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct300647k

    Source code in openqdc/datasets/interaction/x40.py
    class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n
    "},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied!
    from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n
    from openqdc.datasets import Spice ds = Spice( energy_unit=\"kcal/mol\", distance_unit=\"ang\", )
    2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype <U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype <U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n
    In\u00a0[39]: Copied!
    ds[0]\n
    ds[0] Out[39]:
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  
-312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526 
 ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[40]: Copied!
    ds.get_ase_atoms(0)\n
    ds.get_ase_atoms(0) Out[40]:
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)
    In\u00a0[53]: Copied!
    ds.get_ase_atoms(0).info\n
    ds.get_ase_atoms(0).info Out[53]:
    {'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 
16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[41]: Copied!
    for i in ds.as_iter():\n    print(i)\n    break\n
    for i in ds.as_iter(): print(i) break
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ -2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  
],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n
    In\u00a0[42]: Copied!
    for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n
    for i in ds.as_iter(atoms=True): print(i) break
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n
    In\u00a0[43]: Copied!
    from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n
    from openqdc.methods import QmMethod # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]:
    {('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}
    In\u00a0[44]: Copied!
    # Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n
    # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]:
    array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])
    In\u00a0[45]: Copied!
    import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n
    import matplotlib.pyplot as plt from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])
    100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01<00:00, 459.21it/s]\n
    In\u00a0[46]: Copied!
    plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n
    plt.scatter( embedding[:, 0], embedding[:, 1], c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar() Out[46]:
    <matplotlib.colorbar.Colorbar at 0x1554aa7bd820>
    "},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"

    If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. Change of units is done automatically upon loading based on the units of the dataset.

    Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    Supported distance units: [\"ang\", \"nm\", \"bohr\"]

    "},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"

    The dictionary of the item contains different important keys:

    "},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"

    The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as ase.Atoms objects. Otherwise, it returns the dictionary of each item.

    "},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"

    The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.

    $U(A_1, A_2, ...) = \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$

    The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:

    "},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"

    OpenQDC offers a simple way to calculate the Smooth Overlap of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The soap_descriptors method returns the SOAP descriptors for the molecules in the dataset.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"

    OpenQDC is a python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets and provide a way to standardize the data for easy use in machine learning models.

    Visit our website at https://openqdc.io .

    "},{"location":"index.html#installation","title":"Installation","text":"

    Use conda:

    conda install -c conda-forge openqdc\n

    Tips: You can replace conda by mamba.

    Note: We highly recommend using a Conda Python distribution to install OpenQDC. The package is also pip installable if you need it: pip install openqdc.

    "},{"location":"index.html#quick-api-tour","title":"Quick API Tour","text":"
    from openqdc.datasets import Spice\n\n# Load the original dataset\ndataset = Spice()\n\n# Load the dataset with different units\ndataset = Spice(\n    energy_unit = \"kcal/mol\",\n    distance_unit = \"ang\",\n    energy_type = \"formation\",\n    array_format = \"torch\"\n)\n\n# Access the data\ndata = dataset[0]\n\n# Get relevant statistics\ndataset.get_statistics()\n\n# Get dataset metadata\ndataset.average_n_atoms\ndataset.chemical_species\ndataset.charges\n\n# Compute physical descriptors\ndataset.calculate_descriptors(\n    descriptor_name = \"soap\"\n)\n
    "},{"location":"index.html#how-to-cite","title":"How to cite","text":"

    Please cite OpenQDC if you use it in your research: .

    "},{"location":"index.html#compatibilities","title":"Compatibilities","text":"

    OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.

    "},{"location":"cli.html","title":"CLI for dataset downloading and uploading","text":"

    You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).

    "},{"location":"cli.html#datasets","title":"Datasets","text":"

    Print a formatted table of the available openQDC datasets and some information about them.

    Usage:

    openqdc datasets [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#cache","title":"Cache","text":"

    Get the current local cache path of openQDC

    Usage:

    openqdc cache [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n
    "},{"location":"cli.html#download","title":"Download","text":"

    Download preprocessed ml-ready datasets from the main openQDC hub.

    Usage:

    openqdc download DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n--as-zarr       Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]\n--gs            Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]\n

    Example:

    openqdc download Spice\n
    "},{"location":"cli.html#fetch","title":"Fetch","text":"

    Download the raw datasets files from the main openQDC hub

    Note:

    Special case: if the dataset is \"all\", \"potential\", \"interaction\".\n

    Usage:

    openqdc fetch DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]\n--cache-dir     Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]\n

    Example:

    openqdc fetch Spice\n
    "},{"location":"cli.html#preprocess","title":"Preprocess","text":"

    Preprocess a raw dataset (previously fetched) into an OpenQDC dataset and optionally push it to the remote storage.

    Usage:

    openqdc preprocess DATASETS... [OPTIONS]\n

    Options:

    --help         Show this message and exit.\n--overwrite    Whether to overwrite the current cached datasets. [default: overwrite]\n--upload       Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]\n--as-zarr      Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]\n

    Example:

    openqdc preprocess Spice QMugs\n
    "},{"location":"cli.html#upload","title":"Upload","text":"

    Upload a preprocessed dataset to the remote storage

    Usage:

    openqdc upload DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the remote files if they are present. [default: overwrite]\n--as-zarr       Whether to upload the zarr files if available. [default: no-as-zarr]\n

    Example:

    openqdc upload Spice --overwrite\n
    "},{"location":"cli.html#convert","title":"Convert","text":"

    Convert a preprocessed dataset from a memmap dataset to a zarr dataset.

    Usage:

    openqdc convert DATASETS... [OPTIONS]\n

    Options:

    --help          Show this message and exit.\n--overwrite     Whether to overwrite the current zarr cached datasets. [default: no-overwrite]\n--download      Whether to force the re-download of the memmap datasets. [default: no-download]\n
    "},{"location":"contribute.html","title":"Contribute","text":"

    The below documents the development lifecycle of OpenQDC.

    "},{"location":"contribute.html#setup-a-dev-environment","title":"Setup a dev environment","text":"
    mamba env create -n openqdc -f env.yml\nmamba activate openqdc\npip install -e .\n
    "},{"location":"contribute.html#pre-commit-installation","title":"Pre commit installation","text":"
    pre-commit install\npre-commit run --all-files\n
    "},{"location":"contribute.html#continuous-integration","title":"Continuous Integration","text":"

    OpenQDC uses Github Actions to:

    "},{"location":"contribute.html#run-tests","title":"Run tests","text":"
    pytest\n
    "},{"location":"contribute.html#build-the-documentation","title":"Build the documentation","text":"

    You can build and serve the documentation locally with:

    # Build and serve the doc\nmike serve\n

    or with

    mkdocs serve\n
    "},{"location":"contribute.html#multi-versionning","title":"Multi-versionning","text":"

    The documentation is built for each push on main and for every git tag using mike. Everything is automated using Github Actions. Please refer to the official mike documentation for the details.

    "},{"location":"data_storage.html","title":"Data structure","text":""},{"location":"data_storage.html#dataset-structure","title":"Dataset structure","text":"

    For a dataset with N geometries, M atoms across all geometries, ne energy labels, and nf force labels, we use zarr or memory-mapped arrays of various sizes:

    The memory-mapped files efficiently access data stored on disk or in the cloud without reading them into memory, enabling training on machines with smaller RAM than the dataset size and accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, batching and iteration.

    "},{"location":"data_storage.html#formats","title":"Formats","text":"

    We currently support the following formats:

    1) Zarr : https://zarr.readthedocs.io/en/stable/index.html

    2) Memmap : https://numpy.org/doc/stable/index.html

    "},{"location":"dataset_upload.html","title":"How to Add a Dataset to OpenQDC","text":"

    Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:

    1. Opening a PR to add a new dataset
    2. Request a new dataset through Google Form
    "},{"location":"dataset_upload.html#openqdc-pr-guidelines","title":"OpenQDC PR Guidelines","text":"

    Implement your dataset in the OpenQDC repository by following the guidelines below:

    "},{"location":"dataset_upload.html#dataset-class","title":"Dataset class","text":""},{"location":"dataset_upload.html#test-the-dataset","title":"Test the dataset","text":"

    Try to run the openQDC CLI pipeline with the dataset you implemented.

    Run the following command to download the dataset:

    If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.

    Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.

    "},{"location":"dataset_upload.html#openqdc-google-form","title":"OpenQDC Google Form","text":"

    Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you. You can fill out the Google Form here

    As the openQDC team will strive to provide a high quality curation and upload, please be patient as the team will need to review the dataset and carry out the necessary steps to ensure the dataset is uploaded correctly.

    "},{"location":"datasets.html","title":"Overview of Datasets","text":"

    We provide support for the following publicly available QM Datasets.

    Dataset # Molecules # Conformers Average Conformers per Molecule Force Labels Atom Types QM Level of Theory Off-Equilibrium Conformations GEOM 450,000 37,000,000 82 No 18 GFN2-xTB No Molecule3D 3,899,647 3,899,647 1 No 5 B3LYP/6-31G* No NablaDFT 1,000,000 5,000,000 5 No 6 \u03c9B97X-D/def2-SVP QMugs 665,000 2,000,000 3 No 10 GFN2-xTB, \u03c9B97X-D/def2-SVP No Spice 19,238 1,132,808 59 Yes 15 \u03c9B97M-D3(BJ)/def2-TZVPPD Yes ANI 57,462 20,000,000 348 No 4 \u03c9B97x:6-31G(d) Yes tmQM 86,665 No TPSSh-D3BJ/def2-SVP DES370K 3,700 370,000 100 No 20 CCSD(T) Yes DES5M 3,700 5,000,000 1351 No 20 SNS-MP2 Yes OrbNet Denali 212,905 2,300,000 11 No 16 GFN1-xTB Yes SN2RXN 39 452709 11,600 Yes 6 DSD-BLYP-D3(BJ)/def2-TZVP QM7X 6,950 4,195,237 603 Yes 7 PBE0+MBD Yes"},{"location":"licensing.html","title":"License","text":"
    Creative Commons Attribution-NonCommercial 4.0 International\n\nCreative Commons Corporation (\"Creative Commons\") is not a law firm and\ndoes not provide legal services or legal advice. Distribution of\nCreative Commons public licenses does not create a lawyer-client or\nother relationship. Creative Commons makes its licenses and related\ninformation available on an \"as-is\" basis. Creative Commons gives no\nwarranties regarding its licenses, any material licensed under their\nterms and conditions, or any related information. Creative Commons\ndisclaims all liability for damages resulting from their use to the\nfullest extent possible.\n\nUsing Creative Commons Public Licenses\n\nCreative Commons public licenses provide a standard set of terms and\nconditions that creators and other rights holders may use to share\noriginal works of authorship and other material subject to copyright and\ncertain other rights specified in the public license below. The\nfollowing considerations are for informational purposes only, are not\nexhaustive, and do not form part of our licenses.\n\n-   Considerations for licensors: Our public licenses are intended for\n    use by those authorized to give the public permission to use\n    material in ways otherwise restricted by copyright and certain other\n    rights. Our licenses are irrevocable. Licensors should read and\n    understand the terms and conditions of the license they choose\n    before applying it. Licensors should also secure all rights\n    necessary before applying our licenses so that the public can reuse\n    the material as expected. Licensors should clearly mark any material\n    not subject to the license. This includes other CC-licensed\n    material, or material used under an exception or limitation to\n    copyright. 
More considerations for licensors :\n    wiki.creativecommons.org/Considerations\\_for\\_licensors\n\n-   Considerations for the public: By using one of our public licenses,\n    a licensor grants the public permission to use the licensed material\n    under specified terms and conditions. If the licensor's permission\n    is not necessary for any reason\u2013for example, because of any\n    applicable exception or limitation to copyright\u2013then that use is not\n    regulated by the license. Our licenses grant only permissions under\n    copyright and certain other rights that a licensor has authority to\n    grant. Use of the licensed material may still be restricted for\n    other reasons, including because others have copyright or other\n    rights in the material. A licensor may make special requests, such\n    as asking that all changes be marked or described. Although not\n    required by our licenses, you are encouraged to respect those\n    requests where reasonable. More considerations for the public :\n    wiki.creativecommons.org/Considerations\\_for\\_licensees\n\nCreative Commons Attribution-NonCommercial 4.0 International Public\nLicense\n\nBy exercising the Licensed Rights (defined below), You accept and agree\nto be bound by the terms and conditions of this Creative Commons\nAttribution-NonCommercial 4.0 International Public License (\"Public\nLicense\"). To the extent this Public License may be interpreted as a\ncontract, You are granted the Licensed Rights in consideration of Your\nacceptance of these terms and conditions, and the Licensor grants You\nsuch rights in consideration of benefits the Licensor receives from\nmaking the Licensed Material available under these terms and conditions.\n\n-   Section 1 \u2013 Definitions.\n\n    -   a. 
Adapted Material means material subject to Copyright and\n        Similar Rights that is derived from or based upon the Licensed\n        Material and in which the Licensed Material is translated,\n        altered, arranged, transformed, or otherwise modified in a\n        manner requiring permission under the Copyright and Similar\n        Rights held by the Licensor. For purposes of this Public\n        License, where the Licensed Material is a musical work,\n        performance, or sound recording, Adapted Material is always\n        produced where the Licensed Material is synched in timed\n        relation with a moving image.\n    -   b. Adapter's License means the license You apply to Your\n        Copyright and Similar Rights in Your contributions to Adapted\n        Material in accordance with the terms and conditions of this\n        Public License.\n    -   c. Copyright and Similar Rights means copyright and/or similar\n        rights closely related to copyright including, without\n        limitation, performance, broadcast, sound recording, and Sui\n        Generis Database Rights, without regard to how the rights are\n        labeled or categorized. For purposes of this Public License, the\n        rights specified in Section 2(b)(1)-(2) are not Copyright and\n        Similar Rights.\n    -   d. Effective Technological Measures means those measures that,\n        in the absence of proper authority, may not be circumvented\n        under laws fulfilling obligations under Article 11 of the WIPO\n        Copyright Treaty adopted on December 20, 1996, and/or similar\n        international agreements.\n    -   e. Exceptions and Limitations means fair use, fair dealing,\n        and/or any other exception or limitation to Copyright and\n        Similar Rights that applies to Your use of the Licensed\n        Material.\n    -   f. 
Licensed Material means the artistic or literary work,\n        database, or other material to which the Licensor applied this\n        Public License.\n    -   g. Licensed Rights means the rights granted to You subject to\n        the terms and conditions of this Public License, which are\n        limited to all Copyright and Similar Rights that apply to Your\n        use of the Licensed Material and that the Licensor has authority\n        to license.\n    -   h. Licensor means the individual(s) or entity(ies) granting\n        rights under this Public License.\n    -   i. NonCommercial means not primarily intended for or directed\n        towards commercial advantage or monetary compensation. For\n        purposes of this Public License, the exchange of the Licensed\n        Material for other material subject to Copyright and Similar\n        Rights by digital file-sharing or similar means is NonCommercial\n        provided there is no payment of monetary compensation in\n        connection with the exchange.\n    -   j. Share means to provide material to the public by any means or\n        process that requires permission under the Licensed Rights, such\n        as reproduction, public display, public performance,\n        distribution, dissemination, communication, or importation, and\n        to make material available to the public including in ways that\n        members of the public may access the material from a place and\n        at a time individually chosen by them.\n    -   k. Sui Generis Database Rights means rights other than copyright\n        resulting from Directive 96/9/EC of the European Parliament and\n        of the Council of 11 March 1996 on the legal protection of\n        databases, as amended and/or succeeded, as well as other\n        essentially equivalent rights anywhere in the world.\n    -   l. You means the individual or entity exercising the Licensed\n        Rights under this Public License. 
Your has a corresponding\n        meaning.\n\n-   Section 2 \u2013 Scope.\n\n    -   a. License grant.\n        -   1. Subject to the terms and conditions of this Public\n            License, the Licensor hereby grants You a worldwide,\n            royalty-free, non-sublicensable, non-exclusive, irrevocable\n            license to exercise the Licensed Rights in the Licensed\n            Material to:\n            -   A. reproduce and Share the Licensed Material, in whole\n                or in part, for NonCommercial purposes only; and\n            -   B. produce, reproduce, and Share Adapted Material for\n                NonCommercial purposes only.\n        -   2. Exceptions and Limitations. For the avoidance of doubt,\n            where Exceptions and Limitations apply to Your use, this\n            Public License does not apply, and You do not need to comply\n            with its terms and conditions.\n        -   3. Term. The term of this Public License is specified in\n            Section 6(a).\n        -   4. Media and formats; technical modifications allowed. The\n            Licensor authorizes You to exercise the Licensed Rights in\n            all media and formats whether now known or hereafter\n            created, and to make technical modifications necessary to do\n            so. The Licensor waives and/or agrees not to assert any\n            right or authority to forbid You from making technical\n            modifications necessary to exercise the Licensed Rights,\n            including technical modifications necessary to circumvent\n            Effective Technological Measures. For purposes of this\n            Public License, simply making modifications authorized by\n            this Section 2(a)(4) never produces Adapted Material.\n        -   5. Downstream recipients.\n            -   A. Offer from the Licensor \u2013 Licensed Material. 
Every\n                recipient of the Licensed Material automatically\n                receives an offer from the Licensor to exercise the\n                Licensed Rights under the terms and conditions of this\n                Public License.\n            -   B. No downstream restrictions. You may not offer or\n                impose any additional or different terms or conditions\n                on, or apply any Effective Technological Measures to,\n                the Licensed Material if doing so restricts exercise of\n                the Licensed Rights by any recipient of the Licensed\n                Material.\n        -   6. No endorsement. Nothing in this Public License\n            constitutes or may be construed as permission to assert or\n            imply that You are, or that Your use of the Licensed\n            Material is, connected with, or sponsored, endorsed, or\n            granted official status by, the Licensor or others\n            designated to receive attribution as provided in Section\n            3(a)(1)(A)(i).\n    -   b. Other rights.\n        -   1. Moral rights, such as the right of integrity, are not\n            licensed under this Public License, nor are publicity,\n            privacy, and/or other similar personality rights; however,\n            to the extent possible, the Licensor waives and/or agrees\n            not to assert any such rights held by the Licensor to the\n            limited extent necessary to allow You to exercise the\n            Licensed Rights, but not otherwise.\n        -   2. Patent and trademark rights are not licensed under this\n            Public License.\n        -   3. To the extent possible, the Licensor waives any right to\n            collect royalties from You for the exercise of the Licensed\n            Rights, whether directly or through a collecting society\n            under any voluntary or waivable statutory or compulsory\n            licensing scheme. 
In all other cases the Licensor expressly\n            reserves any right to collect such royalties, including when\n            the Licensed Material is used other than for NonCommercial\n            purposes.\n\n-   Section 3 \u2013 License Conditions.\n\n    Your exercise of the Licensed Rights is expressly made subject to\n    the following conditions.\n\n    -   a. Attribution.\n        -   1. If You Share the Licensed Material (including in modified\n            form), You must:\n            -   A. retain the following if it is supplied by the\n                Licensor with the Licensed Material:\n                -   i. identification of the creator(s) of the Licensed\n                    Material and any others designated to receive\n                    attribution, in any reasonable manner requested by\n                    the Licensor (including by pseudonym if designated);\n                -   ii. a copyright notice;\n                -   iii. a notice that refers to this Public License;\n                -   iv. a notice that refers to the disclaimer of\n                    warranties;\n                -   v. a URI or hyperlink to the Licensed Material to\n                    the extent reasonably practicable;\n            -   B. indicate if You modified the Licensed Material and\n                retain an indication of any previous modifications; and\n            -   C. indicate the Licensed Material is licensed under this\n                Public License, and include the text of, or the URI or\n                hyperlink to, this Public License.\n        -   2. You may satisfy the conditions in Section 3(a)(1) in any\n            reasonable manner based on the medium, means, and context in\n            which You Share the Licensed Material. For example, it may\n            be reasonable to satisfy the conditions by providing a URI\n            or hyperlink to a resource that includes the required\n            information.\n        -   3. 
If requested by the Licensor, You must remove any of the\n            information required by Section 3(a)(1)(A) to the extent\n            reasonably practicable.\n        -   4. If You Share Adapted Material You produce, the Adapter's\n            License You apply must not prevent recipients of the Adapted\n            Material from complying with this Public License.\n\n-   Section 4 \u2013 Sui Generis Database Rights.\n\n    Where the Licensed Rights include Sui Generis Database Rights that\n    apply to Your use of the Licensed Material:\n\n    -   a. for the avoidance of doubt, Section 2(a)(1) grants You the\n        right to extract, reuse, reproduce, and Share all or a\n        substantial portion of the contents of the database for\n        NonCommercial purposes only;\n    -   b. if You include all or a substantial portion of the database\n        contents in a database in which You have Sui Generis Database\n        Rights, then the database in which You have Sui Generis Database\n        Rights (but not its individual contents) is Adapted Material;\n        and\n    -   c. You must comply with the conditions in Section 3(a) if You\n        Share all or a substantial portion of the contents of the\n        database.\n\n    For the avoidance of doubt, this Section 4 supplements and does not\n    replace Your obligations under this Public License where the\n    Licensed Rights include other Copyright and Similar Rights.\n\n-   Section 5 \u2013 Disclaimer of Warranties and Limitation of Liability.\n\n    -   a. Unless otherwise separately undertaken by the Licensor, to\n        the extent possible, the Licensor offers the Licensed Material\n        as-is and as-available, and makes no representations or\n        warranties of any kind concerning the Licensed Material, whether\n        express, implied, statutory, or other. 
This includes, without\n        limitation, warranties of title, merchantability, fitness for a\n        particular purpose, non-infringement, absence of latent or other\n        defects, accuracy, or the presence or absence of errors, whether\n        or not known or discoverable. Where disclaimers of warranties\n        are not allowed in full or in part, this disclaimer may not\n        apply to You.\n    -   b. To the extent possible, in no event will the Licensor be\n        liable to You on any legal theory (including, without\n        limitation, negligence) or otherwise for any direct, special,\n        indirect, incidental, consequential, punitive, exemplary, or\n        other losses, costs, expenses, or damages arising out of this\n        Public License or use of the Licensed Material, even if the\n        Licensor has been advised of the possibility of such losses,\n        costs, expenses, or damages. Where a limitation of liability is\n        not allowed in full or in part, this limitation may not apply to\n        You.\n    -   c. The disclaimer of warranties and limitation of liability\n        provided above shall be interpreted in a manner that, to the\n        extent possible, most closely approximates an absolute\n        disclaimer and waiver of all liability.\n\n-   Section 6 \u2013 Term and Termination.\n\n    -   a. This Public License applies for the term of the Copyright and\n        Similar Rights licensed here. However, if You fail to comply\n        with this Public License, then Your rights under this Public\n        License terminate automatically.\n    -   b. Where Your right to use the Licensed Material has terminated\n        under Section 6(a), it reinstates:\n\n        -   1. automatically as of the date the violation is cured,\n            provided it is cured within 30 days of Your discovery of the\n            violation; or\n        -   2. 
upon express reinstatement by the Licensor.\n\n        For the avoidance of doubt, this Section 6(b) does not affect\n        any right the Licensor may have to seek remedies for Your\n        violations of this Public License.\n\n    -   c. For the avoidance of doubt, the Licensor may also offer the\n        Licensed Material under separate terms or conditions or stop\n        distributing the Licensed Material at any time; however, doing\n        so will not terminate this Public License.\n    -   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public\n        License.\n\n-   Section 7 \u2013 Other Terms and Conditions.\n\n    -   a. The Licensor shall not be bound by any additional or\n        different terms or conditions communicated by You unless\n        expressly agreed.\n    -   b. Any arrangements, understandings, or agreements regarding the\n        Licensed Material not stated herein are separate from and\n        independent of the terms and conditions of this Public License.\n\n-   Section 8 \u2013 Interpretation.\n\n    -   a. For the avoidance of doubt, this Public License does not, and\n        shall not be interpreted to, reduce, limit, restrict, or impose\n        conditions on any use of the Licensed Material that could\n        lawfully be made without permission under this Public License.\n    -   b. To the extent possible, if any provision of this Public\n        License is deemed unenforceable, it shall be automatically\n        reformed to the minimum extent necessary to make it enforceable.\n        If the provision cannot be reformed, it shall be severed from\n        this Public License without affecting the enforceability of the\n        remaining terms and conditions.\n    -   c. No term or condition of this Public License will be waived\n        and no failure to comply consented to unless expressly agreed to\n        by the Licensor.\n    -   d. 
Nothing in this Public License constitutes or may be\n        interpreted as a limitation upon, or waiver of, any privileges\n        and immunities that apply to the Licensor or You, including from\n        the legal processes of any jurisdiction or authority.\n\nCreative Commons is not a party to its public licenses. Notwithstanding,\nCreative Commons may elect to apply one of its public licenses to\nmaterial it publishes and in those instances will be considered the\n\"Licensor.\" The text of the Creative Commons public licenses is\ndedicated to the public domain under the CC0 Public Domain Dedication.\nExcept for the limited purpose of indicating that material is shared\nunder a Creative Commons public license or as otherwise permitted by the\nCreative Commons policies published at creativecommons.org/policies,\nCreative Commons does not authorize the use of the trademark \"Creative\nCommons\" or any other trademark or logo of Creative Commons without its\nprior written consent including, without limitation, in connection with\nany unauthorized modifications to any of its public licenses or any\nother arrangements, understandings, or agreements concerning use of\nlicensed material. For the avoidance of doubt, this paragraph does not\nform part of the public licenses.\n\nCreative Commons may be contacted at creativecommons.org.\n
    "},{"location":"normalization_e0s.html","title":"Overview of QM Methods and Normalization","text":"

    OpenQDC provides support for 250+ QM methods and provides a way to standardize and categorize the usage of the different levels of theory used for Quantum Mechanics single point calculations, adding value and information to the datasets.

    "},{"location":"normalization_e0s.html#level-of-theory","title":"Level of Theory","text":"

    To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums consisting of a functional, a basis set, and a correction method. OpenQDC covers more than 106 functionals, 20 basis sets, and 11 correction methods. OpenQDC provides the computed isolated atom energies e0 for each QM method.

    "},{"location":"normalization_e0s.html#normalization","title":"Normalization","text":"

    We provide support for normalizing energies through \"physical\" and \"regression\" normalization to conserve the size extensivity of chemical systems. Through this normalization, OpenQDC provides a way to transform the potential energy into the atomization energy by subtracting the isolated atom energies e0, a physically interpretable and extensivity-conserving normalization method. Alternatively, we precompute the average contribution of each atom species to the potential energy via linear or ridge regression, centering the distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.

    "},{"location":"normalization_e0s.html#physical-normalization","title":"Physical Normalization","text":"

    e0 energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from the potential energy to obtain the atomization energy. This normalization method is physically interpretable and only removes the atom energy contribution from the potential energy.

    "},{"location":"normalization_e0s.html#regression-normalization","title":"Regression Normalization","text":"

    e0 energies are calculated for each atom in the dataset by fitting a regression model to the potential energy. The e0 energies are then subtracted from the potential energy to obtain the atomization energy. This normalization provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy. The resulting formation energy is centered at 0.

    "},{"location":"usage.html","title":"Usage","text":""},{"location":"usage.html#how-to-use","title":"How to use","text":"

    OpenQDC has been designed to be used with a single import:

    import openqdc as qdc\ndataset = qdc.QM9()\n

    All openQDC functions are available under qdc. Or if you want to directly import a specific dataset:

    from openqdc import Spice\n# Spice dataset with distance unit in angstrom instead of bohr\ndataset = Spice(distance_unit=\"ang\",\n                array_format = \"jax\"\n)\ndataset[0] # dict of jax array\n

    Or if you prefer handling ase.Atoms objects:

    dataset.get_ase_atoms(0)\n
    "},{"location":"usage.html#iterators","title":"Iterators","text":"

    OpenQDC provides a simple way to get the data as iterators:

    for data in dataset.as_iter(atoms=True):\n    print(data) # Atoms object\n    break\n

    or if you want to just iterate over the data:

    for data in dataset:\n    print(data) # dict of arrays\n    break\n
    "},{"location":"usage.html#lazy-loading","title":"Lazy loading","text":"

    OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during import openqdc as qdc. In case of trouble you can always disable lazy loading by setting the environment variable OPENQDC_DISABLE_LAZY_LOADING to 1.

    "},{"location":"API/basedataset.html","title":"BaseDataset","text":"

    The BaseDataset class defines the functionality shared between all datasets.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset","title":"BaseDataset","text":"

    Bases: DatasetPropertyMixIn

    Base class for datasets in the openQDC package.

    Source code in openqdc/datasets/base.py
    class BaseDataset(DatasetPropertyMixIn):\n    \"\"\"\n    Base class for datasets in the openQDC package.\n    \"\"\"\n\n    energy_target_names = []\n    force_target_names = []\n    read_as_zarr = False\n    __energy_methods__ = []\n    __force_mask__ = []\n    __isolated_atom_energies__ = []\n    _fn_energy = lambda x: x\n    _fn_distance = lambda x: x\n    _fn_forces = lambda x: x\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __average_nb_atoms__ = None\n    __links__ = {}\n\n    def __init__(\n        self,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n        array_format: str = \"numpy\",\n        energy_type: Optional[str] = \"formation\",\n        overwrite_local_cache: bool = False,\n        cache_dir: Optional[str] = None,\n        recompute_statistics: bool = False,\n        transform: Optional[Callable] = None,\n        skip_statistics: bool = False,\n        read_as_zarr: bool = False,\n        regressor_kwargs: Dict = {\n            \"solver_type\": \"linear\",\n            \"sub_sample\": None,\n            \"stride\": 1,\n        },\n    ) -> None:\n        \"\"\"\n\n        Parameters:\n            energy_unit:\n                Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n            distance_unit:\n                Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n            array_format:\n                Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n            energy_type:\n                Type of isolated atom energy to use for the dataset. 
Default: \"formation\"\n                Supported types: [\"formation\", \"regression\", \"null\", None]\n            overwrite_local_cache:\n                Whether to overwrite the locally cached dataset.\n            cache_dir:\n                Cache directory location. Defaults to \"~/.cache/openqdc\"\n            recompute_statistics:\n                Whether to recompute the statistics of the dataset.\n            transform:\n                transformation to apply to the __getitem__ calls\n            regressor_kwargs:\n                Dictionary of keyword arguments to pass to the regressor.\n                Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n                solver_type can be one of [\"linear\", \"ridge\"]\n        \"\"\"\n        set_cache_dir(cache_dir)\n        # self._init_lambda_fn()\n        self.data = None\n        self._original_unit = self.energy_unit\n        self.recompute_statistics = recompute_statistics\n        self.regressor_kwargs = regressor_kwargs\n        self.transform = transform\n        self.read_as_zarr = read_as_zarr\n        self.energy_type = energy_type if energy_type is not None else \"null\"\n        self.refit_e0s = recompute_statistics or overwrite_local_cache\n        self.skip_statistics = skip_statistics\n        if not self.is_preprocessed():\n            raise DatasetNotAvailableError(self.__name__)\n        else:\n            self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n        self.set_array_format(array_format)\n        self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n\n    def _init_lambda_fn(self):\n        self._fn_energy = lambda x: x\n        self._fn_distance = lambda x: x\n        self._fn_forces = lambda x: x\n\n    @property\n    def dataset_wrapper(self):\n        if not hasattr(self, \"_dataset_wrapper\"):\n            self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()\n        return 
self._dataset_wrapper\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=self.__name__, links=self.__links__)\n\n    @classmethod\n    def fetch(cls, cache_path: Optional[str] = None, overwrite: bool = False) -> None:\n        from openqdc.utils.download_api import DataDownloader\n\n        DataDownloader(cache_path, overwrite).from_config(cls.no_init().config)\n\n    def _post_init(\n        self,\n        overwrite_local_cache: bool = False,\n        energy_unit: Optional[str] = None,\n        distance_unit: Optional[str] = None,\n    ) -> None:\n        self._set_units(None, None)\n        self._set_isolated_atom_energies()\n        if not self.skip_statistics:\n            self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)\n        self._set_units(energy_unit, distance_unit)\n        self._convert_data()\n        self._set_isolated_atom_energies()\n\n    def _precompute_statistics(self, overwrite_local_cache: bool = False):\n        # if self.recompute_statistics or overwrite_local_cache:\n        self.statistics = StatisticManager(\n            self,\n            self.recompute_statistics or overwrite_local_cache,  # check if we need to recompute\n            # Add the common statistics (Forces, TotalE, FormE, PerAtomE)\n            ForcesCalculatorStats,\n            TotalEnergyStats,\n            FormationEnergyStats,\n            PerAtomFormationEnergyStats,\n        )\n        self.statistics.run_calculators()  # run the calculators\n        self._compute_average_nb_atoms()\n\n    @classmethod\n    def no_init(cls):\n        \"\"\"\n        Class method to avoid the __init__ method to be called when the class is instanciated.\n        Useful for debugging purposes or preprocessing data.\n        \"\"\"\n        return cls.__new__(cls)\n\n    @property\n    def __force_methods__(self):\n        \"\"\"\n        For backward compatibility. 
To be removed in the future.\n        \"\"\"\n        return self.force_methods\n\n    @property\n    def energy_methods(self) -> List[str]:\n        \"\"\"Return the string version of the energy methods\"\"\"\n        return [str(i) for i in self.__energy_methods__]\n\n    @property\n    def force_mask(self):\n        if len(self.__class__.__force_mask__) == 0:\n            self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)\n        return self.__class__.__force_mask__\n\n    @property\n    def force_methods(self):\n        return list(compress(self.energy_methods, self.force_mask))\n\n    @property\n    def e0s_dispatcher(self) -> AtomEnergies:\n        \"\"\"\n        Property to get the object that dispatched the isolated atom energies of the QM methods.\n\n        Returns:\n            Object wrapping the isolated atom energies of the QM methods.\n        \"\"\"\n        if not hasattr(self, \"_e0s_dispatcher\"):\n            # Automatically fetch/compute formation or regression energies\n            self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)\n        return self._e0s_dispatcher\n\n    def _convert_data(self):\n        logger.info(\n            f\"Converting {self.__name__} data to the following units:\\n\\\n                     Energy: {str(self.energy_unit)},\\n\\\n                     Distance: {str(self.distance_unit)},\\n\\\n                     Forces: {str(self.force_unit) if self.__force_methods__ else 'None'}\"\n        )\n        for key in self.data_keys:\n            self.data[key] = self._convert_on_loading(self.data[key], key)\n\n    @property\n    def energy_unit(self):\n        return EnergyTypeConversion(self.__energy_unit__)\n\n    @property\n    def distance_unit(self):\n        return DistanceTypeConversion(self.__distance_unit__)\n\n    @property\n    def force_unit(self):\n        units = self.__forces_unit__.split(\"/\")\n        if len(units) > 2:\n            units = [\"/\".join(units[:2]), 
units[-1]]\n        return ForceTypeConversion(tuple(units))  # < 3.12 compatibility\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), self.__name__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\")\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def data_keys(self):\n        keys = list(self.data_types.keys())\n        if len(self.__force_methods__) == 0:\n            keys.remove(\"forces\")\n        return keys\n\n    @property\n    def pkl_data_keys(self):\n        return list(self.pkl_data_types.keys())\n\n    @property\n    def pkl_data_types(self):\n        return {\"name\": str, \"subset\": str, \"n_atoms\": np.int32}\n\n    @property\n    def atom_energies(self):\n        return self._e0s_dispatcher\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float64,\n            \"forces\": np.float32,\n        }\n\n    @property\n    def data_shapes(self):\n        return {\n            \"atomic_inputs\": (-1, NB_ATOMIC_FEATURES),\n            \"position_idx_range\": (-1, 2),\n            \"energies\": (-1, len(self.energy_methods)),\n            \"forces\": (-1, 3, len(self.force_methods)),\n        }\n\n    def _set_units(self, en: Optional[str] = None, ds: Optional[str] = None):\n        old_en, old_ds = self.energy_unit, self.distance_unit\n        en = en if en is not None else old_en\n        ds = ds if ds is not None else old_ds\n        self.set_energy_unit(en)\n        self.set_distance_unit(ds)\n        if self.__force_methods__:\n            self._fn_forces = self.force_unit.to(str(self.energy_unit), str(self.distance_unit))\n            self.__forces_unit__ = str(self.energy_unit) + \"/\" + str(self.distance_unit)\n\n    def _set_isolated_atom_energies(self):\n        if self.__energy_methods__ is None:\n  
          logger.error(\"No energy methods defined for this dataset.\")\n        if self.energy_type == \"formation\":\n            f = get_conversion(\"hartree\", self.__energy_unit__)\n        else:\n            # regression are calculated on the original unit of the dataset\n            f = self._original_unit.to(self.energy_unit)\n        self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix)\n\n    def convert_energy(self, x):\n        return self._fn_energy(x)\n\n    def convert_distance(self, x):\n        return self._fn_distance(x)\n\n    def convert_forces(self, x):\n        return self._fn_forces(x)\n\n    def set_energy_unit(self, value: str):\n        \"\"\"\n        Set a new energy unit for the dataset.\n\n        Parameters:\n            value:\n                New energy unit to set.\n        \"\"\"\n        # old_unit = self.energy_unit\n        # self.__energy_unit__ = value\n        self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n        self.__energy_unit__ = value\n\n    def set_distance_unit(self, value: str):\n        \"\"\"\n        Set a new distance unit for the dataset.\n\n        Parameters:\n            value:\n                New distance unit to set.\n        \"\"\"\n        # old_unit = self.distance_unit\n        # self.__distance_unit__ = value\n        self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n        self.__distance_unit__ = value\n\n    def set_array_format(self, format: str):\n        assert format in [\"numpy\", \"torch\", \"jax\"], f\"Format {format} not supported.\"\n        self.array_format = format\n\n    def read_raw_entries(self):\n        \"\"\"\n        Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n        \"\"\"\n        raise NotImplementedError\n\n    def collate_list(self, list_entries: List[Dict]) -> Dict:\n        \"\"\"\n        Collate a list of entries into a single dictionary.\n\n        
Parameters:\n            list_entries:\n                List of dictionaries containing the entries to collate.\n\n        Returns:\n            Dictionary containing the collated entries.\n        \"\"\"\n        # concatenate entries\n        res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n        csum = np.cumsum(res.get(\"n_atoms\"))\n        x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n        x[1:, 0], x[:, 1] = csum[:-1], csum\n        res[\"position_idx_range\"] = x\n\n        return res\n\n    def save_preprocess(\n        self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n    ):\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n        Parameters:\n            data_dict:\n                Dictionary containing the preprocessed data.\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                Whether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. 
Cache is always overwritten locally.\n        \"\"\"\n        # save memmaps\n        logger.info(\"Preprocessing data and saving it to cache.\")\n        paths = self.dataset_wrapper.save_preprocess(\n            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n        )\n        if upload:\n            for local_path in paths:\n                push_remote(local_path, overwrite=overwrite)  # make it async?\n\n    def read_preprocess(self, overwrite_local_cache=False):\n        logger.info(\"Reading preprocessed data.\")\n        logger.info(\n            f\"Dataset {self.__name__} with the following units:\\n\\\n                     Energy: {self.energy_unit},\\n\\\n                     Distance: {self.distance_unit},\\n\\\n                     Forces: {self.force_unit if self.force_methods else 'None'}\"\n        )\n\n        self.data = self.dataset_wrapper.load_data(\n            self.preprocess_path,\n            self.data_keys,\n            self.data_types,\n            self.data_shapes,\n            self.pkl_data_keys,\n            overwrite_local_cache,\n        )  # this should be async if possible\n        for key in self.data:\n            logger.info(f\"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}\")\n\n    def _convert_on_loading(self, x, key):\n        if key == \"energies\":\n            return self.convert_energy(x)\n        elif key == \"forces\":\n            return self.convert_forces(x)\n        elif key == \"atomic_inputs\":\n            x = np.array(x, dtype=np.float32)\n            x[:, -3:] = self.convert_distance(x[:, -3:])\n            return x\n        else:\n            return x\n\n    def is_preprocessed(self) -> bool:\n        \"\"\"\n        Check if the dataset is preprocessed and available online or locally.\n\n        Returns:\n            True if the dataset is available remotely or locally, False otherwise.\n        \"\"\"\n        predicats = [\n            
copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def is_cached(self) -> bool:\n        \"\"\"\n        Check if the dataset is cached locally.\n\n        Returns:\n            True if the dataset is cached locally, False otherwise.\n        \"\"\"\n        predicats = [\n            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n            for key in self.data_keys\n        ]\n        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n        return all(predicats)\n\n    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n        \"\"\"\n        Preprocess the dataset and save it.\n\n        Parameters:\n            upload:\n                Whether to upload the preprocessed data to the remote storage or only saving it locally.\n            overwrite:\n                hether to overwrite the preprocessed data if it already exists.\n                Only used if upload is True. Cache is always overwritten locally.\n            as_zarr:\n                Whether to save the data as zarr files\n        \"\"\"\n        if overwrite or not self.is_preprocessed():\n            entries = self.read_raw_entries()\n            res = self.collate_list(entries)\n            self.save_preprocess(res, upload, overwrite, as_zarr)\n\n    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n        \"\"\"\n        Upload the preprocessed data to the remote storage. 
Must be called after preprocess and\n        need to have write privileges.\n\n        Parameters:\n            overwrite:\n                Whether to overwrite the remote data if it already exists\n            as_zarr:\n                Whether to upload the data as zarr files\n        \"\"\"\n        for key in self.data_keys:\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n            push_remote(local_path, overwrite=overwrite)\n        local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n        push_remote(local_path, overwrite=overwrite)\n\n    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n        \"\"\"\n        Save a single entry at index idx as an extxyz file.\n\n        Parameters:\n            idx:\n                Index of the entry\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file. 
If None, the current working directory is used.\n            ext:\n                Whether to include additional informations like forces and other metadatas (extxyz format)\n        \"\"\"\n        if path is None:\n            path = os.getcwd()\n        at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n        write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n\n    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n        \"\"\"\n        Save dataset as single xyz file (extended xyz format).\n\n        Parameters:\n            energy_method:\n                Index of the energy method to use\n            path:\n                Path to save the xyz file\n        \"\"\"\n        with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n            for atoms in tqdm(\n                self.as_iter(atoms=True, energy_method=energy_method),\n                total=len(self),\n                desc=f\"Saving {self.__name__} as xyz file\",\n            ):\n                write_extxyz(f, atoms, append=True)\n\n    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n        \"\"\"\n        Get the ASE atoms object for the entry at index idx.\n\n        Parameters:\n            idx:\n                Index of the entry.\n            energy_method:\n                Index of the energy method to use\n            ext:\n                Whether to include additional informations\n\n        Returns:\n            ASE atoms object\n        \"\"\"\n        entry = self[idx]\n        at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n        return at\n\n    def subsample(\n        self, n_samples: Optional[Union[List[int], int, float]] = None, replace: bool = False, seed: int = 42\n    ):\n        np.random.seed(seed)\n        if n_samples is None:\n            return list(range(len(self)))\n        try:\n            if 0 < n_samples < 1:\n           
     n_samples = int(n_samples * len(self))\n            if isinstance(n_samples, int):\n                idxs = np.random.choice(len(self), size=n_samples, replace=replace)\n        except (ValueError, TypeError):  # list, set, np.ndarray\n            idxs = n_samples\n        return idxs\n\n    @requires_package(\"datamol\")\n    def calculate_descriptors(\n        self,\n        descriptor_name: str = \"soap\",\n        chemical_species: Optional[List[str]] = None,\n        n_samples: Optional[Union[List[int], int, float]] = None,\n        progress: bool = True,\n        **descriptor_kwargs,\n    ) -> Dict[str, np.ndarray]:\n        \"\"\"\n        Compute the descriptors for the dataset.\n\n        Parameters:\n            descriptor_name:\n                Name of the descriptor to use. Supported descriptors are [\"soap\"]\n            chemical_species:\n                List of chemical species to use for the descriptor computation, by default None.\n                If None, the chemical species of the dataset are used.\n            n_samples:\n                Number of samples to use for the computation, by default None.\n                If None, all the dataset is used.\n                If a list of integers is provided, the descriptors are computed for\n                each of the specified idx of samples.\n            progress:\n                Whether to show a progress bar, by default True.\n            **descriptor_kwargs : dict\n                Keyword arguments to pass to the descriptor instantiation of the model.\n\n        Returns:\n            Dictionary containing the following keys:\n                - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n                - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n        \"\"\"\n        import datamol as dm\n\n        datum = {}\n        idxs = self.subsample(n_samples)\n        model = get_descriptor(descriptor_name.lower())(\n          
  species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n        )\n\n        def wrapper(idx):\n            entry = self.get_ase_atoms(idx, ext=False)\n            return model.calculate(entry)\n\n        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n        datum[\"values\"] = np.vstack(descr)\n        datum[\"idxs\"] = idxs\n        return datum\n\n    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n        \"\"\"\n        Return the dataset as an iterator.\n\n        Parameters:\n            atoms:\n                Whether to return the items as ASE atoms object, by default False\n            energy_method:\n                Index of the energy method to use\n\n        Returns:\n            Iterator of the dataset\n        \"\"\"\n\n        func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n        for i in range(len(self)):\n            yield func(i)\n\n    def __iter__(self):\n        for idxs in range(len(self)):\n            yield self[idxs]\n\n    def get_statistics(self, return_none: bool = True) -> Dict:\n        \"\"\"\n        Get the converted statistics of the dataset.\n\n        Parameters:\n            return_none :\n                Whether to return None if the statistics for the forces are not available, by default True\n                Otherwise, the statistics for the forces are set to 0.0\n\n        Returns:\n            Dictionary containing the statistics of the dataset\n        \"\"\"\n        selected_stats = self.statistics.get_results()\n        if len(selected_stats) == 0:\n            raise StatisticsNotAvailableError(self.__name__)\n        if not return_none:\n            selected_stats.update(\n                {\n                    \"ForcesCalculatorStats\": {\n                        \"mean\": np.array([0.0]),\n                        \"std\": np.array([0.0]),\n    
                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                        \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                    }\n                }\n            )\n        # cycle trough dict to convert units\n        for key, result in selected_stats.items():\n            if isinstance(result, ForcesCalculatorStats):\n                result.transform(self.convert_forces)\n            else:\n                result.transform(self.convert_energy)\n            result.transform(self._convert_array)\n        return {k: result.to_dict() for k, result in selected_stats.items()}\n\n    def __str__(self):\n        return f\"{self.__name__}\"\n\n    def __repr__(self):\n        return f\"{self.__name__}\"\n\n    def __len__(self):\n        return self.data[\"energies\"].shape[0]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return x\n\n    def _convert_array(self, x: np.ndarray):\n        return _CONVERT_DICT.get(self.array_format)(x)\n\n    def __getitem__(self, idx: int):\n        shift = MAX_CHARGE\n        p_start, p_end = self.data[\"position_idx_range\"][idx]\n        input = self.data[\"atomic_inputs\"][p_start:p_end]\n        z, c, positions, energies = (\n            self._convert_array(np.array(input[:, 0], dtype=np.int32)),\n            self._convert_array(np.array(input[:, 1], dtype=np.int32)),\n            self._convert_array(np.array(input[:, -3:], dtype=np.float32)),\n            self._convert_array(np.array(self.data[\"energies\"][idx], dtype=np.float64)),\n        )\n        name = self.__smiles_converter__(self.data[\"name\"][idx])\n        subset = self.data[\"subset\"][idx]\n        e0s = self._convert_array(self.__isolated_atom_energies__[..., z, c + shift].T)\n        
formation_energies = energies - e0s.sum(axis=0)\n        forces = None\n        if \"forces\" in self.data:\n            forces = self._convert_array(np.array(self.data[\"forces\"][p_start:p_end], dtype=np.float32))\n\n        bunch = Bunch(\n            positions=positions,\n            atomic_numbers=z,\n            charges=c,\n            e0=e0s,\n            energies=energies,\n            formation_energies=formation_energies,\n            per_atom_formation_energies=formation_energies / len(z),\n            name=name,\n            subset=subset,\n            forces=forces,\n        )\n\n        if self.transform is not None:\n            bunch = self.transform(bunch)\n\n        return bunch\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__force_methods__","title":"__force_methods__ property","text":"

    For backward compatibility. To be removed in the future.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.e0s_dispatcher","title":"e0s_dispatcher: AtomEnergies property","text":"

    Property to get the object that dispatched the isolated atom energies of the QM methods.

    Returns:

    Type Description AtomEnergies

    Object wrapping the isolated atom energies of the QM methods.

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.energy_methods","title":"energy_methods: List[str] property","text":"

    Return the string version of the energy methods

    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__init__","title":"__init__(energy_unit=None, distance_unit=None, array_format='numpy', energy_type='formation', overwrite_local_cache=False, cache_dir=None, recompute_statistics=False, transform=None, skip_statistics=False, read_as_zarr=False, regressor_kwargs={'solver_type': 'linear', 'sub_sample': None, 'stride': 1})","text":"

    Parameters:

    Name Type Description Default energy_unit Optional[str]

    Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    None distance_unit Optional[str]

    Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]

    None array_format str

    Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]

    'numpy' energy_type Optional[str]

    Type of isolated atom energy to use for the dataset. Default: \"formation\" Supported types: [\"formation\", \"regression\", \"null\", None]

    'formation' overwrite_local_cache bool

    Whether to overwrite the locally cached dataset.

    False cache_dir Optional[str]

    Cache directory location. Defaults to \"~/.cache/openqdc\"

    None recompute_statistics bool

    Whether to recompute the statistics of the dataset.

    False transform Optional[Callable]

    Transformation to apply to the __getitem__ calls.

    None regressor_kwargs Dict

    Dictionary of keyword arguments to pass to the regressor. Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1} solver_type can be one of [\"linear\", \"ridge\"]

    {'solver_type': 'linear', 'sub_sample': None, 'stride': 1} Source code in openqdc/datasets/base.py
    def __init__(\n    self,\n    energy_unit: Optional[str] = None,\n    distance_unit: Optional[str] = None,\n    array_format: str = \"numpy\",\n    energy_type: Optional[str] = \"formation\",\n    overwrite_local_cache: bool = False,\n    cache_dir: Optional[str] = None,\n    recompute_statistics: bool = False,\n    transform: Optional[Callable] = None,\n    skip_statistics: bool = False,\n    read_as_zarr: bool = False,\n    regressor_kwargs: Dict = {\n        \"solver_type\": \"linear\",\n        \"sub_sample\": None,\n        \"stride\": 1,\n    },\n) -> None:\n    \"\"\"\n\n    Parameters:\n        energy_unit:\n            Energy unit to convert dataset to. Supported units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]\n        distance_unit:\n            Distance unit to convert dataset to. Supported units: [\"ang\", \"nm\", \"bohr\"]\n        array_format:\n            Format to return arrays in. Supported formats: [\"numpy\", \"torch\", \"jax\"]\n        energy_type:\n            Type of isolated atom energy to use for the dataset. Default: \"formation\"\n            Supported types: [\"formation\", \"regression\", \"null\", None]\n        overwrite_local_cache:\n            Whether to overwrite the locally cached dataset.\n        cache_dir:\n            Cache directory location. 
Defaults to \"~/.cache/openqdc\"\n        recompute_statistics:\n            Whether to recompute the statistics of the dataset.\n        transform:\n            transformation to apply to the __getitem__ calls\n        regressor_kwargs:\n            Dictionary of keyword arguments to pass to the regressor.\n            Default: {\"solver_type\": \"linear\", \"sub_sample\": None, \"stride\": 1}\n            solver_type can be one of [\"linear\", \"ridge\"]\n    \"\"\"\n    set_cache_dir(cache_dir)\n    # self._init_lambda_fn()\n    self.data = None\n    self._original_unit = self.energy_unit\n    self.recompute_statistics = recompute_statistics\n    self.regressor_kwargs = regressor_kwargs\n    self.transform = transform\n    self.read_as_zarr = read_as_zarr\n    self.energy_type = energy_type if energy_type is not None else \"null\"\n    self.refit_e0s = recompute_statistics or overwrite_local_cache\n    self.skip_statistics = skip_statistics\n    if not self.is_preprocessed():\n        raise DatasetNotAvailableError(self.__name__)\n    else:\n        self.read_preprocess(overwrite_local_cache=overwrite_local_cache)\n    self.set_array_format(array_format)\n    self._post_init(overwrite_local_cache, energy_unit, distance_unit)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/base.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return x\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.as_iter","title":"as_iter(atoms=False, energy_method=0)","text":"

    Return the dataset as an iterator.

    Parameters:

    Name Type Description Default atoms bool

    Whether to return the items as ASE atoms object, by default False

    False energy_method int

    Index of the energy method to use

    0

    Returns:

    Type Description Iterable

    Iterator of the dataset

    Source code in openqdc/datasets/base.py
    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:\n    \"\"\"\n    Return the dataset as an iterator.\n\n    Parameters:\n        atoms:\n            Whether to return the items as ASE atoms object, by default False\n        energy_method:\n            Index of the energy method to use\n\n    Returns:\n        Iterator of the dataset\n    \"\"\"\n\n    func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__\n\n    for i in range(len(self)):\n        yield func(i)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.calculate_descriptors","title":"calculate_descriptors(descriptor_name='soap', chemical_species=None, n_samples=None, progress=True, **descriptor_kwargs)","text":"

    Compute the descriptors for the dataset.

    Parameters:

    Name Type Description Default descriptor_name str

    Name of the descriptor to use. Supported descriptors are [\"soap\"]

    'soap' chemical_species Optional[List[str]]

    List of chemical species to use for the descriptor computation, by default None. If None, the chemical species of the dataset are used.

    None n_samples Optional[Union[List[int], int, float]]

    Number of samples to use for the computation, by default None. If None, all the dataset is used. If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.

    None progress bool

    Whether to show a progress bar, by default True.

    True **descriptor_kwargs

    dict Keyword arguments to pass to the descriptor instantiation of the model.

    {}

    Returns:

    Type Description Dict[str, ndarray]

    Dictionary containing the following keys: - values : np.ndarray of shape (N, M) containing the descriptors for the dataset - idxs : np.ndarray of shape (N,) containing the indices of the samples used

    Source code in openqdc/datasets/base.py
    @requires_package(\"datamol\")\ndef calculate_descriptors(\n    self,\n    descriptor_name: str = \"soap\",\n    chemical_species: Optional[List[str]] = None,\n    n_samples: Optional[Union[List[int], int, float]] = None,\n    progress: bool = True,\n    **descriptor_kwargs,\n) -> Dict[str, np.ndarray]:\n    \"\"\"\n    Compute the descriptors for the dataset.\n\n    Parameters:\n        descriptor_name:\n            Name of the descriptor to use. Supported descriptors are [\"soap\"]\n        chemical_species:\n            List of chemical species to use for the descriptor computation, by default None.\n            If None, the chemical species of the dataset are used.\n        n_samples:\n            Number of samples to use for the computation, by default None.\n            If None, all the dataset is used.\n            If a list of integers is provided, the descriptors are computed for\n            each of the specified idx of samples.\n        progress:\n            Whether to show a progress bar, by default True.\n        **descriptor_kwargs : dict\n            Keyword arguments to pass to the descriptor instantiation of the model.\n\n    Returns:\n        Dictionary containing the following keys:\n            - values : np.ndarray of shape (N, M) containing the descriptors for the dataset\n            - idxs : np.ndarray of shape (N,) containing the indices of the samples used\n\n    \"\"\"\n    import datamol as dm\n\n    datum = {}\n    idxs = self.subsample(n_samples)\n    model = get_descriptor(descriptor_name.lower())(\n        species=self.chemical_species if chemical_species is None else chemical_species, **descriptor_kwargs\n    )\n\n    def wrapper(idx):\n        entry = self.get_ase_atoms(idx, ext=False)\n        return model.calculate(entry)\n\n    descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler=\"threads\", n_jobs=-1)\n    datum[\"values\"] = np.vstack(descr)\n    datum[\"idxs\"] = idxs\n    return datum\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.collate_list","title":"collate_list(list_entries)","text":"

    Collate a list of entries into a single dictionary.

    Parameters:

    Name Type Description Default list_entries List[Dict]

    List of dictionaries containing the entries to collate.

    required

    Returns:

    Type Description Dict

    Dictionary containing the collated entries.

    Source code in openqdc/datasets/base.py
    def collate_list(self, list_entries: List[Dict]) -> Dict:\n    \"\"\"\n    Collate a list of entries into a single dictionary.\n\n    Parameters:\n        list_entries:\n            List of dictionaries containing the entries to collate.\n\n    Returns:\n        Dictionary containing the collated entries.\n    \"\"\"\n    # concatenate entries\n    res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}\n\n    csum = np.cumsum(res.get(\"n_atoms\"))\n    x = np.zeros((csum.shape[0], 2), dtype=np.int32)\n    x[1:, 0], x[:, 1] = csum[:-1], csum\n    res[\"position_idx_range\"] = x\n\n    return res\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_ase_atoms","title":"get_ase_atoms(idx, energy_method=0, ext=True)","text":"

    Get the ASE atoms object for the entry at index idx.

    Parameters:

    Name Type Description Default idx int

    Index of the entry.

    required energy_method int

    Index of the energy method to use

    0 ext bool

    Whether to include additional information

    True

    Returns:

    Type Description Atoms

    ASE atoms object

    Source code in openqdc/datasets/base.py
    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:\n    \"\"\"\n    Get the ASE atoms object for the entry at index idx.\n\n    Parameters:\n        idx:\n            Index of the entry.\n        energy_method:\n            Index of the energy method to use\n        ext:\n            Whether to include additional informations\n\n    Returns:\n        ASE atoms object\n    \"\"\"\n    entry = self[idx]\n    at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)\n    return at\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.get_statistics","title":"get_statistics(return_none=True)","text":"

    Get the converted statistics of the dataset.

    Parameters:

    Name Type Description Default return_none

    Whether to return None if the statistics for the forces are not available, by default True Otherwise, the statistics for the forces are set to 0.0

    True

    Returns:

    Type Description Dict

    Dictionary containing the statistics of the dataset

    Source code in openqdc/datasets/base.py
    def get_statistics(self, return_none: bool = True) -> Dict:\n    \"\"\"\n    Get the converted statistics of the dataset.\n\n    Parameters:\n        return_none :\n            Whether to return None if the statistics for the forces are not available, by default True\n            Otherwise, the statistics for the forces are set to 0.0\n\n    Returns:\n        Dictionary containing the statistics of the dataset\n    \"\"\"\n    selected_stats = self.statistics.get_results()\n    if len(selected_stats) == 0:\n        raise StatisticsNotAvailableError(self.__name__)\n    if not return_none:\n        selected_stats.update(\n            {\n                \"ForcesCalculatorStats\": {\n                    \"mean\": np.array([0.0]),\n                    \"std\": np.array([0.0]),\n                    \"component_mean\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_std\": np.array([[0.0], [0.0], [0.0]]),\n                    \"component_rms\": np.array([[0.0], [0.0], [0.0]]),\n                }\n            }\n        )\n    # cycle trough dict to convert units\n    for key, result in selected_stats.items():\n        if isinstance(result, ForcesCalculatorStats):\n            result.transform(self.convert_forces)\n        else:\n            result.transform(self.convert_energy)\n        result.transform(self._convert_array)\n    return {k: result.to_dict() for k, result in selected_stats.items()}\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_cached","title":"is_cached()","text":"

    Check if the dataset is cached locally.

    Returns:

    Type Description bool

    True if the dataset is cached locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_cached(self) -> bool:\n    \"\"\"\n    Check if the dataset is cached locally.\n\n    Returns:\n        True if the dataset is cached locally, False otherwise.\n    \"\"\"\n    predicats = [\n        os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.is_preprocessed","title":"is_preprocessed()","text":"

    Check if the dataset is preprocessed and available online or locally.

    Returns:

    Type Description bool

    True if the dataset is available remotely or locally, False otherwise.

    Source code in openqdc/datasets/base.py
    def is_preprocessed(self) -> bool:\n    \"\"\"\n    Check if the dataset is preprocessed and available online or locally.\n\n    Returns:\n        True if the dataset is available remotely or locally, False otherwise.\n    \"\"\"\n    predicats = [\n        copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f\"{key}\")))\n        for key in self.data_keys\n    ]\n    predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]\n    return all(predicats)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.no_init","title":"no_init() classmethod","text":"

    Class method that prevents the __init__ method from being called when the class is instantiated. Useful for debugging purposes or preprocessing data.

    Source code in openqdc/datasets/base.py
    @classmethod\ndef no_init(cls):\n    \"\"\"\n    Class method to avoid the __init__ method to be called when the class is instanciated.\n    Useful for debugging purposes or preprocessing data.\n    \"\"\"\n    return cls.__new__(cls)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.preprocess","title":"preprocess(upload=False, overwrite=True, as_zarr=True)","text":"

    Preprocess the dataset and save it.

    Parameters:

    Name Type Description Default upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True as_zarr bool

    Whether to save the data as zarr files

    True Source code in openqdc/datasets/base.py
    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):\n    \"\"\"\n    Preprocess the dataset and save it.\n\n    Parameters:\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            hether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n        as_zarr:\n            Whether to save the data as zarr files\n    \"\"\"\n    if overwrite or not self.is_preprocessed():\n        entries = self.read_raw_entries()\n        res = self.collate_list(entries)\n        self.save_preprocess(res, upload, overwrite, as_zarr)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.read_raw_entries","title":"read_raw_entries()","text":"

    Preprocess the raw data (i.e. as fetched from the source) into a list of dictionaries.

    Source code in openqdc/datasets/base.py
    def read_raw_entries(self):\n    \"\"\"\n    Preprocess the raw (aka from the fetched source) into a list of dictionaries.\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_preprocess","title":"save_preprocess(data_dict, upload=False, overwrite=True, as_zarr=False)","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.

    Parameters:

    Name Type Description Default data_dict Dict[str, ndarray]

    Dictionary containing the preprocessed data.

    required upload bool

    Whether to upload the preprocessed data to the remote storage or only saving it locally.

    False overwrite bool

    Whether to overwrite the preprocessed data if it already exists. Only used if upload is True. Cache is always overwritten locally.

    True Source code in openqdc/datasets/base.py
    def save_preprocess(\n    self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False\n):\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n\n    Parameters:\n        data_dict:\n            Dictionary containing the preprocessed data.\n        upload:\n            Whether to upload the preprocessed data to the remote storage or only saving it locally.\n        overwrite:\n            Whether to overwrite the preprocessed data if it already exists.\n            Only used if upload is True. Cache is always overwritten locally.\n    \"\"\"\n    # save memmaps\n    logger.info(\"Preprocessing data and saving it to cache.\")\n    paths = self.dataset_wrapper.save_preprocess(\n        self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types\n    )\n    if upload:\n        for local_path in paths:\n            push_remote(local_path, overwrite=overwrite)  # make it async?\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.save_xyz","title":"save_xyz(idx, energy_method=0, path=None, ext=True)","text":"

    Save a single entry at index idx as an extxyz file.

    Parameters:

    Name Type Description Default idx int

    Index of the entry

    required energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file. If None, the current working directory is used.

    None ext bool

    Whether to include additional information such as forces and other metadata (extxyz format)

    True Source code in openqdc/datasets/base.py
    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):\n    \"\"\"\n    Save a single entry at index idx as an extxyz file.\n\n    Parameters:\n        idx:\n            Index of the entry\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file. If None, the current working directory is used.\n        ext:\n            Whether to include additional informations like forces and other metadatas (extxyz format)\n    \"\"\"\n    if path is None:\n        path = os.getcwd()\n    at = self.get_ase_atoms(idx, ext=ext, energy_method=energy_method)\n    write_extxyz(p_join(path, f\"mol_{idx}.xyz\"), at, plain=not ext)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_distance_unit","title":"set_distance_unit(value)","text":"

    Set a new distance unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New distance unit to set.

    required Source code in openqdc/datasets/base.py
    def set_distance_unit(self, value: str):\n    \"\"\"\n    Set a new distance unit for the dataset.\n\n    Parameters:\n        value:\n            New distance unit to set.\n    \"\"\"\n    # old_unit = self.distance_unit\n    # self.__distance_unit__ = value\n    self._fn_distance = self.distance_unit.to(value)  # get_conversion(old_unit, value)\n    self.__distance_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.set_energy_unit","title":"set_energy_unit(value)","text":"

    Set a new energy unit for the dataset.

    Parameters:

    Name Type Description Default value str

    New energy unit to set.

    required Source code in openqdc/datasets/base.py
    def set_energy_unit(self, value: str):\n    \"\"\"\n    Set a new energy unit for the dataset.\n\n    Parameters:\n        value:\n            New energy unit to set.\n    \"\"\"\n    # old_unit = self.energy_unit\n    # self.__energy_unit__ = value\n    self._fn_energy = self.energy_unit.to(value)  # get_conversion(old_unit, value)\n    self.__energy_unit__ = value\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.to_xyz","title":"to_xyz(energy_method=0, path=None)","text":"

    Save dataset as single xyz file (extended xyz format).

    Parameters:

    Name Type Description Default energy_method int

    Index of the energy method to use

    0 path Optional[str]

    Path to save the xyz file

    None Source code in openqdc/datasets/base.py
    def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):\n    \"\"\"\n    Save dataset as single xyz file (extended xyz format).\n\n    Parameters:\n        energy_method:\n            Index of the energy method to use\n        path:\n            Path to save the xyz file\n    \"\"\"\n    with open(p_join(path if path else os.getcwd(), f\"{self.__name__}.xyz\"), \"w\") as f:\n        for atoms in tqdm(\n            self.as_iter(atoms=True, energy_method=energy_method),\n            total=len(self),\n            desc=f\"Saving {self.__name__} as xyz file\",\n        ):\n            write_extxyz(f, atoms, append=True)\n
    "},{"location":"API/basedataset.html#openqdc.datasets.base.BaseDataset.upload","title":"upload(overwrite=False, as_zarr=False)","text":"

    Upload the preprocessed data to the remote storage. Must be called after preprocess and need to have write privileges.

    Parameters:

    Name Type Description Default overwrite bool

    Whether to overwrite the remote data if it already exists

    False as_zarr bool

    Whether to upload the data as zarr files

    False Source code in openqdc/datasets/base.py
    def upload(self, overwrite: bool = False, as_zarr: bool = False):\n    \"\"\"\n    Upload the preprocessed data to the remote storage. Must be called after preprocess and\n    need to have write privileges.\n\n    Parameters:\n        overwrite:\n            Whether to overwrite the remote data if it already exists\n        as_zarr:\n            Whether to upload the data as zarr files\n    \"\"\"\n    for key in self.data_keys:\n        local_path = p_join(self.preprocess_path, f\"{key}.mmap\" if not as_zarr else f\"{key}.zip\")\n        push_remote(local_path, overwrite=overwrite)\n    local_path = p_join(self.preprocess_path, \"props.pkl\" if not as_zarr else \"metadata.zip\")\n    push_remote(local_path, overwrite=overwrite)\n
    "},{"location":"API/e0_dispatcher.html","title":"e0 Dispatcher","text":""},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies","title":"AtomEnergies","text":"

    Manager class that interfaces with the isolated atom energy classes and provides the general functions to retrieve the data

    Source code in openqdc/datasets/energies.py
    class AtomEnergies:\n    \"\"\"\n    Manager class for interface with the isolated atom energies classes\n    and providing the generals function to retrieve the data\n    \"\"\"\n\n    def __init__(self, data, **kwargs) -> None:\n        self.atom_energies = data.energy_type\n        self.factory = dispatch_factory(data, **kwargs)\n\n    @property\n    def e0s_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_matrix\n\n    @property\n    def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:\n        \"\"\"\n        Return the isolated atom energies dictionary\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n        return self.factory.e0_dict\n\n    def __str__(self):\n        return f\"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}\"\n\n    def __repr__(self):\n        return str(self)\n\n    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n        \"\"\"\n        Retrieve a key from the isolated atom dictionary.\n        Item can be written as tuple(Symbol, charge),\n        tuple(Chemical number, charge). 
If no charge is passed,\n        it will be automatically set to 0.\n\n        Examples:\n            AtomEnergies[6], AtomEnergies[6,1], \\n\n            AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n            AtomEnergies[(\"C,1)]\n\n        Parameters:\n            item:\n                AtomSpecies object or tuple with the atom symbol and charge\n\n        Returns:\n            AtomEnergy object with the isolated atom energy\n        \"\"\"\n        try:\n            atom, charge = item[0], item[1]\n        except TypeError:\n            atom = item\n            charge = 0\n        except IndexError:\n            atom = item[0]\n            charge = 0\n        if not isinstance(atom, str):\n            atom = ATOM_SYMBOLS[atom]\n        return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_dict","title":"e0s_dict: Dict[AtomSpecies, AtomEnergy] property","text":"

    Return the isolated atom energies dictionary

    Returns:

    Type Description Dict[AtomSpecies, AtomEnergy]

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.e0s_matrix","title":"e0s_matrix: np.ndarray property","text":"

    Return the isolated atom energies matrix

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergies.__getitem__","title":"__getitem__(item)","text":"

    Retrieve a key from the isolated atom dictionary. Item can be written as tuple(Symbol, charge), tuple(Chemical number, charge). If no charge is passed, it will be automatically set to 0.

    Examples:

    AtomEnergies[6], AtomEnergies[6,1],

    AtomEnergies[\"C\",1], AtomEnergies[(6,1)],

    AtomEnergies[(\"C\",1)]

    Parameters:

    Name Type Description Default item AtomSpecies

    AtomSpecies object or tuple with the atom symbol and charge

    required

    Returns:

    Type Description AtomEnergy

    AtomEnergy object with the isolated atom energy

    Source code in openqdc/datasets/energies.py
    def __getitem__(self, item: AtomSpecies) -> AtomEnergy:\n    \"\"\"\n    Retrieve a key from the isolated atom dictionary.\n    Item can be written as tuple(Symbol, charge),\n    tuple(Chemical number, charge). If no charge is passed,\n    it will be automatically set to 0.\n\n    Examples:\n        AtomEnergies[6], AtomEnergies[6,1], \\n\n        AtomEnergies[\"C\",1], AtomEnergies[(6,1)], \\n\n        AtomEnergies[(\"C,1)]\n\n    Parameters:\n        item:\n            AtomSpecies object or tuple with the atom symbol and charge\n\n    Returns:\n        AtomEnergy object with the isolated atom energy\n    \"\"\"\n    try:\n        atom, charge = item[0], item[1]\n    except TypeError:\n        atom = item\n        charge = 0\n    except IndexError:\n        atom = item[0]\n        charge = 0\n    if not isinstance(atom, str):\n        atom = ATOM_SYMBOLS[atom]\n    return self.e0s_dict[(atom, charge)]\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy","title":"AtomEnergy dataclass","text":"

    Data structure to store isolated atom energies and the standard deviation associated with each value. By default the std will be 1 if no value was calculated or is not available (formation energy case)

    Source code in openqdc/datasets/energies.py
    @dataclass\nclass AtomEnergy:\n    \"\"\"\n    Datastructure to store isolated atom energies\n    and the std deviation associated to the value.\n    By default the std will be 1 if no value was calculated\n    or not available (formation energy case)\n    \"\"\"\n\n    mean: np.array\n    std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32))\n\n    def __post_init__(self):\n        if not isinstance(self.mean, np.ndarray):\n            self.mean = np.array([self.mean], dtype=np.float32)\n\n    def append(self, other: \"AtomEnergy\"):\n        \"\"\"\n        Append the mean and std of another atom energy\n        \"\"\"\n        self.mean = np.append(self.mean, other.mean)\n        self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomEnergy.append","title":"append(other)","text":"

    Append the mean and std of another atom energy

    Source code in openqdc/datasets/energies.py
    def append(self, other: \"AtomEnergy\"):\n    \"\"\"\n    Append the mean and std of another atom energy\n    \"\"\"\n    self.mean = np.append(self.mean, other.mean)\n    self.std = np.append(self.std, other.std)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.AtomSpecies","title":"AtomSpecies dataclass","text":"

    Structure that defines a tuple of chemical species and charge and provides hashing and automatic conversion from atomic number to chemical symbol

    Source code in openqdc/datasets/energies.py
    @dataclass(frozen=False, eq=True)\nclass AtomSpecies:\n    \"\"\"\n    Structure that defines a tuple of chemical specie and charge\n    and provide hash and automatic conversion from atom number to\n    checmical symbol\n    \"\"\"\n\n    symbol: Union[str, int]\n    charge: int = 0\n\n    def __post_init__(self):\n        if not isinstance(self.symbol, str):\n            self.symbol = ATOM_SYMBOLS[self.symbol]\n        self.number = ATOMIC_NUMBERS[self.symbol]\n\n    def __hash__(self):\n        return hash((self.symbol, self.charge))\n\n    def __eq__(self, other):\n        if not isinstance(other, AtomSpecies):\n            symbol, charge = other[0], other[1]\n            other = AtomSpecies(symbol=symbol, charge=charge)\n        return (self.number, self.charge) == (other.number, other.charge)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface","title":"IsolatedEnergyInterface","text":"

    Bases: ABC

    Abstract class that defines the interface for the different implementation of an isolated atom energy value

    Source code in openqdc/datasets/energies.py
    class IsolatedEnergyInterface(ABC):\n    \"\"\"\n    Abstract class that defines the interface for the\n    different implementation of an isolated atom energy value\n    \"\"\"\n\n    def __init__(self, data, **kwargs):\n        \"\"\"\n        Parameters:\n            data : openqdc.datasets.Dataset\n                Dataset object that contains the information\n                about the isolated atom energies. Info will be passed\n                by references\n            kwargs : dict\n                Additional arguments that will be passed to the\n                selected energy class. Mostly used for regression\n                to pass the regressor_kwargs.\n        \"\"\"\n        self._e0_matrixs = []\n        self._e0_dict = None\n        self.kwargs = kwargs\n        self.data = data\n        self._post_init()\n\n    @property\n    def refit(self) -> bool:\n        return self.data.refit_e0s\n\n    @abstractmethod\n    def _post_init(self):\n        \"\"\"\n        Main method to fetch/compute/recomputed the isolated atom energies.\n        Need to be implemented in all child classes.\n        \"\"\"\n        pass\n\n    def __len__(self):\n        return len(self.data.energy_methods)\n\n    @property\n    def e0_matrix(self) -> np.ndarray:\n        \"\"\"\n        Return the isolated atom energies matrixes\n\n        Returns:\n            Matrix Array with the isolated atom energies\n        \"\"\"\n        return np.array(self._e0_matrixs)\n\n    @property\n    def e0_dict(self) -> Dict:\n        \"\"\"\n        Return the isolated atom energies dict\n\n        Returns:\n            Dictionary with the isolated atom energies\n        \"\"\"\n\n        return self._e0s_dict\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_dict","title":"e0_dict: Dict property","text":"

    Return the isolated atom energies dict

    Returns:

    Type Description Dict

    Dictionary with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.e0_matrix","title":"e0_matrix: np.ndarray property","text":"

    Return the isolated atom energies matrices

    Returns:

    Type Description ndarray

    Matrix Array with the isolated atom energies

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.IsolatedEnergyInterface.__init__","title":"__init__(data, **kwargs)","text":"

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {} Source code in openqdc/datasets/energies.py
    def __init__(self, data, **kwargs):\n    \"\"\"\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n    \"\"\"\n    self._e0_matrixs = []\n    self._e0_dict = None\n    self.kwargs = kwargs\n    self.data = data\n    self._post_init()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.NullEnergy","title":"NullEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns a null (zeros) matrix for the isolated atom energies in case no energies are available.

    Source code in openqdc/datasets/energies.py
    class NullEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a null (zeros) matrix for the isolated atom energies in case\n    of no energies are available.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for _ in self.data.__energy_methods__:\n            for key, values in PotentialMethod.NONE.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.PhysicalEnergy","title":"PhysicalEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that returns physical (SE, DFT, etc.) isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class PhysicalEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that returns a physical (SE,DFT,etc) isolated atom energies.\n    \"\"\"\n\n    def _assembly_e0_dict(self):\n        datum = {}\n        for method in self.data.__energy_methods__:\n            for key, values in method.atom_energies_dict.items():\n                atm = AtomSpecies(*key)\n                ens = AtomEnergy(values)\n                if atm not in datum:\n                    datum[atm] = ens\n                else:\n                    datum[atm].append(ens)\n        self._e0s_dict = datum\n\n    def _post_init(self):\n        self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__]\n        self._assembly_e0_dict()\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy","title":"RegressionEnergy","text":"

    Bases: IsolatedEnergyInterface

    Class that computes and returns the regressed isolated atom energies.

    Source code in openqdc/datasets/energies.py
    class RegressionEnergy(IsolatedEnergyInterface):\n    \"\"\"\n    Class that compute and returns the regressed isolated atom energies.\n    \"\"\"\n\n    def _post_init(self):\n        if not self.attempt_load() or self.refit:\n            self.regressor = Regressor.from_openqdc_dataset(self.data, **self.kwargs)\n            E0s, cov = self._compute_regression_e0s()\n            self._set_lin_atom_species_dict(E0s, cov)\n        self._set_linear_e0s()\n\n    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Try to compute the regressed isolated atom energies.\n        raise an error if the regression fails.\n        return the regressed isolated atom energies and the uncertainty values.\n\n        Returns:\n            Tuple with the regressed isolated atom energies and the uncertainty values of the regression\n            if available.\n        \"\"\"\n        try:\n            E0s, cov = self.regressor.solve()\n        except np.linalg.LinAlgError:\n            logger.warning(f\"Failed to compute E0s using {self.regressor.solver_type} regression.\")\n            raise np.linalg.LinAlgError\n        return E0s, cov\n\n    def _set_lin_atom_species_dict(self, E0s, covs) -> None:\n        \"\"\"\n        Set the regressed isolated atom energies in a dictionary format\n        and Save the values in a pickle file to easy loading.\n        \"\"\"\n        atomic_energies_dict = {}\n        for i, z in enumerate(self.regressor.numbers):\n            for charge in range(-10, 11):\n                atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i])\n            # atomic_energies_dict[z] = E0s[i]\n        self._e0s_dict = atomic_energies_dict\n        self.save_e0s()\n\n    def _set_linear_e0s(self) -> None:\n        \"\"\"\n        Transform the e0s dictionary into the correct e0s\n        matrix format.\n        \"\"\"\n        new_e0s = 
[np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]\n        for z, e0 in self._e0s_dict.items():\n            for i in range(len(self)):\n                # new_e0s[i][z, :] = e0[i]\n                new_e0s[i][z.number, z.charge] = e0.mean[i]\n            # for atom_sp, values in\n        self._e0_matrixs = new_e0s\n\n    def save_e0s(self) -> None:\n        \"\"\"\n        Save the regressed isolated atom energies in a pickle file.\n        \"\"\"\n        save_pkl(self._e0s_dict, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Try to load the regressed isolated atom energies from the\n        object pickle file and return the success of the operation.\n        \"\"\"\n        try:\n            self._e0s_dict = load_pkl(self.preprocess_path)\n            logger.info(f\"Found energy file for {str(self)}.\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Energy file for {str(self)} not found.\")\n            return False\n\n    @property\n    def preprocess_path(self):\n        \"\"\"\n        Return the path to the object pickle file.\n        \"\"\"\n        path = p_join(self.data.root, \"preprocessed\", str(self) + \".pkl\")\n        return path\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.preprocess_path","title":"preprocess_path property","text":"

    Return the path to the object pickle file.

    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.attempt_load","title":"attempt_load()","text":"

    Try to load the regressed isolated atom energies from the object pickle file and return the success of the operation.

    Source code in openqdc/datasets/energies.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Try to load the regressed isolated atom energies from the\n    object pickle file and return the success of the operation.\n    \"\"\"\n    try:\n        self._e0s_dict = load_pkl(self.preprocess_path)\n        logger.info(f\"Found energy file for {str(self)}.\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Energy file for {str(self)} not found.\")\n        return False\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.RegressionEnergy.save_e0s","title":"save_e0s()","text":"

    Save the regressed isolated atom energies in a pickle file.

    Source code in openqdc/datasets/energies.py
    def save_e0s(self) -> None:\n    \"\"\"\n    Save the regressed isolated atom energies in a pickle file.\n    \"\"\"\n    save_pkl(self._e0s_dict, self.preprocess_path)\n
    "},{"location":"API/e0_dispatcher.html#openqdc.datasets.energies.dispatch_factory","title":"dispatch_factory(data, **kwargs)","text":"

    Factory function that selects the correct energy class for fetching or calculating the isolated atom energies.

    Parameters:

    Name Type Description Default data

    openqdc.datasets.Dataset Dataset object that contains the information about the isolated atom energies. Info will be passed by reference

    required kwargs

    dict Additional arguments that will be passed to the selected energy class. Mostly used for regression to pass the regressor_kwargs.

    {}

    Returns:

    Type Description IsolatedEnergyInterface

    Initialized IsolatedEnergyInterface-like object

    Source code in openqdc/datasets/energies.py
    def dispatch_factory(data: Any, **kwargs: Dict) -> \"IsolatedEnergyInterface\":\n    \"\"\"\n    Factory function that select the correct\n    energy class for the fetching/calculation\n    of isolated atom energies.\n\n    Parameters:\n        data : openqdc.datasets.Dataset\n            Dataset object that contains the information\n            about the isolated atom energies. Info will be passed\n            by references\n        kwargs : dict\n            Additional arguments that will be passed to the\n            selected energy class. Mostly used for regression\n            to pass the regressor_kwargs.\n\n    Returns:\n        Initialized IsolatedEnergyInterface-like object\n    \"\"\"\n    if data.energy_type == \"formation\":\n        return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"regression\":\n        try:\n            return RegressionEnergy(data, **kwargs)\n        except np.linalg.LinAlgError:\n            logger.warning(\"Error! Using physical energies instead.\")\n            return PhysicalEnergy(data, **kwargs)\n    elif data.energy_type == \"null\":\n        return NullEnergy(data, **kwargs)\n
    "},{"location":"API/formats.html","title":"Format loading","text":""},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure","title":"GeneralStructure","text":"

    Bases: ABC

    Abstract factory class for dataset types in the openQDC package.

    Source code in openqdc/datasets/structure.py
    class GeneralStructure(ABC):\n    \"\"\"\n    Abstract Factory class for datasets type in the openQDC package.\n    \"\"\"\n\n    _ext: Optional[str] = None\n    _extra_files: Optional[List[str]] = None\n\n    @property\n    def ext(self):\n        return self._ext\n\n    @property\n    @abstractmethod\n    def load_fn(self) -> Callable:\n        \"\"\"\n        Function to use for loading the data.\n        Must be implemented by the child class.\n\n        Returns:\n            the function to use for loading the data\n        \"\"\"\n        raise NotImplementedError\n\n    def add_extension(self, filename: str) -> str:\n        \"\"\"\n        Add the correct extension to a filename\n\n        Parameters:\n            filename:  the filename to add the extension to\n\n        Returns:\n            the filename with the extension\n        \"\"\"\n        return filename + self.ext\n\n    @abstractmethod\n    def save_preprocess(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_dict: Dict[str, np.ndarray],\n        extra_data_keys: List[str],\n        extra_data_types: Dict[str, type],\n    ) -> List[str]:\n        \"\"\"\n        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n        Must be implemented by the child class.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_dict:        dictionary of data to save\n            extra_data_keys:  list of keys to load from the extra data file\n            extra_data_types: dictionary of data types for each key\n        \"\"\"\n        raise NotImplementedError\n\n    @abstractmethod\n    def load_extra_files(\n        self,\n        data: Dict[str, np.ndarray],\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        pkl_data_keys: List[str],\n      
  overwrite: bool,\n    ):\n        \"\"\"\n        Load extra files required to define other types of data.\n        Must be implemented by the child class.\n\n        Parameters:\n            data:  dictionary of data to load\n            preprocess_path:  path to the preprocessed data file\n            data_keys:    list of keys to load from the data file\n            pkl_data_keys:   list of keys to load from the extra files\n            overwrite:   whether to overwrite the local cache\n        \"\"\"\n        raise NotImplementedError\n\n    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n        \"\"\"\n        Join a path and a filename and add the correct extension.\n\n        Parameters:\n            path:  the path to join\n            filename:  the filename to join\n\n        Returns:\n            the joined path with the correct extension\n        \"\"\"\n        return p_join(path, self.add_extension(filename))\n\n    def load_data(\n        self,\n        preprocess_path: Union[str, PathLike],\n        data_keys: List[str],\n        data_types: Dict[str, np.dtype],\n        data_shapes: Dict[str, Tuple[int, int]],\n        extra_data_keys: List[str],\n        overwrite: bool,\n    ):\n        \"\"\"\n        Main method to load the data from a filetype structure like memmap or zarr.\n\n        Parameters:\n            preprocess_path:  path to the preprocessed data file\n            data_keys:        list of keys to load from the data file\n            data_types:       dictionary of data types for each key\n            data_shapes:      dictionary of shapes for each key\n            extra_data_keys:  list of keys to load from the extra data file\n            overwrite:        whether to overwrite the local cache\n        \"\"\"\n        data = {}\n        for key in data_keys:\n            filename = self.join_and_ext(preprocess_path, key)\n            pull_locally(filename, overwrite=overwrite)\n         
   data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n            data[key] = self.unpack(data[key])\n            data[key] = data[key].reshape(*data_shapes[key])\n\n        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n        return data\n\n    def unpack(self, data: any) -> any:\n        \"\"\"\n        Unpack the data from the loaded file.\n\n        Parameters:\n            data:  the data to unpack\n\n        Returns:\n            the unpacked data\n        \"\"\"\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_fn","title":"load_fn: Callable abstractmethod property","text":"

    Function to use for loading the data. Must be implemented by the child class.

    Returns:

    Type Description Callable

    the function to use for loading the data

    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.add_extension","title":"add_extension(filename)","text":"

    Add the correct extension to a filename

    Parameters:

    Name Type Description Default filename str

    the filename to add the extension to

    required

    Returns:

    Type Description str

    the filename with the extension

    Source code in openqdc/datasets/structure.py
    def add_extension(self, filename: str) -> str:\n    \"\"\"\n    Add the correct extension to a filename\n\n    Parameters:\n        filename:  the filename to add the extension to\n\n    Returns:\n        the filename with the extension\n    \"\"\"\n    return filename + self.ext\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.join_and_ext","title":"join_and_ext(path, filename)","text":"

    Join a path and a filename and add the correct extension.

    Parameters:

    Name Type Description Default path Union[str, PathLike]

    the path to join

    required filename str

    the filename to join

    required

    Returns:

    Type Description Union[str, PathLike]

    the joined path with the correct extension

    Source code in openqdc/datasets/structure.py
    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:\n    \"\"\"\n    Join a path and a filename and add the correct extension.\n\n    Parameters:\n        path:  the path to join\n        filename:  the filename to join\n\n    Returns:\n        the joined path with the correct extension\n    \"\"\"\n    return p_join(path, self.add_extension(filename))\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_data","title":"load_data(preprocess_path, data_keys, data_types, data_shapes, extra_data_keys, overwrite)","text":"

    Main method to load the data from a filetype structure like memmap or zarr.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_types Dict[str, dtype]

    dictionary of data types for each key

    required data_shapes Dict[str, Tuple[int, int]]

    dictionary of shapes for each key

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    def load_data(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_types: Dict[str, np.dtype],\n    data_shapes: Dict[str, Tuple[int, int]],\n    extra_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Main method to load the data from a filetype structure like memmap or zarr.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_types:       dictionary of data types for each key\n        data_shapes:      dictionary of shapes for each key\n        extra_data_keys:  list of keys to load from the extra data file\n        overwrite:        whether to overwrite the local cache\n    \"\"\"\n    data = {}\n    for key in data_keys:\n        filename = self.join_and_ext(preprocess_path, key)\n        pull_locally(filename, overwrite=overwrite)\n        data[key] = self.load_fn(filename, mode=\"r\", dtype=data_types[key])\n        data[key] = self.unpack(data[key])\n        data[key] = data[key].reshape(*data_shapes[key])\n\n    data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.load_extra_files","title":"load_extra_files(data, preprocess_path, data_keys, pkl_data_keys, overwrite) abstractmethod","text":"

    Load extra files required to define other types of data. Must be implemented by the child class.

    Parameters:

    Name Type Description Default data Dict[str, ndarray]

    dictionary of data to load

    required preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required pkl_data_keys List[str]

    list of keys to load from the extra files

    required overwrite bool

    whether to overwrite the local cache

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef load_extra_files(\n    self,\n    data: Dict[str, np.ndarray],\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    pkl_data_keys: List[str],\n    overwrite: bool,\n):\n    \"\"\"\n    Load extra files required to define other types of data.\n    Must be implemented by the child class.\n\n    Parameters:\n        data:  dictionary of data to load\n        preprocess_path:  path to the preprocessed data file\n        data_keys:    list of keys to load from the data file\n        pkl_data_keys:   list of keys to load from the extra files\n        overwrite:   whether to overwrite the local cache\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.save_preprocess","title":"save_preprocess(preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) abstractmethod","text":"

    Save the preprocessed data to the cache directory and optionally upload it to the remote storage. Must be implemented by the child class.

    Parameters:

    Name Type Description Default preprocess_path Union[str, PathLike]

    path to the preprocessed data file

    required data_keys List[str]

    list of keys to load from the data file

    required data_dict Dict[str, ndarray]

    dictionary of data to save

    required extra_data_keys List[str]

    list of keys to load from the extra data file

    required extra_data_types Dict[str, type]

    dictionary of data types for each key

    required Source code in openqdc/datasets/structure.py
    @abstractmethod\ndef save_preprocess(\n    self,\n    preprocess_path: Union[str, PathLike],\n    data_keys: List[str],\n    data_dict: Dict[str, np.ndarray],\n    extra_data_keys: List[str],\n    extra_data_types: Dict[str, type],\n) -> List[str]:\n    \"\"\"\n    Save the preprocessed data to the cache directory and optionally upload it to the remote storage.\n    Must be implemented by the child class.\n\n    Parameters:\n        preprocess_path:  path to the preprocessed data file\n        data_keys:        list of keys to load from the data file\n        data_dict:        dictionary of data to save\n        extra_data_keys:  list of keys to load from the extra data file\n        extra_data_types: dictionary of data types for each key\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.GeneralStructure.unpack","title":"unpack(data)","text":"

    Unpack the data from the loaded file.

    Parameters:

    Name Type Description Default data any

    the data to unpack

    required

    Returns:

    Type Description any

    the unpacked data

    Source code in openqdc/datasets/structure.py
    def unpack(self, data: any) -> any:\n    \"\"\"\n    Unpack the data from the loaded file.\n\n    Parameters:\n        data:  the data to unpack\n\n    Returns:\n        the unpacked data\n    \"\"\"\n    return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.MemMapDataset","title":"MemMapDataset","text":"

    Bases: GeneralStructure

    Dataset structure for memory-mapped numpy arrays and props.pkl files.

    Source code in openqdc/datasets/structure.py
    class MemMapDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for memory-mapped numpy arrays and props.pkl files.\n    \"\"\"\n\n    _ext = \".mmap\"\n    _extra_files = [\"props.pkl\"]\n\n    @property\n    def load_fn(self):\n        return np.memmap\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        local_paths = []\n        for key in data_keys:\n            local_path = self.join_and_ext(preprocess_path, key)\n            out = np.memmap(local_path, mode=\"w+\", dtype=data_dict[key].dtype, shape=data_dict[key].shape)\n            out[:] = data_dict.pop(key)[:]\n            out.flush()\n            local_paths.append(local_path)\n\n        # save smiles and subset\n        local_path = p_join(preprocess_path, \"props.pkl\")\n\n        # assert that (required) pkl keys are present in data_dict\n        assert all([key in data_dict.keys() for key in extra_data_keys])\n\n        # store unique and inverse indices for str-based pkl keys\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        with open(local_path, \"wb\") as f:\n            pkl.dump(data_dict, f)\n\n        local_paths.append(local_path)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = p_join(preprocess_path, \"props.pkl\")\n        pull_locally(filename, overwrite=overwrite)\n        with open(filename, \"rb\") as f:\n            tmp = pkl.load(f)\n            all_pkl_keys = set(tmp.keys()) - set(data_keys)\n            # assert required pkl_keys are present in all_pkl_keys\n            assert all([key in all_pkl_keys for key in pkl_data_keys])\n            for key in all_pkl_keys:\n                x = tmp.pop(key)\n                if len(x) == 2:\n                    data[key] = x[0][x[1]]\n     
           else:\n                    data[key] = x\n        return data\n
    "},{"location":"API/formats.html#openqdc.datasets.structure.ZarrDataset","title":"ZarrDataset","text":"

    Bases: GeneralStructure

    Dataset structure for zarr files.

    Source code in openqdc/datasets/structure.py
    class ZarrDataset(GeneralStructure):\n    \"\"\"\n    Dataset structure for zarr files.\n    \"\"\"\n\n    _ext = \".zip\"\n    _extra_files = [\"metadata.zip\"]\n    _zarr_version = 2\n\n    @property\n    def load_fn(self):\n        return zarr.open\n\n    def unpack(self, data):\n        return data[:]\n\n    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:\n        # os.makedirs(p_join(ds.root, \"zips\",  ds.__name__), exist_ok=True)\n        local_paths = []\n        for key, value in data_dict.items():\n            if key not in data_keys:\n                continue\n            zarr_path = self.join_and_ext(preprocess_path, key)\n            value = data_dict.pop(key)\n            z = zarr.open(\n                zarr.storage.ZipStore(zarr_path),\n                \"w\",\n                zarr_version=self._zarr_version,\n                shape=value.shape,\n                dtype=value.dtype,\n            )\n            z[:] = value[:]\n            local_paths.append(zarr_path)\n            # if key in attrs:\n            #    z.attrs.update(attrs[key])\n\n        metadata = p_join(preprocess_path, \"metadata.zip\")\n\n        group = zarr.group(zarr.storage.ZipStore(metadata))\n\n        for key in extra_data_keys:\n            if extra_data_types[key] == str:\n                data_dict[key] = np.unique(data_dict[key], return_inverse=True)\n\n        for key, value in data_dict.items():\n            # sub=group.create_group(key)\n            if key in [\"name\", \"subset\"]:\n                data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)\n                data[:] = value[0][:]\n                data2 = group.create_dataset(key + \"_ptr\", shape=value[1].shape, dtype=np.int32)\n                data2[:] = value[1][:]\n            else:\n                data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)\n                data[:] = value[:]\n        
local_paths.append(metadata)\n        return local_paths\n\n    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):\n        filename = self.join_and_ext(preprocess_path, \"metadata\")\n        pull_locally(filename, overwrite=overwrite)\n        tmp = self.load_fn(filename)\n        all_pkl_keys = set(tmp.keys()) - set(data_keys)\n        # assert required pkl_keys are present in all_pkl_keys\n        assert all([key in all_pkl_keys for key in pkl_data_keys])\n        for key in all_pkl_keys:\n            if key not in pkl_data_keys:\n                data[key] = tmp[key][:][tmp[key][:]]\n            else:\n                data[key] = tmp[key][:]\n        return data\n
    "},{"location":"API/methods.html","title":"QM Methods","text":""},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod","title":"InteractionMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class InteractionMethod(QmMethod):\n    CCSD_T_NN = Functional.CCSDT, BasisSet.NN\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    DCCSDT_HA_DZ = Functional.DCCSDT, BasisSet.HA_DZ\n    DCCSDT_HA_TZ = Functional.DCCSDT, BasisSet.HA_TZ\n    DLPNO_CCSDT = Functional.DLPNO_CCSDT, BasisSet.NONE\n    DLPNO_CCSDT0 = (\n        Functional.DLPNO_CCSDT0,\n        BasisSet.NONE,\n    )\n    FN_DMC = Functional.FN_DMC, BasisSet.NONE\n    FIXED = Functional.FIXED, BasisSet.NONE\n    LNO_CCSDT = Functional.LNO_CCSDT, BasisSet.NONE\n    MP2_CBS = Functional.MP2, BasisSet.CBS\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MP2_5_CBS_ADZ = Functional.MP2_5, BasisSet.CBS_ADZ\n    MP2C_CBS = Functional.MP2C, BasisSet.CBS\n    QCISDT_CBS = Functional.QCISDT, BasisSet.CBS\n    SAPT0_AUG_CC_PWCVXZ = Functional.SAPT0, BasisSet.AUG_CC_PWCVXZ\n    SAPT0_JUN_CC_PVDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDZ\n    SAPT0_JUN_CC_PVDDZ = Functional.SAPT0, BasisSet.JUN_CC_PVDDZ\n    SAPT0_AUG_CC_PVDDZ = Functional.SAPT0, BasisSet.AUG_CC_PVDDZ\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get an empty atomization energy dictionary because Interaction methods don't require this\"\"\"\n        return {}\n
    "},{"location":"API/methods.html#openqdc.methods.enums.InteractionMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get an empty atomization energy dictionary because Interaction methods don't require this

    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod","title":"PotentialMethod","text":"

    Bases: QmMethod

    Source code in openqdc/methods/enums.py
    class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1\n    B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP\n    B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ\n    B1LYP_VWN5_TZP = Functional.B1LYP_VWN5, BasisSet.TZP\n    B1PW91_VWN5_DZP = Functional.B1PW91_VWN5, BasisSet.DZP\n    B1PW91_VWN5_SZ = Functional.B1PW91_VWN5, BasisSet.SZ\n    B1PW91_VWN5_TZP = Functional.B1PW91_VWN5, BasisSet.TZP\n    B3LYP_STO3G = Functional.B3LYP, BasisSet.STO3G  # TODO: calculate e0s\n    B3LYP_VWN5_DZP = Functional.B3LYP_VWN5, BasisSet.DZP\n    B3LYP_VWN5_SZ = Functional.B3LYP_VWN5, BasisSet.SZ\n    B3LYP_VWN5_TZP = Functional.B3LYP_VWN5, BasisSet.TZP\n    B3LYP_S_VWN5_DZP = Functional.B3LYP_S_VWN5, BasisSet.DZP\n    B3LYP_S_VWN5_SZ = Functional.B3LYP_S_VWN5, BasisSet.SZ\n    B3LYP_S_VWN5_TZP = Functional.B3LYP_S_VWN5, BasisSet.TZP\n    B3LYP_D_DZP = Functional.B3LYPD, BasisSet.DZP\n    B3LYP_D_SZ = Functional.B3LYPD, BasisSet.SZ\n    B3LYP_D_TZP = Functional.B3LYPD, BasisSet.TZP\n    B3LYP_D3_BJ_DEF2_TZVP = Functional.B3LYP_D3_BJ, BasisSet.DEF2_TZVP\n    B3LYP_6_31G_D = Functional.B3LYP, BasisSet.GSTAR\n    B3LYP_DEF2_TZVP = Functional.B3LYP, BasisSet.DEF2_TZVP\n    B97_1_DZP = Functional.B97_1, BasisSet.DZP\n    B97_1_SZ = Functional.B97_1, BasisSet.SZ\n    B97_1_TZP = Functional.B97_1, BasisSet.TZP\n    B97_2_DZP = Functional.B97_2, BasisSet.DZP\n    B97_2_SZ = Functional.B97_2, BasisSet.SZ\n    B97_2_TZP = Functional.B97_2, BasisSet.TZP\n    B97_D_DZP = Functional.B97_D, BasisSet.DZP\n    B97_D_SZ = Functional.B97_D, BasisSet.SZ\n    B97_D_TZP = Functional.B97_D, BasisSet.TZP\n    B97_DZP = Functional.B97, BasisSet.DZP\n    B97_SZ = Functional.B97, BasisSet.SZ\n    B97_TZP = Functional.B97, BasisSet.TZP\n    BECKE00_X_ONLY_DZP = Functional.BECKE00_X_ONLY, BasisSet.DZP\n    BECKE00_X_ONLY_SZ = Functional.BECKE00_X_ONLY, BasisSet.SZ\n    BECKE00_X_ONLY_TZP = Functional.BECKE00_X_ONLY, BasisSet.TZP\n    BECKE00_DZP = Functional.BECKE00, 
BasisSet.DZP\n    BECKE00_SZ = Functional.BECKE00, BasisSet.SZ\n    BECKE00_TZP = Functional.BECKE00, BasisSet.TZP\n    BECKE00X_XC_DZP = Functional.BECKE00X_XC, BasisSet.DZP\n    BECKE00X_XC_SZ = Functional.BECKE00X_XC, BasisSet.SZ\n    BECKE00X_XC_TZP = Functional.BECKE00X_XC, BasisSet.TZP\n    BECKE88X_BR89C_DZP = Functional.BECKE88X_BR89C, BasisSet.DZP\n    BECKE88X_BR89C_SZ = Functional.BECKE88X_BR89C, BasisSet.SZ\n    BECKE88X_BR89C_TZP = Functional.BECKE88X_BR89C, BasisSet.TZP\n    BHANDH_DZP = Functional.BHANDH, BasisSet.DZP\n    BHANDH_SZ = Functional.BHANDH, BasisSet.SZ\n    BHANDH_TZP = Functional.BHANDH, BasisSet.TZP\n    BHANDHLYP_DZP = Functional.BHANDHLYP, BasisSet.DZP\n    BHANDHLYP_SZ = Functional.BHANDHLYP, BasisSet.SZ\n    BHANDHLYP_TZP = Functional.BHANDHLYP, BasisSet.TZP\n    BLAP3_DZP = Functional.BLAP3, BasisSet.DZP\n    BLAP3_SZ = Functional.BLAP3, BasisSet.SZ\n    BLAP3_TZP = Functional.BLAP3, BasisSet.TZP\n    BLYP_D_DZP = Functional.BLYPD, BasisSet.DZP\n    BLYP_D_SZ = Functional.BLYPD, BasisSet.SZ\n    BLYP_D_TZP = Functional.BLYPD, BasisSet.TZP\n    BLYP_DZP = Functional.BLYP, BasisSet.DZP\n    BLYP_SZ = Functional.BLYP, BasisSet.SZ\n    BLYP_TZP = Functional.BLYP, BasisSet.TZP\n    BMTAU1_DZP = Functional.BMTAU1, BasisSet.DZP\n    BMTAU1_SZ = Functional.BMTAU1, BasisSet.SZ\n    BMTAU1_TZP = Functional.BMTAU1, BasisSet.TZP\n    BOP_DZP = Functional.BOP, BasisSet.DZP\n    BOP_SZ = Functional.BOP, BasisSet.SZ\n    BOP_TZP = Functional.BOP, BasisSet.TZP\n    BP_DZP = Functional.BP, BasisSet.DZP\n    BP_SZ = Functional.BP, BasisSet.SZ\n    BP_TZP = Functional.BP, BasisSet.TZP\n    BP86_D_DZP = Functional.BP86_D, BasisSet.DZP\n    BP86_D_SZ = Functional.BP86_D, BasisSet.SZ\n    BP86_D_TZP = Functional.BP86_D, BasisSet.TZP\n    CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS\n    CCSD_T_CC_PVTZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_T_CC_PVDZ = Functional.CCSDT, BasisSet.CC_PVDZ\n    CCSD_CC_PVDZ = Functional.CCSD, BasisSet.CC_PVDZ\n\n    
DFT3B = Functional.DFT3B, BasisSet.NONE\n    DSD_BLYP_D3_BJ_DEF2_TZVP = Functional.DSD_BLYP_D3_BJ, BasisSet.DEF2_TZVP\n    FT97_DZP = Functional.FT97, BasisSet.DZP\n    FT97_SZ = Functional.FT97, BasisSet.SZ\n    FT97_TZP = Functional.FT97, BasisSet.TZP\n    GFN1_XTB = Functional.GFN1_XTB, BasisSet.NONE\n    GFN2_XTB = Functional.GFN2_XTB, BasisSet.NONE\n    HCTH_120_DZP = Functional.HCTH_120, BasisSet.DZP\n    HCTH_120_SZ = Functional.HCTH_120, BasisSet.SZ\n    HCTH_120_TZP = Functional.HCTH_120, BasisSet.TZP\n    HCTH_147_DZP = Functional.HCTH_147, BasisSet.DZP\n    HCTH_147_SZ = Functional.HCTH_147, BasisSet.SZ\n    HCTH_147_TZP = Functional.HCTH_147, BasisSet.TZP\n    HCTH_407_DZP = Functional.HCTH_407, BasisSet.DZP\n    HCTH_407_SZ = Functional.HCTH_407, BasisSet.SZ\n    HCTH_407_TZP = Functional.HCTH_407, BasisSet.TZP\n    HCTH_93_DZP = Functional.HCTH_93, BasisSet.DZP\n    HCTH_93_SZ = Functional.HCTH_93, BasisSet.SZ\n    HCTH_93_TZP = Functional.HCTH_93, BasisSet.TZP\n    HF_DEF2_TZVP = Functional.HF, BasisSet.DEF2_TZVP\n    HF_CC_PVDZ = (\n        Functional.HF,\n        BasisSet.CC_PVDZ,\n    )\n    HF_CC_PVQZ = (\n        Functional.HF,\n        BasisSet.CC_PVQZ,\n    )\n    HF_CC_PVTZ = (\n        Functional.HF,\n        BasisSet.CC_PVTZ,\n    )\n    KCIS_MODIFIED_DZP = Functional.KCIS_MODIFIED, BasisSet.DZP\n    KCIS_MODIFIED_SZ = Functional.KCIS_MODIFIED, BasisSet.SZ\n    KCIS_MODIFIED_TZP = Functional.KCIS_MODIFIED, BasisSet.TZP\n    KCIS_ORIGINAL_DZP = Functional.KCIS_ORIGINAL, BasisSet.DZP\n    KCIS_ORIGINAL_SZ = Functional.KCIS_ORIGINAL, BasisSet.SZ\n    KCIS_ORIGINAL_TZP = Functional.KCIS_ORIGINAL, BasisSet.TZP\n    KMLYP_VWN5_DZP = Functional.KMLYP_VWN5, BasisSet.DZP\n    KMLYP_VWN5_SZ = Functional.KMLYP_VWN5, BasisSet.SZ\n    KMLYP_VWN5_TZP = Functional.KMLYP_VWN5, BasisSet.TZP\n    KT1_DZP = Functional.KT1, BasisSet.DZP\n    KT1_SZ = Functional.KT1, BasisSet.SZ\n    KT1_TZP = Functional.KT1, BasisSet.TZP\n    KT2_DZP = Functional.KT2, 
BasisSet.DZP\n    KT2_SZ = Functional.KT2, BasisSet.SZ\n    KT2_TZP = Functional.KT2, BasisSet.TZP\n    LDA_VWN_DZP = Functional.LDA_VWN, BasisSet.DZP\n    LDA_VWN_SZ = Functional.LDA_VWN, BasisSet.SZ\n    LDA_VWN_TZP = Functional.LDA_VWN, BasisSet.TZP\n    M05_2X_DZP = Functional.M05_2X, BasisSet.DZP\n    M05_2X_SZ = Functional.M05_2X, BasisSet.SZ\n    M05_2X_TZP = Functional.M05_2X, BasisSet.TZP\n    M05_DZP = Functional.M05, BasisSet.DZP\n    M05_SZ = Functional.M05, BasisSet.SZ\n    M05_TZP = Functional.M05, BasisSet.TZP\n    M06_2X_DZP = Functional.M06_2X, BasisSet.DZP\n    M06_2X_SZ = Functional.M06_2X, BasisSet.SZ\n    M06_2X_TZP = Functional.M06_2X, BasisSet.TZP\n    M06_L_DZP = Functional.M06_L, BasisSet.DZP\n    M06_L_SZ = Functional.M06_L, BasisSet.SZ\n    M06_L_TZP = Functional.M06_L, BasisSet.TZP\n    M06_DZP = Functional.M06, BasisSet.DZP\n    M06_SZ = Functional.M06, BasisSet.SZ\n    M06_TZP = Functional.M06, BasisSet.TZP\n    MP2_CC_PVDZ = Functional.MP2, BasisSet.CC_PVDZ\n    MP2_CC_PVQZ = Functional.MP2, BasisSet.CC_PVQZ\n    MP2_CC_PVTZ = Functional.MP2, BasisSet.CC_PVTZ\n    MPBE_DZP = Functional.MPBE, BasisSet.DZP\n    MPBE_SZ = Functional.MPBE, BasisSet.SZ\n    MPBE_TZP = Functional.MPBE, BasisSet.TZP\n    MPBE0KCIS_DZP = Functional.MPBE0KCIS, BasisSet.DZP\n    MPBE0KCIS_SZ = Functional.MPBE0KCIS, BasisSet.SZ\n    MPBE0KCIS_TZP = Functional.MPBE0KCIS, BasisSet.TZP\n    MPBE1KCIS_DZP = Functional.MPBE1KCIS, BasisSet.DZP\n    MPBE1KCIS_SZ = Functional.MPBE1KCIS, BasisSet.SZ\n    MPBE1KCIS_TZP = Functional.MPBE1KCIS, BasisSet.TZP\n    MPBEKCIS_DZP = Functional.MPBEKCIS, BasisSet.DZP\n    MPBEKCIS_SZ = Functional.MPBEKCIS, BasisSet.SZ\n    MPBEKCIS_TZP = Functional.MPBEKCIS, BasisSet.TZP\n    MPW_DZP = Functional.MPW, BasisSet.DZP\n    MPW_SZ = Functional.MPW, BasisSet.SZ\n    MPW_TZP = Functional.MPW, BasisSet.TZP\n    MPW1K_DZP = Functional.MPW1K, BasisSet.DZP\n    MPW1K_SZ = Functional.MPW1K, BasisSet.SZ\n    MPW1K_TZP = Functional.MPW1K, 
BasisSet.TZP\n    MPW1PW_DZP = Functional.MPW1PW, BasisSet.DZP\n    MPW1PW_SZ = Functional.MPW1PW, BasisSet.SZ\n    MPW1PW_TZP = Functional.MPW1PW, BasisSet.TZP\n    MVS_DZP = Functional.MVS, BasisSet.DZP\n    MVS_SZ = Functional.MVS, BasisSet.SZ\n    MVS_TZP = Functional.MVS, BasisSet.TZP\n    MVSX_DZP = Functional.MVSX, BasisSet.DZP\n    MVSX_SZ = Functional.MVSX, BasisSet.SZ\n    MVSX_TZP = Functional.MVSX, BasisSet.TZP\n    O3LYP_VWN5_DZP = Functional.O3LYP_VWN5, BasisSet.DZP\n    O3LYP_VWN5_SZ = Functional.O3LYP_VWN5, BasisSet.SZ\n    O3LYP_VWN5_TZP = Functional.O3LYP_VWN5, BasisSet.TZP\n    OLAP3_DZP = Functional.OLAP3, BasisSet.DZP\n    OLAP3_SZ = Functional.OLAP3, BasisSet.SZ\n    OLAP3_TZP = Functional.OLAP3, BasisSet.TZP\n    OLYP_DZP = Functional.OLYP, BasisSet.DZP\n    OLYP_SZ = Functional.OLYP, BasisSet.SZ\n    OLYP_TZP = Functional.OLYP, BasisSet.TZP\n    OPBE_DZP = Functional.OPBE, BasisSet.DZP\n    OPBE_SZ = Functional.OPBE, BasisSet.SZ\n    OPBE_TZP = Functional.OPBE, BasisSet.TZP\n    OPBE0_DZP = Functional.OPBE0, BasisSet.DZP\n    OPBE0_SZ = Functional.OPBE0, BasisSet.SZ\n    OPBE0_TZP = Functional.OPBE0, BasisSet.TZP\n    OPERDEW_DZP = Functional.OPERDEW, BasisSet.DZP\n    OPERDEW_SZ = Functional.OPERDEW, BasisSet.SZ\n    OPERDEW_TZP = Functional.OPERDEW, BasisSet.TZP\n    PBE_D_DZP = Functional.PBE_D, BasisSet.DZP\n    PBE_D_SZ = Functional.PBE_D, BasisSet.SZ\n    PBE_D_TZP = Functional.PBE_D, BasisSet.TZP\n    PBE_D3_BJ_DEF2_TZVP = Functional.PBE_D3_BJ, BasisSet.DEF2_TZVP\n    PBE_DEF2_TZVP = Functional.PBE, BasisSet.DEF2_TZVP\n    PBE_DZP = Functional.PBE, BasisSet.DZP\n    PBE_SZ = Functional.PBE, BasisSet.SZ\n    PBE_TZP = Functional.PBE, BasisSet.TZP\n    PBE0_DZP = Functional.PBE0, BasisSet.DZP\n    PBE0_DEF2_TZVP = Functional.PBE0, BasisSet.DEF2_TZVP\n    PBE0_SZ = Functional.PBE0, BasisSet.SZ\n    PBE0_TZP = Functional.PBE0, BasisSet.TZP\n    PBE0_MBD_DEF2_TZVPP = Functional.PBE0_MBD, BasisSet.DEF2_TZVPPD\n    PBESOL_DZP = 
Functional.PBESOL, BasisSet.DZP\n    PBESOL_SZ = Functional.PBESOL, BasisSet.SZ\n    PBESOL_TZP = Functional.PBESOL, BasisSet.TZP\n    PKZB_DZP = Functional.PKZB, BasisSet.DZP\n    PKZB_SZ = Functional.PKZB, BasisSet.SZ\n    PKZB_TZP = Functional.PKZB, BasisSet.TZP\n    PKZBX_KCISCOR_DZP = Functional.PKZBX_KCISCOR, BasisSet.DZP\n    PKZBX_KCISCOR_SZ = Functional.PKZBX_KCISCOR, BasisSet.SZ\n    PKZBX_KCISCOR_TZP = Functional.PKZBX_KCISCOR, BasisSet.TZP\n    PM6 = Functional.PM6, BasisSet.NONE\n    PW91_DZP = Functional.PW91, BasisSet.DZP\n    PW91_SZ = Functional.PW91, BasisSet.SZ\n    PW91_TZP = Functional.PW91, BasisSet.TZP\n    REVPBE_D3_BJ_DEF2_TZVP = Functional.REVPBE_D3_BJ, BasisSet.DEF2_TZVP\n    REVPBE_DZP = Functional.REVPBE, BasisSet.DZP\n    REVPBE_SZ = Functional.REVPBE, BasisSet.SZ\n    REVPBE_TZP = Functional.REVPBE, BasisSet.TZP\n    REVTPSS_DZP = Functional.REVTPSS, BasisSet.DZP\n    REVTPSS_SZ = Functional.REVTPSS, BasisSet.SZ\n    REVTPSS_TZP = Functional.REVTPSS, BasisSet.TZP\n    RGE2_DZP = Functional.RGE2, BasisSet.DZP\n    RGE2_SZ = Functional.RGE2, BasisSet.SZ\n    RGE2_TZP = Functional.RGE2, BasisSet.TZP\n    RPBE_DZP = Functional.RPBE, BasisSet.DZP\n    RPBE_SZ = Functional.RPBE, BasisSet.SZ\n    RPBE_TZP = Functional.RPBE, BasisSet.TZP\n    SSB_D_DZP = Functional.SSB_D, BasisSet.DZP\n    SSB_D_SZ = Functional.SSB_D, BasisSet.SZ\n    SSB_D_TZP = Functional.SSB_D, BasisSet.TZP\n    SVWN_DEF2_TZVP = Functional.SVWN, BasisSet.DEF2_TZVP\n    TMGGA_DZP = Functional.TMGGA, BasisSet.DZP\n    TMGGA_SZ = Functional.TMGGA, BasisSet.SZ\n    TMGGA_TZP = Functional.TMGGA, BasisSet.TZP\n    TAU_HCTH_HYBRID_DZP = Functional.TAU_HCTH_HYBRID, BasisSet.DZP\n    TAU_HCTH_HYBRID_SZ = Functional.TAU_HCTH_HYBRID, BasisSet.SZ\n    TAU_HCTH_HYBRID_TZP = Functional.TAU_HCTH_HYBRID, BasisSet.TZP\n    TAU_HCTH_DZP = Functional.TAU_HCTH, BasisSet.DZP\n    TAU_HCTH_SZ = Functional.TAU_HCTH, BasisSet.SZ\n    TAU_HCTH_TZP = Functional.TAU_HCTH, BasisSet.TZP\n    
TCSSD_T_CC_PVDZ = Functional.TCSSD_T, BasisSet.CC_PVDZ\n    TPSSD_DZP = Functional.TPSSD, BasisSet.DZP\n    TPSSD_SZ = Functional.TPSSD, BasisSet.SZ\n    TPSSD_TZP = Functional.TPSSD, BasisSet.TZP\n    TPSS_DZP = Functional.TPSS, BasisSet.DZP\n    TPSS_SZ = Functional.TPSS, BasisSet.SZ\n    TPSS_TZP = Functional.TPSS, BasisSet.TZP\n    TPSSH_DEF2_TZVP = Functional.TPSSH, BasisSet.DEF2_TZVP\n    TPSSH_DZP = Functional.TPSSH, BasisSet.DZP\n    TPSSH_SZ = Functional.TPSSH, BasisSet.SZ\n    TPSSH_TZP = Functional.TPSSH, BasisSet.TZP\n    TTM2_1_F = Functional.TTM2_1_F, BasisSet.NONE\n    VS98_X_XC_DZP = Functional.VS98_X_XC, BasisSet.DZP\n    VS98_X_XC_SZ = Functional.VS98_X_XC, BasisSet.SZ\n    VS98_X_XC_TZP = Functional.VS98_X_XC, BasisSet.TZP\n    VS98_X_ONLY_DZP = Functional.VS98_X_ONLY, BasisSet.DZP\n    VS98_X_ONLY_SZ = Functional.VS98_X_ONLY, BasisSet.SZ\n    VS98_X_ONLY_TZP = Functional.VS98_X_ONLY, BasisSet.TZP\n    VS98_DZP = Functional.VS98, BasisSet.DZP\n    VS98_SZ = Functional.VS98, BasisSet.SZ\n    VS98_TZP = Functional.VS98, BasisSet.TZP\n    WB97M_D3BJ_DEF2_TZVPPD = Functional.WB97M_D3BJ, BasisSet.DEF2_TZVPPD\n    WB97X_D_DEF2_SVP = Functional.WB97X_D, BasisSet.DEF2_SVP\n    WB97X_D3_DEF2_TZVP = Functional.WB97X_D3, BasisSet.DEF2_TZVP\n    WB97X_D3_CC_PVDZ = Functional.WB97X_D3, BasisSet.CC_PVDZ\n    WB97X_6_31G_D = Functional.WB97X, BasisSet.GSTAR\n    WB97X_CC_PVTZ = Functional.WB97X, BasisSet.CC_PVTZ\n    X3LYP_VWN5_DZP = Functional.X3LYP_VWN5, BasisSet.DZP\n    X3LYP_VWN5_SZ = Functional.X3LYP_VWN5, BasisSet.SZ\n    X3LYP_VWN5_TZP = Functional.X3LYP_VWN5, BasisSet.TZP\n    XLYP_DZP = Functional.XLYP, BasisSet.DZP\n    XLYP_SZ = Functional.XLYP, BasisSet.SZ\n    XLYP_TZP = Functional.XLYP, BasisSet.TZP\n    NONE = Functional.NONE, BasisSet.NONE\n\n    def _build_default_dict(self):\n        e0_dict = {}\n        for SYMBOL in ATOM_SYMBOLS:\n            for CHARGE in range(-10, 11):\n                e0_dict[(SYMBOL, CHARGE)] = array([0], 
dtype=float32)\n        return e0_dict\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        key = str(self)\n        try:\n            # print(key)\n            energies = atom_energy_collection.get(key, {})\n            if len(energies) == 0:\n                raise\n        except:  # noqa\n            logger.info(f\"No available atomization energy for the QM method {key}. All values are set to 0.\")\n            energies = self._build_default_dict()\n        return energies\n
    "},{"location":"API/methods.html#openqdc.methods.enums.PotentialMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod","title":"QmMethod","text":"

    Bases: Enum

    Source code in openqdc/methods/enums.py
    class QmMethod(Enum):\n    def __init__(self, functional: Functional, basis_set: BasisSet, cost: float = 0):\n        self.functional = functional\n        self.basis_set = basis_set\n        self.cost = cost\n\n    def __str__(self):\n        if self.basis_set != \"\":\n            s = \"/\".join([str(self.functional), str(self.basis_set)])\n        else:\n            s = str(self.functional)\n        return s\n\n    @property\n    def atom_energies_matrix(self):\n        \"\"\"Get the atomization energy matrix\"\"\"\n        energies = self.atom_energies_dict\n        mat = to_e_matrix(energies)\n\n        return mat\n\n    @property\n    def atom_energies_dict(self):\n        \"\"\"Get the atomization energy dictionary\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_dict","title":"atom_energies_dict property","text":"

    Get the atomization energy dictionary

    "},{"location":"API/methods.html#openqdc.methods.enums.QmMethod.atom_energies_matrix","title":"atom_energies_matrix property","text":"

    Get the atomization energy matrix

    "},{"location":"API/methods.html#isolated-atom-energies","title":"Isolated Atom Energies","text":""},{"location":"API/methods.html#openqdc.methods.atom_energies.to_e_matrix","title":"to_e_matrix(atom_energies)","text":"

    Get the matrix of isolated atom energies from a dict of the computed (non-null) values.

    Parameters:

    Name Type Description Default atom_energies Dict

    Dict of energies computed for a given QM method. Keys are pairs of (atom, charge) and values are energy values

    required

    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)

    Type Description ndarray

    Matrix containing the isolated atom energies for each atom and charge written in the form:

            |   | -2 | -1 | 0 | +1 | +2 | <- charges\n        |---|----|----|---|----|----|\n        | 0 |    |    |   |    |    |\n        | 1 |    |    |   |    |    |\n        | 2 |    |    |   |    |    |\n
    Source code in openqdc/methods/atom_energies.py
    def to_e_matrix(atom_energies: Dict) -> np.ndarray:\n    \"\"\"\n    Get the matrix of isolated atom energies for a dict of non-null values calculates\n\n    Parameters:\n        atom_energies: Dict of energies computed for a given QM method.\n            Keys are pairs of (atom, charge) and values are energy values\n\n    Returns: np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)\n        Matrix containing the isolated atom energies for each atom and charge written in the form:\n\n                        |   | -2 | -1 | 0 | +1 | +2 | <- charges\n                        |---|----|----|---|----|----|\n                        | 0 |    |    |   |    |    |\n                        | 1 |    |    |   |    |    |\n                        | 2 |    |    |   |    |    |\n    \"\"\"\n\n    matrix = np.zeros((MAX_ATOMIC_NUMBER, MAX_CHARGE_NUMBER))\n    if len(atom_energies) > 0:\n        for key in atom_energies.keys():\n            try:\n                matrix[ATOMIC_NUMBERS[key[0]], key[1] + MAX_CHARGE] = atom_energies[key]\n            except KeyError:\n                logger.error(f\"Isolated atom energies not found for {key}\")\n    return matrix\n
    "},{"location":"API/properties.html","title":"Defined properties for datasets","text":""},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn","title":"DatasetPropertyMixIn","text":"

    Mixin class for BaseDataset class to add properties that are common to all datasets.

    Source code in openqdc/datasets/properties.py
    class DatasetPropertyMixIn:\n    \"\"\"\n    Mixin class for BaseDataset class to add\n    properties that are common to all datasets.\n    \"\"\"\n\n    @property\n    def atoms_per_molecules(self):\n        try:\n            if hasattr(self, \"_n_atoms\"):\n                return self._n_atoms\n            self._n_atoms = self.data[\"n_atoms\"]\n            return self._n_atoms\n        except:  # noqa\n            return None\n\n    @property\n    def _stats(self):\n        return self.__stats__\n\n    def _compute_average_nb_atoms(self):\n        self.__average_nb_atoms__ = np.mean(self.data[\"n_atoms\"])\n\n    @property\n    def average_n_atoms(self) -> int:\n        \"\"\"\n        Average number of atoms in a molecule in the dataset.\n\n        Returns:\n            Average number of atoms in a molecule in the dataset.\n        \"\"\"\n        if self.__average_nb_atoms__ is None:\n            raise StatisticsNotAvailableError(self.__name__)\n        return self.__average_nb_atoms__\n\n    @property\n    def numbers(self) -> np.ndarray:\n        \"\"\"\n        Unique atomic numbers in the dataset\n\n        Returns:\n            Array of the unique atomic numbers in the dataset\n        \"\"\"\n        if hasattr(self, \"_numbers\"):\n            return self._numbers\n        self._numbers = pd.unique(self.data[\"atomic_inputs\"][..., 0]).astype(np.int32)\n        return self._numbers\n\n    @property\n    def charges(self) -> np.ndarray:\n        \"\"\"\n        Unique charges in the dataset\n\n        Returns:\n            Array of the unique charges in the dataset\n        \"\"\"\n        if hasattr(self, \"_charges\"):\n            return self._charges\n        self._charges = np.unique(self.data[\"atomic_inputs\"][..., :2], axis=0).astype(np.int32)\n        return self._charges\n\n    @property\n    def min_max_charges(self) -> Tuple[int, int]:\n        \"\"\"\n        Minimum and maximum charges in the dataset\n\n        Returns:\n            
(min_charge, max_charge)\n        \"\"\"\n        if hasattr(self, \"_min_max_charges\"):\n            return self._min_max_charges\n        self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])\n        return self._min_max_charges\n\n    @property\n    def chemical_species(self) -> np.ndarray:\n        \"\"\"\n        Chemical symbols in the dataset\n\n        Returns:\n            Array of the chemical symbols in the dataset\n        \"\"\"\n        return np.array(ATOM_SYMBOLS)[self.numbers]\n
    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.average_n_atoms","title":"average_n_atoms: int property","text":"

    Average number of atoms in a molecule in the dataset.

    Returns:

    Type Description int

    Average number of atoms in a molecule in the dataset.

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.charges","title":"charges: np.ndarray property","text":"

    Unique charges in the dataset

    Returns:

    Type Description ndarray

    Array of the unique charges in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.chemical_species","title":"chemical_species: np.ndarray property","text":"

    Chemical symbols in the dataset

    Returns:

    Type Description ndarray

    Array of the chemical symbols in the dataset

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.min_max_charges","title":"min_max_charges: Tuple[int, int] property","text":"

    Minimum and maximum charges in the dataset

    Returns:

    Type Description Tuple[int, int]

    (min_charge, max_charge)

    "},{"location":"API/properties.html#openqdc.datasets.properties.DatasetPropertyMixIn.numbers","title":"numbers: np.ndarray property","text":"

    Unique atomic numbers in the dataset

    Returns:

    Type Description ndarray

    Array of the unique atomic numbers in the dataset

    "},{"location":"API/regressor.html","title":"Normalization regressor","text":"

    Linear Atom Energies regression utilities.

    "},{"location":"API/regressor.html#openqdc.utils.regressor.LinearSolver","title":"LinearSolver","text":"

    Bases: Solver

    Linear regression solver.

    Note

    No Uncertainty associated as it is quite small.

    Source code in openqdc/utils/regressor.py
    class LinearSolver(Solver):\n    \"\"\"\n    Linear regression solver.\n\n    Note:\n        No Uncertainty associated as it is quite small.\n    \"\"\"\n\n    _regr_str = \"linear\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        E0s = np.linalg.lstsq(X, y, rcond=None)[0]\n        return E0s, None\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor","title":"Regressor","text":"

    Regressor class for preparing and solving the regression problem for isolated atom energies. An isolated atom energy regression problem is defined as:

    X = [n_samples, n_species] (number of atoms of each species per sample)

    Y = [n_samples, ] (energies)

    The regression problem is solved by solving the linear system X E0 = Y.

    Example

    For a system of 2 samples (H2O, CH4)

    n_species = 3, n_samples = 2\n\nH2O = 2H, 1O -> X = [2, 1, 0]\n\nCH4 = 4H, 1C -> X = [4, 0, 1]\n\nX = [[2, 1, 0],\n    [ 4, 0, 1]]\n\nY = [[10, 20]]\n\nX E0 = Y\n

    Linear system to solve

    [[2 eH, 1 eO, 0 eC],\n[ 4 eH, 0 eO, 1 eC]] = [[10, 20]]\n
    Source code in openqdc/utils/regressor.py
    class Regressor:\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n    A isolated atom energy regression problem is defined as:\\n\n    X = [n_samples, n_species] (number of atoms of each species per sample)\\n\n    Y = [n_samples, ] (energies)\\n\n    The regression problem is solved by solving the linear system X E0 = Y.\n\n    Example:\n        For a sytem of 2 samples (H20, CH4)\\n\n            n_species = 3, n_samples = 2\\n\n            H20 = 2H , 1O -> X = [2, 1, 0]\\n\n            CH4 = 4C, 1H -> X = [1, 0, 4]\\n\n            X = [[2, 1, 0],\n                [ 1, 0, 4]]\\n\n            Y = [[10, 20]]\\n\n            X E0 = Y\\n\n        Linear system to solve\\n\n            [[2 eH, 1 eO, 0 eC],\n            [ 1 eH, 0 eO, 4 eC]] = [[10, 20]]\n    \"\"\"\n\n    solver: Solver\n\n    def __init__(\n        self,\n        energies: np.ndarray,\n        atomic_numbers: np.ndarray,\n        position_idx_range: np.ndarray,\n        solver_type: str = \"linear\",\n        stride: int = 1,\n        subsample: Optional[Union[float, int]] = None,\n        remove_nan: bool = True,\n        *args: any,\n        **kwargs: any,\n    ):\n        \"\"\"\n        Regressor class for preparing and solving regression problem for isolated atom energies.\n\n        Parameters:\n            energies:\n                numpy array of energies in the shape (n_samples, n_energy_methods)\n            atomic_numbers:\n                numpy array of atomic numbers in the shape (n_atoms,)\n            position_idx_range:\n                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n            solver_type: Type of solver to use. 
[\"linear\", \"ridge\"]\n            stride: Stride to use for the regression.\n            subsample: Sumsample the dataset.\n                If a float, it is interpreted as a fraction of the dataset to use.\n                If >1 it is interpreted as the number of samples to use.\n            remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n        \"\"\"\n        self.subsample = subsample\n        self.stride = stride\n        self.solver_type = solver_type.lower()\n        self.energies = energies\n        self.atomic_numbers = atomic_numbers\n        self.numbers = pd.unique(atomic_numbers)\n        self.position_idx_range = position_idx_range\n        self.remove_nan = remove_nan\n        self.hparams = {\n            \"subsample\": subsample,\n            \"stride\": stride,\n            \"solver_type\": solver_type,\n        }\n        self._post_init()\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n        \"\"\"\n        Initialize the regressor object from an openqdc dataset. 
This is the default method.\n        *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n        Parameters:\n            dataset: openqdc dataset object.\n            *args: Additional arguments to be passed to the regressor.\n            **kwargs: Additional keyword arguments to be passed to the regressor.\n\n        Returns:\n            Instance of the regressor class.\n        \"\"\"\n        energies = dataset.data[\"energies\"]\n        position_idx_range = dataset.data[\"position_idx_range\"]\n        atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n        return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n\n    def _post_init(self):\n        if self.subsample is not None:\n            self._downsample()\n        self._prepare_inputs()\n        self.solver = self._get_solver()\n\n    def update_hparams(self, hparams):\n        self.hparams.update(hparams)\n\n    def _downsample(self):\n        if self.subsample < 1:\n            idxs = np.arange(self.energies.shape[0])\n            np.random.shuffle(idxs)\n            idxs = idxs[: int(self.energies.shape[0] * self.subsample)]\n            self.energies = self.energies[:: int(1 / self.subsample)]\n            self.position_idx_range = self.position_idx_range[:: int(1 / self.subsample)]\n        else:\n            idxs = np.random.randint(0, self.energies.shape[0], int(self.subsample))\n            self.energies = self.energies[idxs]\n            self.position_idx_range = self.position_idx_range[idxs]\n        self.update_hparams({\"idxs\": idxs})\n\n    def _get_solver(self):\n        try:\n            return AVAILABLE_SOLVERS[self.solver_type]()\n        except KeyError:\n            logger.warning(f\"Unknown solver type {self.solver_type}, defaulting to linear regression.\")\n            return LinearSolver()\n\n    def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:\n        logger.info(\"Preparing 
inputs for regression.\")\n        len_train = self.energies.shape[0]\n        len_zs = len(self.numbers)\n        A = np.zeros((len_train, len_zs))[:: self.stride]\n        B = self.energies[:: self.stride]\n        for i, ij in enumerate(self.position_idx_range[:: self.stride]):\n            tmp = self.atomic_numbers[ij[0] : ij[1]]\n            for j, z in enumerate(self.numbers):\n                A[i, j] = np.count_nonzero(tmp == z)\n        self.X = A\n        self.y = B\n\n    def solve(self):\n        \"\"\"\n        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n        \"\"\"\n        logger.info(f\"Solving regression with {self.solver}.\")\n        E0_list, cov_list = [], []\n        for energy_idx in range(self.y.shape[1]):\n            if self.remove_nan:\n                idxs = non_nan_idxs(self.y[:, energy_idx])\n                X, y = self.X[idxs], self.y[idxs, energy_idx]\n            else:\n                X, y = self.X, self.y[:, energy_idx]\n            E0s, cov = self.solver(X, y)\n            if cov is None:\n                cov = np.zeros_like(E0s) + 1.0\n            E0_list.append(E0s)\n            cov_list.append(cov)\n        return np.vstack(E0_list).T, np.vstack(cov_list).T\n\n    def __call__(self):\n        return self.solve()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.__init__","title":"__init__(energies, atomic_numbers, position_idx_range, solver_type='linear', stride=1, subsample=None, remove_nan=True, *args, **kwargs)","text":"

    Regressor class for preparing and solving regression problem for isolated atom energies.

    Parameters:

    Name Type Description Default energies ndarray

    numpy array of energies in the shape (n_samples, n_energy_methods)

    required atomic_numbers ndarray

    numpy array of atomic numbers in the shape (n_atoms,)

    required position_idx_range ndarray

    array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset

    required solver_type str

    Type of solver to use. [\"linear\", \"ridge\"]

    'linear' stride int

    Stride to use for the regression.

    1 subsample Optional[Union[float, int]]

    Subsample the dataset. If the value is less than 1, it is interpreted as the fraction of the dataset to use. Otherwise it is interpreted as the number of samples to use.

    None remove_nan bool

    Sanitize the dataset by removing energies samples with NaN values.

    True *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {} Source code in openqdc/utils/regressor.py
    def __init__(\n    self,\n    energies: np.ndarray,\n    atomic_numbers: np.ndarray,\n    position_idx_range: np.ndarray,\n    solver_type: str = \"linear\",\n    stride: int = 1,\n    subsample: Optional[Union[float, int]] = None,\n    remove_nan: bool = True,\n    *args: any,\n    **kwargs: any,\n):\n    \"\"\"\n    Regressor class for preparing and solving regression problem for isolated atom energies.\n\n    Parameters:\n        energies:\n            numpy array of energies in the shape (n_samples, n_energy_methods)\n        atomic_numbers:\n            numpy array of atomic numbers in the shape (n_atoms,)\n        position_idx_range:\n            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset\n        solver_type: Type of solver to use. [\"linear\", \"ridge\"]\n        stride: Stride to use for the regression.\n        subsample: Sumsample the dataset.\n            If a float, it is interpreted as a fraction of the dataset to use.\n            If >1 it is interpreted as the number of samples to use.\n        remove_nan: Sanitize the dataset by removing energies samples with NaN values.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n    \"\"\"\n    self.subsample = subsample\n    self.stride = stride\n    self.solver_type = solver_type.lower()\n    self.energies = energies\n    self.atomic_numbers = atomic_numbers\n    self.numbers = pd.unique(atomic_numbers)\n    self.position_idx_range = position_idx_range\n    self.remove_nan = remove_nan\n    self.hparams = {\n        \"subsample\": subsample,\n        \"stride\": stride,\n        \"solver_type\": solver_type,\n    }\n    self._post_init()\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, *args, **kwargs) classmethod","text":"

    Initialize the regressor object from an openqdc dataset. This is the default method. *args and **kwargs are passed to the __init__ method and depend on the specific regressor.

    Parameters:

    Name Type Description Default dataset any

    openqdc dataset object.

    required *args any

    Additional arguments to be passed to the regressor.

    () **kwargs any

    Additional keyword arguments to be passed to the regressor.

    {}

    Returns:

    Type Description Regressor

    Instance of the regressor class.

    Source code in openqdc/utils/regressor.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> \"Regressor\":\n    \"\"\"\n    Initialize the regressor object from an openqdc dataset. This is the default method.\n    *args and and **kwargs are passed to the __init__ method and depends on the specific regressor.\n\n    Parameters:\n        dataset: openqdc dataset object.\n        *args: Additional arguments to be passed to the regressor.\n        **kwargs: Additional keyword arguments to be passed to the regressor.\n\n    Returns:\n        Instance of the regressor class.\n    \"\"\"\n    energies = dataset.data[\"energies\"]\n    position_idx_range = dataset.data[\"position_idx_range\"]\n    atomic_numbers = dataset.data[\"atomic_inputs\"][:, 0].astype(\"int32\")\n    return cls(energies, atomic_numbers, position_idx_range, *args, **kwargs)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Regressor.solve","title":"solve()","text":"

    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    def solve(self):\n    \"\"\"\n    Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.\n    \"\"\"\n    logger.info(f\"Solving regression with {self.solver}.\")\n    E0_list, cov_list = [], []\n    for energy_idx in range(self.y.shape[1]):\n        if self.remove_nan:\n            idxs = non_nan_idxs(self.y[:, energy_idx])\n            X, y = self.X[idxs], self.y[idxs, energy_idx]\n        else:\n            X, y = self.X, self.y[:, energy_idx]\n        E0s, cov = self.solver(X, y)\n        if cov is None:\n            cov = np.zeros_like(E0s) + 1.0\n        E0_list.append(E0s)\n        cov_list.append(cov)\n    return np.vstack(E0_list).T, np.vstack(cov_list).T\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.RidgeSolver","title":"RidgeSolver","text":"

    Bases: Solver

    Ridge regression solver.

    Source code in openqdc/utils/regressor.py
    class RidgeSolver(Solver):\n    \"\"\"\n    Ridge regression solver.\n    \"\"\"\n\n    _regr_str = \"ridge\"\n\n    @staticmethod\n    def solve(X, y):\n        X, y, y_mean = atom_standardization(X, y)\n        A = X.T @ X\n        dy = y - (np.sum(X, axis=1, keepdims=True) * y_mean).reshape(y.shape)\n        Xy = X.T @ dy\n        mean = np.linalg.solve(A, Xy)\n        sigma2 = np.var(X @ mean - dy)\n        Ainv = np.linalg.inv(A)\n        cov = np.sqrt(sigma2 * np.einsum(\"ij,kj,kl,li->i\", Ainv, X, X, Ainv))\n        mean = mean + y_mean.reshape([-1])\n        return mean, cov\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver","title":"Solver","text":"

    Bases: ABC

    Abstract class for regression solvers.

    Source code in openqdc/utils/regressor.py
    class Solver(ABC):\n    \"\"\"Abstract class for regression solvers.\"\"\"\n\n    _regr_str: str\n\n    @staticmethod\n    @abstractmethod\n    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n        \"\"\"\n        Main method to solve the regression problem.\n        Must be implemented in all the subclasses.\n\n        Parameters:\n            X: Input features of shape (n_samples, n_species)\n            Y: Target values of shape (n_samples,) (energy values for the regression)\n\n        Returns:\n            Tuple of predicted values and the estimated uncertainty.\n        \"\"\"\n        pass\n\n    def __call__(self, X, Y):\n        return self.solve(X, Y)\n\n    def __str__(self):\n        return self._regr_str\n\n    def __repr__(self):\n        return str(self)\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.Solver.solve","title":"solve(X, Y) abstractmethod staticmethod","text":"

    Main method to solve the regression problem. Must be implemented in all the subclasses.

    Parameters:

    Name Type Description Default X ndarray

    Input features of shape (n_samples, n_species)

    required Y ndarray

    Target values of shape (n_samples,) (energy values for the regression)

    required

    Returns:

    Type Description Tuple[ndarray, Optional[ndarray]]

    Tuple of predicted values and the estimated uncertainty.

    Source code in openqdc/utils/regressor.py
    @staticmethod\n@abstractmethod\ndef solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:\n    \"\"\"\n    Main method to solve the regression problem.\n    Must be implemented in all the subclasses.\n\n    Parameters:\n        X: Input features of shape (n_samples, n_species)\n        Y: Target values of shape (n_samples,) (energy values for the regression)\n\n    Returns:\n        Tuple of predicted values and the estimated uncertainty.\n    \"\"\"\n    pass\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.atom_standardization","title":"atom_standardization(X, y)","text":"

    Standardize the energies and the atom counts. This will make the calculated uncertainty more meaningful.

    Source code in openqdc/utils/regressor.py
    def atom_standardization(X, y):\n    \"\"\"\n    Standardize the energies and the atom counts.\n    This will make the calculated uncertainty more\n    meaningful.\n    \"\"\"\n    X_norm = X.sum()\n    X = X / X_norm\n    y = y / X_norm\n    y_mean = y.sum() / X.sum()\n    return X, y, y_mean\n
    "},{"location":"API/regressor.html#openqdc.utils.regressor.non_nan_idxs","title":"non_nan_idxs(array)","text":"

    Return non nan indices of an array.

    Source code in openqdc/utils/regressor.py
    def non_nan_idxs(array):\n    \"\"\"\n    Return non nan indices of an array.\n    \"\"\"\n    return np.where(~np.isnan(array))[0]\n
    "},{"location":"API/statistics.html","title":"Statistics","text":""},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator","title":"AbstractStatsCalculator","text":"

    Bases: ABC

    Abstract class that defines the interface for all calculator objects and the methods used to compute the statistics.

    Source code in openqdc/datasets/statistics.py
    class AbstractStatsCalculator(ABC):\n    \"\"\"\n    Abstract class that defines the interface for all\n    the calculators object and the methods to\n    compute the statistics.\n    \"\"\"\n\n    # State Dependencies of the calculator to skip part of the calculation\n    state_dependency = []\n    name = None\n\n    def __init__(\n        self,\n        name: str,\n        energy_type: Optional[str] = None,\n        force_recompute: bool = False,\n        energies: Optional[np.ndarray] = None,\n        n_atoms: Optional[np.ndarray] = None,\n        atom_species: Optional[np.ndarray] = None,\n        position_idx_range: Optional[np.ndarray] = None,\n        e0_matrix: Optional[np.ndarray] = None,\n        atom_charges: Optional[np.ndarray] = None,\n        forces: Optional[np.ndarray] = None,\n    ):\n        \"\"\"\n        Parameters:\n            name :\n                Name of the dataset for saving and loading.\n            energy_type :\n                Type of the energy for the computation of the statistics. 
Used for loading and saving.\n            force_recompute :\n                Flag to force the recomputation of the statistics\n            energies : n\n                Energies of the dataset\n            n_atoms :\n                Number of atoms in the dataset\n            atom_species :\n                Atomic species of the dataset\n            position_idx_range : n\n                Position index range of the dataset\n            e0_matrix :\n                Isolated atom energies matrix of the dataset\n            atom_charges :\n                Atomic charges of the dataset\n            forces :\n                Forces of the dataset\n        \"\"\"\n        self.name = name\n        self.energy_type = energy_type\n        self.force_recompute = force_recompute\n        self.energies = energies\n        self.forces = forces\n        self.position_idx_range = position_idx_range\n        self.e0_matrix = e0_matrix\n        self.n_atoms = n_atoms\n        self.atom_species_charges_tuple = (atom_species, atom_charges)\n        self._root = p_join(get_local_cache(), self.name)\n        if atom_species is not None and atom_charges is not None:\n            # by value not reference\n            self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n\n    @property\n    def has_forces(self) -> bool:\n        return self.forces is not None\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"statistics\", self.name + f\"_{str(self)}\" + \".pkl\")\n        return path\n\n    @property\n    def root(self):\n        \"\"\"\n        Path to the dataset folder\n        \"\"\"\n        return self._root\n\n    @classmethod\n    def from_openqdc_dataset(cls, dataset, recompute: bool = False):\n        \"\"\"\n        Create a calculator object from a dataset object.\n        \"\"\"\n        obj = cls(\n            name=dataset.__name__,\n            force_recompute=recompute,\n            
energy_type=dataset.energy_type,\n            energies=dataset.data[\"energies\"],\n            forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n            n_atoms=dataset.data[\"n_atoms\"],\n            position_idx_range=dataset.data[\"position_idx_range\"],\n            atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n            atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n            e0_matrix=dataset.__isolated_atom_energies__,\n        )\n        obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n        return obj\n\n    @abstractmethod\n    def compute(self) -> StatisticsResults:\n        \"\"\"\n        Abstract method to compute the statistics.\n        Must return a StatisticsResults object and be implemented\n        in all the childs\n        \"\"\"\n        raise NotImplementedError\n\n    def save_statistics(self) -> None:\n        \"\"\"\n        Save statistics file to the dataset folder as a pkl file\n        \"\"\"\n        save_pkl(self.result, self.preprocess_path)\n\n    def attempt_load(self) -> bool:\n        \"\"\"\n        Load precomputed statistics file and return the success of the operation\n        \"\"\"\n        try:\n            self.result = load_pkl(self.preprocess_path)\n            logger.info(f\"Statistics for {str(self)} loaded successfully\")\n            return True\n        except FileNotFoundError:\n            logger.warning(f\"Statistics for {str(self)} not found. 
Computing...\")\n            return False\n\n    def _setup_deps(self, state: Dict) -> None:\n        \"\"\"\n        Check if the dependencies of calculators are satisfied\n        from the state object and set the attributes of the calculator\n        to skip part of the calculation\n        \"\"\"\n        self.state = state\n        self.deps_satisfied = all([dep in state for dep in self.state_dependency])\n        if self.deps_satisfied:\n            for dep in self.state_dependency:\n                setattr(self, dep, state[dep])\n\n    def write_state(self, update: Dict) -> None:\n        \"\"\"\n        Write/update the state dictionary with the update dictionary\n\n        update:\n            dictionary containing the update to the state\n        \"\"\"\n        self.state.update(update)\n\n    def run(self, state: Dict) -> None:\n        \"\"\"\n        Main method to run the calculator.\n        Setup the dependencies from the state dictionary\n        Check if the statistics are already computed and load them or\n        recompute them\n        Save the statistics in the correct folder\n\n        state:\n            dictionary containing the state of the calculator\n        \"\"\"\n        self._setup_deps(state)\n        if self.force_recompute or not self.attempt_load():\n            self.result = self.compute()\n            self.save_statistics()\n\n    def __str__(self) -> str:\n        return self.__class__.__name__.lower()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.root","title":"root property","text":"

    Path to the dataset folder

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.__init__","title":"__init__(name, energy_type=None, force_recompute=False, energies=None, n_atoms=None, atom_species=None, position_idx_range=None, e0_matrix=None, atom_charges=None, forces=None)","text":"

    Parameters:

    Name Type Description Default name

    Name of the dataset for saving and loading.

    required energy_type

    Type of the energy for the computation of the statistics. Used for loading and saving.

    None force_recompute

    Flag to force the recomputation of the statistics

    False energies

    Energies of the dataset

    None n_atoms

    Number of atoms in the dataset

    None atom_species

    Atomic species of the dataset

    None position_idx_range

    Position index range of the dataset

    None e0_matrix

    Isolated atom energies matrix of the dataset

    None atom_charges

    Atomic charges of the dataset

    None forces

    Forces of the dataset

    None Source code in openqdc/datasets/statistics.py
    def __init__(\n    self,\n    name: str,\n    energy_type: Optional[str] = None,\n    force_recompute: bool = False,\n    energies: Optional[np.ndarray] = None,\n    n_atoms: Optional[np.ndarray] = None,\n    atom_species: Optional[np.ndarray] = None,\n    position_idx_range: Optional[np.ndarray] = None,\n    e0_matrix: Optional[np.ndarray] = None,\n    atom_charges: Optional[np.ndarray] = None,\n    forces: Optional[np.ndarray] = None,\n):\n    \"\"\"\n    Parameters:\n        name :\n            Name of the dataset for saving and loading.\n        energy_type :\n            Type of the energy for the computation of the statistics. Used for loading and saving.\n        force_recompute :\n            Flag to force the recomputation of the statistics\n        energies : n\n            Energies of the dataset\n        n_atoms :\n            Number of atoms in the dataset\n        atom_species :\n            Atomic species of the dataset\n        position_idx_range : n\n            Position index range of the dataset\n        e0_matrix :\n            Isolated atom energies matrix of the dataset\n        atom_charges :\n            Atomic charges of the dataset\n        forces :\n            Forces of the dataset\n    \"\"\"\n    self.name = name\n    self.energy_type = energy_type\n    self.force_recompute = force_recompute\n    self.energies = energies\n    self.forces = forces\n    self.position_idx_range = position_idx_range\n    self.e0_matrix = e0_matrix\n    self.n_atoms = n_atoms\n    self.atom_species_charges_tuple = (atom_species, atom_charges)\n    self._root = p_join(get_local_cache(), self.name)\n    if atom_species is not None and atom_charges is not None:\n        # by value not reference\n        self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.attempt_load","title":"attempt_load()","text":"

    Load precomputed statistics file and return the success of the operation

    Source code in openqdc/datasets/statistics.py
    def attempt_load(self) -> bool:\n    \"\"\"\n    Load precomputed statistics file and return the success of the operation\n    \"\"\"\n    try:\n        self.result = load_pkl(self.preprocess_path)\n        logger.info(f\"Statistics for {str(self)} loaded successfully\")\n        return True\n    except FileNotFoundError:\n        logger.warning(f\"Statistics for {str(self)} not found. Computing...\")\n        return False\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.compute","title":"compute() abstractmethod","text":"

    Abstract method to compute the statistics. Must return a StatisticsResults object and be implemented in all child classes.

    Source code in openqdc/datasets/statistics.py
    @abstractmethod\ndef compute(self) -> StatisticsResults:\n    \"\"\"\n    Abstract method to compute the statistics.\n    Must return a StatisticsResults object and be implemented\n    in all the childs\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.from_openqdc_dataset","title":"from_openqdc_dataset(dataset, recompute=False) classmethod","text":"

    Create a calculator object from a dataset object.

    Source code in openqdc/datasets/statistics.py
    @classmethod\ndef from_openqdc_dataset(cls, dataset, recompute: bool = False):\n    \"\"\"\n    Create a calculator object from a dataset object.\n    \"\"\"\n    obj = cls(\n        name=dataset.__name__,\n        force_recompute=recompute,\n        energy_type=dataset.energy_type,\n        energies=dataset.data[\"energies\"],\n        forces=dataset.data[\"forces\"] if \"forces\" in dataset.data else None,\n        n_atoms=dataset.data[\"n_atoms\"],\n        position_idx_range=dataset.data[\"position_idx_range\"],\n        atom_species=dataset.data[\"atomic_inputs\"][:, 0].ravel(),\n        atom_charges=dataset.data[\"atomic_inputs\"][:, 1].ravel(),\n        e0_matrix=dataset.__isolated_atom_energies__,\n    )\n    obj._root = dataset.root  # set to the dataset root in case of multiple datasets\n    return obj\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.run","title":"run(state)","text":"

    Main method to run the calculator. Set up the dependencies from the state dictionary, then load the statistics if they are already computed; otherwise (or when recomputation is forced) compute them and save them in the correct folder.

    state

    dictionary containing the state of the calculator

    Source code in openqdc/datasets/statistics.py
    def run(self, state: Dict) -> None:\n    \"\"\"\n    Main method to run the calculator.\n    Setup the dependencies from the state dictionary\n    Check if the statistics are already computed and load them or\n    recompute them\n    Save the statistics in the correct folder\n\n    state:\n        dictionary containing the state of the calculator\n    \"\"\"\n    self._setup_deps(state)\n    if self.force_recompute or not self.attempt_load():\n        self.result = self.compute()\n        self.save_statistics()\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.save_statistics","title":"save_statistics()","text":"

    Save statistics file to the dataset folder as a pkl file

    Source code in openqdc/datasets/statistics.py
    def save_statistics(self) -> None:\n    \"\"\"\n    Save statistics file to the dataset folder as a pkl file\n    \"\"\"\n    save_pkl(self.result, self.preprocess_path)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.AbstractStatsCalculator.write_state","title":"write_state(update)","text":"

    Write/update the state dictionary with the update dictionary

    update

    dictionary containing the update to the state

    Source code in openqdc/datasets/statistics.py
    def write_state(self, update: Dict) -> None:\n    \"\"\"\n    Write/update the state dictionary with the update dictionary\n\n    update:\n        dictionary containing the update to the state\n    \"\"\"\n    self.state.update(update)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.EnergyStatistics","title":"EnergyStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for energy related statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass EnergyStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for energy related statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForceStatistics","title":"ForceStatistics dataclass","text":"

    Bases: StatisticsResults

    Dataclass for force statistics

    Source code in openqdc/datasets/statistics.py
    @dataclass\nclass ForceStatistics(StatisticsResults):\n    \"\"\"\n    Dataclass for force statistics\n    \"\"\"\n\n    mean: Optional[np.ndarray]\n    std: Optional[np.ndarray]\n    component_mean: Optional[np.ndarray]\n    component_std: Optional[np.ndarray]\n    component_rms: Optional[np.ndarray]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.ForcesCalculatorStats","title":"ForcesCalculatorStats","text":"

    Bases: AbstractStatsCalculator

    Forces statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class ForcesCalculatorStats(AbstractStatsCalculator):\n    \"\"\"\n    Forces statistics calculator class\n    \"\"\"\n\n    def compute(self) -> ForceStatistics:\n        if not self.has_forces:\n            return ForceStatistics(mean=None, std=None, component_mean=None, component_std=None, component_rms=None)\n        converted_force_data = self.forces\n        num_methods = converted_force_data.shape[2]\n        mean = np.nanmean(converted_force_data.reshape(-1, num_methods), axis=0)\n        std = np.nanstd(converted_force_data.reshape(-1, num_methods), axis=0)\n        component_mean = np.nanmean(converted_force_data, axis=0)\n        component_std = np.nanstd(converted_force_data, axis=0)\n        component_rms = np.sqrt(np.nanmean(converted_force_data**2, axis=0))\n        return ForceStatistics(\n            mean=np.atleast_2d(mean),\n            std=np.atleast_2d(std),\n            component_mean=np.atleast_2d(component_mean),\n            component_std=np.atleast_2d(component_std),\n            component_rms=np.atleast_2d(component_rms),\n        )\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyInterface","title":"FormationEnergyInterface","text":"

    Bases: AbstractStatsCalculator, ABC

    Formation Energy interface calculator class. Defines the use of the formation_energy dependency in the compute method.

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyInterface(AbstractStatsCalculator, ABC):\n    \"\"\"\n    Formation Energy interface calculator class.\n    Define the use of the dependency formation_energy in the\n    compute method\n    \"\"\"\n\n    state_dependency = [\"formation_energy\"]\n\n    def compute(self) -> EnergyStatistics:\n        # if the state has not the dependency satisfied\n        if not self.deps_satisfied:\n            # run the main computation\n            from openqdc.utils.constants import MAX_CHARGE\n\n            splits_idx = self.position_idx_range[:, 1]\n            s = np.array(self.atom_species_charges_tuple, dtype=int)\n            s[:, 1] += MAX_CHARGE\n            matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.e0_matrix]\n            converted_energy_data = self.energies\n            E = []\n            for i, matrix in enumerate(matrixs):\n                c = np.cumsum(np.append([0], matrix))[splits_idx]\n                c[1:] = c[1:] - c[:-1]\n                E.append(converted_energy_data[:, i] - c)\n        else:\n            # if the dependency is satisfied get the dependency\n            E = getattr(self, self.state_dependency[0])\n        self.write_state({self.state_dependency[0]: E})\n        E = np.array(E).T\n        return self._compute(E)\n\n    @abstractmethod\n    def _compute(self, energy) -> EnergyStatistics:\n        raise NotImplementedError\n\n    def __str__(self) -> str:\n        # override the __str__ method to add the energy type to the name\n        # to differentiate between formation and regression type\n        return f\"{self.__class__.__name__.lower()}_{self.energy_type.lower()}\"\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.FormationEnergyStats","title":"FormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class FormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        formation_E_mean = np.nanmean(energy, axis=0)\n        formation_E_std = np.nanstd(energy, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(formation_E_mean), std=np.atleast_2d(formation_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.PerAtomFormationEnergyStats","title":"PerAtomFormationEnergyStats","text":"

    Bases: FormationEnergyInterface

    Per atom Formation Energy calculator class.

    Source code in openqdc/datasets/statistics.py
    class PerAtomFormationEnergyStats(FormationEnergyInterface):\n    \"\"\"\n    Per atom Formation Energy  calculator class.\n    \"\"\"\n\n    def _compute(self, energy) -> EnergyStatistics:\n        inter_E_mean = np.nanmean((energy / self.n_atoms[:, None]), axis=0)\n        inter_E_std = np.nanstd((energy / self.n_atoms[:, None]), axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(inter_E_mean), std=np.atleast_2d(inter_E_std))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager","title":"StatisticManager","text":"

    Manager class that automatically handles the shared state between the statistic calculators

    Source code in openqdc/datasets/statistics.py
    class StatisticManager:\n    \"\"\"\n    Manager class that automatically handle the shared state between\n    the statistic calculators\n    \"\"\"\n\n    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n        \"\"\"\n        Parameters:\n            dataset : openqdc.datasets.base.BaseDataset\n                The dataset object to compute the statistics\n            recompute:\n                Flag to recompute the statistics\n            *statistic_calculators:\n                List of statistic calculators to run\n        \"\"\"\n        self._state = {}\n        self._results = {}\n        self._statistic_calculators = [\n            statistic_calculators.from_openqdc_dataset(dataset, recompute)\n            for statistic_calculators in statistic_calculators\n        ]\n\n    @property\n    def state(self) -> Dict:\n        \"\"\"\n        Return the dictionary state of the manager\n\n        Returns:\n            State of the StatisticManager\n        \"\"\"\n        return self._state\n\n    def reset_state(self):\n        \"\"\"\n        Reset the state dictionary\n        \"\"\"\n        self._state = {}\n\n    def reset_results(self):\n        \"\"\"\n        Reset the results dictionary\n        \"\"\"\n        self._results = {}\n\n    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n        \"\"\"\n        Return the value of the key in the state dictionary\n\n        Parameters:\n            key: str, default = None\n        Returns:\n            the value of the key in the state dictionary\n            or the whole state dictionary if key is None\n        \"\"\"\n        if key is None:\n            return self._state\n        return self._state.get(key, None)\n\n    def has_state(self, key: str) -> bool:\n        \"\"\"\n        Check is state has key\n\n        Parameters:\n            key:\n                Key to check in the state dictionary\n\n        
Returns:\n            True if the key is in the state dictionary\n        \"\"\"\n        return key in self._state\n\n    def get_results(self, as_dict: bool = False):\n        \"\"\"\n        Aggregate results from all the calculators\n\n        Parameters:\n            as_dict:\n                Flag to return the results as a dictionary\n        \"\"\"\n        results = deepcopy(self._results)\n        if as_dict:\n            return {k: v.as_dict() for k, v in results.items()}\n        return {k: v for k, v in self._results.items()}\n\n    def run_calculators(self):\n        \"\"\"\n        Run the saved calculators and save the results in the manager\n        \"\"\"\n        logger.info(\"Processing dataset statistics\")\n        for calculator in self._statistic_calculators:\n            calculator.run(self.state)\n            self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.state","title":"state: Dict property","text":"

    Return the dictionary state of the manager

    Returns:

    Type Description Dict

    State of the StatisticManager

    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.__init__","title":"__init__(dataset, recompute=False, *statistic_calculators)","text":"

    Parameters:

    Name Type Description Default dataset

    openqdc.datasets.base.BaseDataset The dataset object to compute the statistics

    required recompute bool

    Flag to recompute the statistics

    False *statistic_calculators AbstractStatsCalculator

    List of statistic calculators to run

    () Source code in openqdc/datasets/statistics.py
    def __init__(self, dataset: Any, recompute: bool = False, *statistic_calculators: \"AbstractStatsCalculator\"):\n    \"\"\"\n    Parameters:\n        dataset : openqdc.datasets.base.BaseDataset\n            The dataset object to compute the statistics\n        recompute:\n            Flag to recompute the statistics\n        *statistic_calculators:\n            List of statistic calculators to run\n    \"\"\"\n    self._state = {}\n    self._results = {}\n    self._statistic_calculators = [\n        statistic_calculators.from_openqdc_dataset(dataset, recompute)\n        for statistic_calculators in statistic_calculators\n    ]\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_results","title":"get_results(as_dict=False)","text":"

    Aggregate results from all the calculators

    Parameters:

    Name Type Description Default as_dict bool

    Flag to return the results as a dictionary

    False Source code in openqdc/datasets/statistics.py
    def get_results(self, as_dict: bool = False):\n    \"\"\"\n    Aggregate results from all the calculators\n\n    Parameters:\n        as_dict:\n            Flag to return the results as a dictionary\n    \"\"\"\n    results = deepcopy(self._results)\n    if as_dict:\n        return {k: v.as_dict() for k, v in results.items()}\n    return {k: v for k, v in self._results.items()}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.get_state","title":"get_state(key=None)","text":"

    Return the value of the key in the state dictionary

    Parameters:

    Name Type Description Default key Optional[str]

    Key to look up in the state dictionary. If None (the default), the whole state dictionary is returned.

    None

    Returns: the value of the key in the state dictionary or the whole state dictionary if key is None

    Source code in openqdc/datasets/statistics.py
    def get_state(self, key: Optional[str] = None) -> Optional[Any]:\n    \"\"\"\n    Return the value of the key in the state dictionary\n\n    Parameters:\n        key: str, default = None\n    Returns:\n        the value of the key in the state dictionary\n        or the whole state dictionary if key is None\n    \"\"\"\n    if key is None:\n        return self._state\n    return self._state.get(key, None)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.has_state","title":"has_state(key)","text":"

    Check if the state has the given key

    Parameters:

    Name Type Description Default key str

    Key to check in the state dictionary

    required

    Returns:

    Type Description bool

    True if the key is in the state dictionary

    Source code in openqdc/datasets/statistics.py
    def has_state(self, key: str) -> bool:\n    \"\"\"\n    Check is state has key\n\n    Parameters:\n        key:\n            Key to check in the state dictionary\n\n    Returns:\n        True if the key is in the state dictionary\n    \"\"\"\n    return key in self._state\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_results","title":"reset_results()","text":"

    Reset the results dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_results(self):\n    \"\"\"\n    Reset the results dictionary\n    \"\"\"\n    self._results = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.reset_state","title":"reset_state()","text":"

    Reset the state dictionary

    Source code in openqdc/datasets/statistics.py
    def reset_state(self):\n    \"\"\"\n    Reset the state dictionary\n    \"\"\"\n    self._state = {}\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticManager.run_calculators","title":"run_calculators()","text":"

    Run the saved calculators and save the results in the manager

    Source code in openqdc/datasets/statistics.py
    def run_calculators(self):\n    \"\"\"\n    Run the saved calculators and save the results in the manager\n    \"\"\"\n    logger.info(\"Processing dataset statistics\")\n    for calculator in self._statistic_calculators:\n        calculator.run(self.state)\n        self._results[calculator.__class__.__name__] = calculator.result\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults","title":"StatisticsResults","text":"

    Parent class to statistics results to provide general methods.

    Source code in openqdc/datasets/statistics.py
    class StatisticsResults:\n    \"\"\"\n    Parent class to statistics results\n    to provide general methods.\n    \"\"\"\n\n    def to_dict(self) -> Dict:\n        \"\"\"\n        Convert the class to a dictionary\n\n        Returns:\n            Dictionary representation of the class\n        \"\"\"\n        return asdict(self)\n\n    def transform(self, func: Callable):\n        \"\"\"\n        Apply a function to all the attributes of the class\n\n        Parameters:\n            func:\n                Function to apply to the attributes\n        \"\"\"\n        for k, v in self.to_dict().items():\n            if v is not None:\n                setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.to_dict","title":"to_dict()","text":"

    Convert the class to a dictionary

    Returns:

    Type Description Dict

    Dictionary representation of the class

    Source code in openqdc/datasets/statistics.py
    def to_dict(self) -> Dict:\n    \"\"\"\n    Convert the class to a dictionary\n\n    Returns:\n        Dictionary representation of the class\n    \"\"\"\n    return asdict(self)\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.StatisticsResults.transform","title":"transform(func)","text":"

    Apply a function to all the attributes of the class

    Parameters:

    Name Type Description Default func Callable

    Function to apply to the attributes

    required Source code in openqdc/datasets/statistics.py
    def transform(self, func: Callable):\n    \"\"\"\n    Apply a function to all the attributes of the class\n\n    Parameters:\n        func:\n            Function to apply to the attributes\n    \"\"\"\n    for k, v in self.to_dict().items():\n        if v is not None:\n            setattr(self, k, func(v))\n
    "},{"location":"API/statistics.html#openqdc.datasets.statistics.TotalEnergyStats","title":"TotalEnergyStats","text":"

    Bases: AbstractStatsCalculator

    Total Energy statistics calculator class

    Source code in openqdc/datasets/statistics.py
    class TotalEnergyStats(AbstractStatsCalculator):\n    \"\"\"\n    Total Energy statistics calculator class\n    \"\"\"\n\n    def compute(self) -> EnergyStatistics:\n        converted_energy_data = self.energies\n        total_E_mean = np.nanmean(converted_energy_data, axis=0)\n        total_E_std = np.nanstd(converted_energy_data, axis=0)\n        return EnergyStatistics(mean=np.atleast_2d(total_E_mean), std=np.atleast_2d(total_E_std))\n
    "},{"location":"API/units.html","title":"UNITS","text":"

    Units conversion utilities module.

    Available Energy units

    [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\", \"mev\", \"ryd\"]

    Available Distance units

    [\"ang\", \"nm\", \"bohr\"]

    Available Force units

    Combinations between Energy and Distance units

    "},{"location":"API/units.html#openqdc.utils.units.Conversion","title":"Conversion","text":"

    Conversion from one unit system to another defined by a name and a callable

    Source code in openqdc/utils/units.py
    class Conversion:\n    \"\"\"\n    Conversion from one unit system to another defined by a name and a callable\n    \"\"\"\n\n    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n        \"\"\"\n\n        Parameters:\n            in_unit: String defining the units of the current values\n            out_unit: String defining the target units\n            func: The callable to compute the conversion\n        \"\"\"\n        name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n        if name in CONVERSION_REGISTRY:\n            raise ConversionAlreadyDefined(in_unit, out_unit)\n        CONVERSION_REGISTRY[name] = self\n\n        self.name = name\n        self.fn = func\n\n    def __call__(self, x):\n        return self.fn(x)\n
    "},{"location":"API/units.html#openqdc.utils.units.Conversion.__init__","title":"__init__(in_unit, out_unit, func)","text":"

    Parameters:

    Name Type Description Default in_unit str

    String defining the units of the current values

    required out_unit str

    String defining the target units

    required func Callable[[float], float]

    The callable to compute the conversion

    required Source code in openqdc/utils/units.py
    def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):\n    \"\"\"\n\n    Parameters:\n        in_unit: String defining the units of the current values\n        out_unit: String defining the target units\n        func: The callable to compute the conversion\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n\n    if name in CONVERSION_REGISTRY:\n        raise ConversionAlreadyDefined(in_unit, out_unit)\n    CONVERSION_REGISTRY[name] = self\n\n    self.name = name\n    self.fn = func\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion","title":"DistanceTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible distance units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass DistanceTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible distance units for conversion\n    \"\"\"\n\n    ANG = \"ang\"\n    NM = \"nm\"\n    BOHR = \"bohr\"\n\n    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the distance to the desired units.\n\n        Parameters:\n            distance: distance unit to convert to\n            fraction: whether it is distance^1 or distance^-1\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.DistanceTypeConversion.to","title":"to(distance, fraction=False)","text":"

    Get the conversion function to convert the distance to the desired units.

    Parameters:

    Name Type Description Default distance DistanceTypeConversion

    distance unit to convert to

    required fraction bool

    whether it is distance^1 or distance^-1

    False

    Returns:

    Type Description Callable[[float], float]

    callable to convert the distance to the desired units

    Source code in openqdc/utils/units.py
    def to(self, distance: \"DistanceTypeConversion\", fraction: bool = False) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the distance to the desired units.\n\n    Parameters:\n        distance: distance unit to convert to\n        fraction: whether it is distance^1 or distance^-1\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion","title":"EnergyTypeConversion","text":"

    Bases: ConversionEnum, StrEnum

    Define the possible energy units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass EnergyTypeConversion(ConversionEnum, StrEnum):\n    \"\"\"\n    Define the possible energy units for conversion\n    \"\"\"\n\n    KCAL_MOL = \"kcal/mol\"\n    KJ_MOL = \"kj/mol\"\n    HARTREE = \"hartree\"\n    EV = \"ev\"\n    MEV = \"mev\"\n    RYD = \"ryd\"\n\n    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the energy to the desired units.\n\n        Parameters:\n            energy: energy unit to convert to\n\n        Returns:\n            Callable to convert the distance to the desired units\n        \"\"\"\n        return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.EnergyTypeConversion.to","title":"to(energy)","text":"

    Get the conversion function to convert the energy to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    Callable to convert the energy to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: \"EnergyTypeConversion\") -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the energy to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n\n    Returns:\n        Callable to convert the distance to the desired units\n    \"\"\"\n    return get_conversion(str(self), str(energy))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion","title":"ForceTypeConversion","text":"

    Bases: ConversionEnum

    Define the possible force units for conversion

    Source code in openqdc/utils/units.py
    @unique\nclass ForceTypeConversion(ConversionEnum):\n    \"\"\"\n    Define the possible foce units for conversion\n    \"\"\"\n\n    #     Name      = EnergyTypeConversion,         , DistanceTypeConversion\n    HARTREE_BOHR = EnergyTypeConversion.HARTREE, DistanceTypeConversion.BOHR\n    HARTREE_ANG = EnergyTypeConversion.HARTREE, DistanceTypeConversion.ANG\n    HARTREE_NM = EnergyTypeConversion.HARTREE, DistanceTypeConversion.NM\n    EV_BOHR = EnergyTypeConversion.EV, DistanceTypeConversion.BOHR\n    EV_ANG = EnergyTypeConversion.EV, DistanceTypeConversion.ANG\n    EV_NM = EnergyTypeConversion.EV, DistanceTypeConversion.NM\n    KCAL_MOL_BOHR = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.BOHR\n    KCAL_MOL_ANG = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.ANG\n    KCAL_MOL_NM = EnergyTypeConversion.KCAL_MOL, DistanceTypeConversion.NM\n    KJ_MOL_BOHR = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.BOHR\n    KJ_MOL_ANG = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.ANG\n    KJ_MOL_NM = EnergyTypeConversion.KJ_MOL, DistanceTypeConversion.NM\n    MEV_BOHR = EnergyTypeConversion.MEV, DistanceTypeConversion.BOHR\n    MEV_ANG = EnergyTypeConversion.MEV, DistanceTypeConversion.ANG\n    MEV_NM = EnergyTypeConversion.MEV, DistanceTypeConversion.NM\n    RYD_BOHR = EnergyTypeConversion.RYD, DistanceTypeConversion.BOHR\n    RYD_ANG = EnergyTypeConversion.RYD, DistanceTypeConversion.ANG\n    RYD_NM = EnergyTypeConversion.RYD, DistanceTypeConversion.NM\n\n    def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):\n        self.energy = energy\n        self.distance = distance\n\n    def __str__(self):\n        return f\"{self.energy}/{self.distance}\"\n\n    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n        \"\"\"\n        Get the conversion function to convert the force to the desired units.\n\n        Parameters:\n            energy: energy unit 
to convert to\n            distance: distance unit to convert to\n\n        Returns:\n            callable to convert the distance to the desired units\n        \"\"\"\n        return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.ForceTypeConversion.to","title":"to(energy, distance)","text":"

    Get the conversion function to convert the force to the desired units.

    Parameters:

    Name Type Description Default energy EnergyTypeConversion

    energy unit to convert to

    required distance DistanceTypeConversion

    distance unit to convert to

    required

    Returns:

    Type Description Callable[[float], float]

    callable to convert the force to the desired units

    Source code in openqdc/utils/units.py
    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:\n    \"\"\"\n    Get the conversion function to convert the force to the desired units.\n\n    Parameters:\n        energy: energy unit to convert to\n        distance: distance unit to convert to\n\n    Returns:\n        callable to convert the distance to the desired units\n    \"\"\"\n    return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))\n
    "},{"location":"API/units.html#openqdc.utils.units.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html","title":"Utils","text":""},{"location":"API/utils.html#openqdc.utils.check_file","title":"check_file(path)","text":"

    Checks whether the file is present on the local filesystem

    Source code in openqdc/utils/io.py
    def check_file(path) -> bool:\n    \"\"\"Checks if file present on local\"\"\"\n    return os.path.exists(path)\n
    "},{"location":"API/utils.html#openqdc.utils.create_hdf5_file","title":"create_hdf5_file(hdf5_file_path)","text":"

    Creates hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def create_hdf5_file(hdf5_file_path: str):\n    \"\"\"Creates hdf5 file with fsspec\"\"\"\n    fp = fsspec.open(hdf5_file_path, \"wb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    return h5py.File(fp, \"a\")\n
    "},{"location":"API/utils.html#openqdc.utils.get_conversion","title":"get_conversion(in_unit, out_unit)","text":"

    Utility function to get the conversion function between two units.

    Parameters:

    Name Type Description Default in_unit

    The input unit

    required out_unit

    The output unit

    required

    Returns:

    Type Description Callable[[float], float]

    The conversion function

    Source code in openqdc/utils/units.py
    def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:\n    \"\"\"\n    Utility function to get the conversion function between two units.\n\n    Parameters:\n        in_unit : The input unit\n        out_unit : The output unit\n\n    Returns:\n        The conversion function\n    \"\"\"\n    name = \"convert_\" + in_unit.lower().strip() + \"_to_\" + out_unit.lower().strip()\n    if in_unit.lower().strip() == out_unit.lower().strip():\n        return lambda x: x\n    if name not in CONVERSION_REGISTRY:\n        raise ConversionNotDefinedError(in_unit, out_unit)\n    return CONVERSION_REGISTRY[name]\n
    "},{"location":"API/utils.html#openqdc.utils.get_local_cache","title":"get_local_cache()","text":"

    Returns the local cache directory. It creates it if it does not exist.

    Returns:

    Name Type Description str str

    path to the local cache directory

    Source code in openqdc/utils/io.py
    def get_local_cache() -> str:\n    \"\"\"\n    Returns the local cache directory. It creates it if it does not exist.\n\n    Returns:\n        str: path to the local cache directory\n    \"\"\"\n    cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR))\n    os.makedirs(cache_dir, exist_ok=True)\n    return cache_dir\n
    "},{"location":"API/utils.html#openqdc.utils.get_remote_cache","title":"get_remote_cache(write_access=False)","text":"

    Returns the entry point based on the write access.

    Source code in openqdc/utils/io.py
    def get_remote_cache(write_access=False) -> str:\n    \"\"\"\n    Returns the entry point based on the write access.\n    \"\"\"\n    if write_access:\n        remote_cache = \"openqdc/v1\"  # \"gs://qmdata-public/openqdc\"\n        # remote_cache = \"gs://qmdata-public/openqdc\"\n    else:\n        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get(\"OPENQDC_DOWNLOAD_API\", \"s3\"))\n        # remote_cache = \"https://storage.googleapis.com/qmdata-public/openqdc\"\n    return remote_cache\n
    "},{"location":"API/utils.html#openqdc.utils.load_hdf5_file","title":"load_hdf5_file(hdf5_file_path)","text":"

    Loads hdf5 file with fsspec

    Source code in openqdc/utils/io.py
    def load_hdf5_file(hdf5_file_path: str):\n    \"\"\"Loads hdf5 file with fsspec\"\"\"\n    if not check_file(hdf5_file_path):\n        raise FileNotFoundError(f\"File {hdf5_file_path} does not exist on GCS and local.\")\n\n    fp = fsspec.open(hdf5_file_path, \"rb\")\n    if hasattr(fp, \"open\"):\n        fp = fp.open()\n    file = h5py.File(fp)\n\n    # inorder to enable multiprocessing:\n    # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801\n    # fsspec.asyn.iothread[0] = None\n    # fsspec.asyn.loop[0] = None\n\n    return file\n
    "},{"location":"API/utils.html#openqdc.utils.load_json","title":"load_json(path)","text":"

    Loads json file

    Source code in openqdc/utils/io.py
    def load_json(path):\n    \"\"\"Loads json file\"\"\"\n    with fsspec.open(path, \"r\") as fp:  # Unpickling\n        return json.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.load_pkl","title":"load_pkl(path, check=True)","text":"

    Load pkl file

    Source code in openqdc/utils/io.py
    def load_pkl(path, check=True):\n    \"\"\"Load pkl file\"\"\"\n    if check:\n        if not check_file(path):\n            raise FileNotFoundError(f\"File {path} does not exist on GCS and local.\")\n\n    with open(path, \"rb\") as fp:  # Unpickling\n        return pkl.load(fp)\n
    "},{"location":"API/utils.html#openqdc.utils.makedirs","title":"makedirs(path, exist_ok=True)","text":"

    Creates directory

    Source code in openqdc/utils/io.py
    def makedirs(path, exist_ok=True):\n    \"\"\"Creates directory\"\"\"\n    os.makedirs(path, exist_ok=exist_ok)\n
    "},{"location":"API/utils.html#openqdc.utils.read_qc_archive_h5","title":"read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names=None)","text":"

    Extracts data from the HDF5 archive file.

    Source code in openqdc/utils/io.py
    def read_qc_archive_h5(\n    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: Optional[List[str]] = None\n) -> List[Dict[str, np.ndarray]]:\n    \"\"\"Extracts data from the HDF5 archive file.\"\"\"\n    data = load_hdf5_file(raw_path)\n    data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}\n\n    n = len(data_t[\"molecule_id\"])\n    samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]\n    return samples\n
    "},{"location":"API/utils.html#openqdc.utils.save_pkl","title":"save_pkl(file, path)","text":"

    Saves pkl file

    Source code in openqdc/utils/io.py
    def save_pkl(file, path):\n    \"\"\"Saves pkl file\"\"\"\n    logger.info(f\"Saving file at {path}\")\n    with fsspec.open(path, \"wb\") as fp:  # Pickling\n        pkl.dump(file, fp)\n
    "},{"location":"API/utils.html#openqdc.utils.set_cache_dir","title":"set_cache_dir(d)","text":"

    Optionally set the _OPENQDC_CACHE_DIR directory.

    Parameters:

    Name Type Description Default d str

    path to a local folder.

    required Source code in openqdc/utils/io.py
    def set_cache_dir(d):\n    r\"\"\"\n    Optionally set the _OPENQDC_CACHE_DIR directory.\n\n    Args:\n        d (str): path to a local folder.\n    \"\"\"\n    if d is None:\n        return\n    global _OPENQDC_CACHE_DIR\n    _OPENQDC_CACHE_DIR = os.path.normpath(os.path.expanduser(d))\n
    "},{"location":"API/datasets/3bpa.html","title":"3BPA","text":""},{"location":"API/datasets/3bpa.html#openqdc.datasets.potential.bpa.BPA","title":"BPA","text":"

    Bases: BaseDataset

    BPA (or 3BPA) is a dataset of a flexible druglike molecule, 3-(benzyloxy)pyridin-2-amine. This dataset features a complex dihedral potential energy surface with many local minima, which can be challenging to approximate using classical or ML force fields. The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to perturb the geometries toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. The final configurations were re-evaluated using ORCA at the DFT level of theory using the \u03c9B97X exchange-correlation functional and the 6-31G(d) basis set.

    Usage:

    from openqdc.datasets import BPA\ndataset = BPA()\n

    References

    https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647

    Source code in openqdc/datasets/potential/bpa.py
    class BPA(BaseDataset):\n    \"\"\"\n    BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike\n    molecule 3-(benzyloxy)pyridin-2-amine. This dataset features\n    complex dihedral potential energy surface with many local minima,\n    which can be challenging to approximate using classical or ML force fields.\n    The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to\n    perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at\n    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.\n    The final configurations were re-evaluated using ORCA at the DFT level of\n    theory using the \u03c9B97X exchange correlation functional and the 6-31G(d) basis set.\n\n    Usage:\n    ```python\n    from openqdc.datasets import BPA\n    dataset = BPA()\n    ```\n\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647\n    \"\"\"\n\n    __name__ = \"BPA\"\n    __energy_unit__ = \"ev\"\n    __forces_unit__ = \"ev/ang\"\n    __distance_unit__ = \"ang\"\n    __force_mask__ = [True]\n    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]\n    __links__ = {\"BPA.zip\": \"https://figshare.com/ndownloader/files/31325990\"}\n\n    def read_raw_entries(self) -> List[Dict]:\n        import os.path as osp\n        from glob import glob\n\n        from ase.io import iread\n\n        files = glob(osp.join(self.root, \"dataset_3BPA\", \"*.xyz\"))\n        files = [f for f in files if \"iso_atoms.xyz\" not in f]\n        all_records = []\n\n        for file in files:\n            subset = np.array([osp.basename(file).split(\".\")[0]])\n\n            for atoms in iread(file, format=\"extxyz\"):\n                all_records.append(read_bpa_record(subset, atoms))\n\n        return all_records\n\n    def __getitem__(self, idx):\n        data = super().__getitem__(idx)\n        data.__setattr__(\"split\", 
self._convert_array(self.data[\"split\"][idx]))\n        return data\n
    "},{"location":"API/datasets/alchemy.html","title":"Alchemy","text":""},{"location":"API/datasets/alchemy.html#openqdc.datasets.potential.alchemy.Alchemy","title":"Alchemy","text":"

    Bases: BaseDataset

    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database. Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange matrix.

    Usage:

    from openqdc.datasets import Alchemy\ndataset = Alchemy()\n

    Reference

    https://arxiv.org/abs/1906.09427 https://alchemy.tencent.com/

    Source code in openqdc/datasets/potential/alchemy.py
    class Alchemy(BaseDataset):\n    \"\"\"\n    Alchemy comprises of 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.\n    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level\n    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used\n    to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G\n    is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the\n    B3LYP/6-31G(2df,p) model with the density fittting approximation for electron repulsion integrals is used. The\n    auxillary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange\n    matrix.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Alchemy\n    dataset = Alchemy()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/1906.09427\n        https://alchemy.tencent.com/\n    \"\"\"\n\n    __name__ = \"alchemy\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\"alchemy.zip\": \"https://alchemy.tencent.com/data/alchemy-v20191129.zip\"}\n\n    def read_raw_entries(self):\n        dir_path = p_join(self.root, \"Alchemy-v20191129\")\n        full_csv = pd.read_csv(p_join(dir_path, \"final_version.csv\"))\n        energies = full_csv[\"U0\\n(Ha, internal energy at 0 K)\"].tolist()\n        atom_folder = full_csv[\"atom number\"]\n        gdb_idx = full_csv[\"gdb_idx\"]\n        idxs = full_csv.index.tolist()\n        samples = []\n        for i in tqdm(idxs):\n            sdf_file = p_join(dir_path, f\"atom_{atom_folder[i]}\", 
f\"{gdb_idx[i]}.sdf\")\n            energy = energies[i]\n            samples.append(read_mol(sdf_file, energy))\n        return samples\n
    "},{"location":"API/datasets/ani.html","title":"ANI","text":""},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1","title":"ANI1","text":"

    Bases: BaseDataset

    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic molecules. The molecules contain 4 distinct atom types: C, N, O and H. Electronic structure calculations use the wB97x density functional and the 6-31G(d) basis set. For generating structures, SMILES strings for molecules are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary point using the MMFF94 force field. Finally, geometries are optimized to an energy minimum using the chosen DFT level.

    Usage:

    from openqdc.datasets import ANI1\ndataset = ANI1()\n

    References

    https://www.nature.com/articles/sdata2017193

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1(BaseDataset):\n    \"\"\"\n    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic\n    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the\n    wB97x density functional and the 6-31G(d) basis set. For generating structures, smiles strings for molecules\n    are used for generating 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary\n    point using the MMFF94 force field. Finally, geometries are optimized until energy minima using the chosen DFT\n    level.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1\n    dataset = ANI1()\n    ```\n\n    References:\n        https://www.nature.com/articles/sdata2017193\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"ani1.hdf5.gz\": \"https://zenodo.org/record/3585840/files/214.hdf5.gz\"}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"ani\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"ani\", links=self.__links__)\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, 
self.force_target_names)\n        return samples\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX","title":"ANI1CCX","text":"

    Bases: ANI1

    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.

    Usage:

    from openqdc.datasets import ANI1CCX\ndataset = ANI1CCX()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX(ANI1):\n    \"\"\"\n    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active\n    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX\n    dataset = ANI1CCX()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_T_CBS,  # \"ccsd(t)/cbs\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVTZ,  # \"ccsd(t)/cc-pvtz\",\n        PotentialMethod.TCSSD_T_CC_PVDZ,  # \"tccsd(t)/cc-pvdz\",\n    ]\n\n    energy_target_names = [\n        \"CCSD(T)*:CBS Total Energy\",\n        \"NPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n        \"NPNO-CCSD(T):cc-pVTZ Correlation Energy\",\n        \"TPNO-CCSD(T):cc-pVDZ Correlation Energy\",\n    ]\n    force_target_names = []\n    __links__ = {\"ani1x.hdf5.gz\": \"https://zenodo.org/record/4081694/files/292.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        return x.decode(\"ascii\")\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1CCX_V2","title":"ANI1CCX_V2","text":"

    Bases: ANI1CCX

    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels for each conformation.

    Usage:

    from openqdc.datasets import ANI1CCX_V2\ndataset = ANI1CCX_V2()\n

    References

    https://doi.org/10.1038/s41467-019-10827-4

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1CCX_V2(ANI1CCX):\n    \"\"\"\n    ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels\n    for each conformation.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1CCX_V2\n    dataset = ANI1CCX_V2()\n    ```\n\n    References:\n        https://doi.org/10.1038/s41467-019-10827-4\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1ccx_v2\"\n\n    __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]\n    energy_target_names = ANI1CCX.energy_target_names + [\"PM6\", \"GFN2\"]\n    __force_mask__ = ANI1CCX.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI1X","title":"ANI1X","text":"

    Bases: ANI1

    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL, generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the following techniques is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and (4) torsion sampling.

    Usage:

    from openqdc.datasets import ANI1X\ndataset = ANI1X()\n

    References

    https://doi.org/10.1063/1.5023802

    https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI1X(ANI1):\n    \"\"\"\n    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to\n    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,\n    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of the techniques\n    are used for sampling conformations, (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and\n    (4) torsion sampling.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI1X\n    dataset = ANI1X()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5023802\\n\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani1x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.HF_CC_PVDZ,\n        PotentialMethod.HF_CC_PVQZ,\n        PotentialMethod.HF_CC_PVTZ,\n        PotentialMethod.MP2_CC_PVDZ,\n        PotentialMethod.MP2_CC_PVQZ,\n        PotentialMethod.MP2_CC_PVTZ,\n        PotentialMethod.WB97X_6_31G_D,\n        PotentialMethod.WB97X_CC_PVTZ,\n    ]\n\n    energy_target_names = [\n        \"HF:cc-pVDZ Total Energy\",\n        \"HF:cc-pVQZ Total Energy\",\n        \"HF:cc-pVTZ Total Energy\",\n        \"MP2:cc-pVDZ Correlation Energy\",\n        \"MP2:cc-pVQZ Correlation Energy\",\n        \"MP2:cc-pVTZ Correlation Energy\",\n        \"wB97x:6-31G(d) Total Energy\",\n        \"wB97x:def2-TZVPP Total Energy\",\n    ]\n\n    force_target_names = [\n        \"wB97x:6-31G(d) Atomic Forces\",\n        \"wB97x:def2-TZVPP Atomic Forces\",\n    ]\n\n    __force_mask__ = [False, False, False, False, False, False, True, True]\n    __links__ = {\"ani1ccx.hdf5.gz\": \"https://zenodo.org/record/4081692/files/293.hdf5.gz\"}\n\n    def convert_forces(self, x):\n        return super().convert_forces(x) * 0.529177249  # correct the 
Dataset error\n\n    def __smiles_converter__(self, x):\n        return \"-\".join(x.decode(\"ascii\").split(\"-\")[:-1])\n
    "},{"location":"API/datasets/ani.html#openqdc.datasets.potential.ani.ANI2X","title":"ANI2X","text":"

    Bases: ANI1

    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, ChEMBL, and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as in ANI-1X are used for generating geometries.

    Usage:

    from openqdc.datasets import ANI2X\ndataset = ANI2X()\n

    References

    https://doi.org/10.1021/acs.jctc.0c00121 https://github.com/aiqm/ANI1x_datasets

    Source code in openqdc/datasets/potential/ani.py
    class ANI2X(ANI1):\n    \"\"\"\n    The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.\n    It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized\n    using the LBFGS algorithm and labeled with \u03c9B97X/6-31G*. The same sampling techniques as done in ANI-1X are\n    used for generating geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ANI2X\n    dataset = ANI2X()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.0c00121\n        https://github.com/aiqm/ANI1x_datasets\n    \"\"\"\n\n    __name__ = \"ani2x\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    __energy_methods__ = [\n        # PotentialMethod.NONE,  # \"b973c/def2mtzvp\",\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/631gd\", # PAPER DATASET\n        # PotentialMethod.NONE,  # \"wb97md3bj/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97mv/def2tzvpp\",\n        # PotentialMethod.NONE,  # \"wb97x/def2tzvpp\",\n    ]\n\n    energy_target_names = [\n        # \"b973c/def2mtzvp\",\n        \"wb97x/631gd\",\n        # \"wb97md3bj/def2tzvpp\",\n        # \"wb97mv/def2tzvpp\",\n        # \"wb97x/def2tzvpp\",\n    ]\n\n    force_target_names = [\"wb97x/631gd\"]  # \"b973c/def2mtzvp\",\n\n    __force_mask__ = [True]\n    __links__ = {  # \"ANI-2x-B973c-def2mTZVP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-B973c-def2mTZVP.tar.gz?download=1\",  # noqa\n        # \"ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MD3BJ-def2TZVPP.tar.gz?download=1\", # noqa\n        # \"ANI-2x-wB97MV-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97MV-def2TZVPP.tar.gz?download=1\", # noqa\n        \"ANI-2x-wB97X-631Gd.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz?download=1\",  
# noqa\n        # \"ANI-2x-wB97X-def2TZVPP.tar.gz\": \"https://zenodo.org/records/10108942/files/ANI-2x-wB97X-def2TZVPP.tar.gz?download=1\", # noqa\n    }\n\n    def __smiles_converter__(self, x):\n        return x\n\n    def read_raw_entries(self):\n        samples = []\n        for lvl_theory in self.__links__.keys():\n            raw_path = p_join(self.root, \"final_h5\", f\"{lvl_theory.split('.')[0]}.h5\")\n            samples.extend(read_ani2_h5(raw_path))\n        return samples\n
    "},{"location":"API/datasets/comp6.html","title":"Comp6","text":""},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6","title":"COMP6","text":"

    Bases: BaseDataset

    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and molecular dipoles.

    Details of the benchmark sets are as follows

    S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and

    mixed influence interactions.

    ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n

    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point calculations are performed to calculate energies and forces.

    GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n

    The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally, Diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.

    GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n

    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are generated via DNMS.

    Tripeptide: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n\nDrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n

    Structures are optimized similar to GDB7to9.

    Usage:

    from openqdc.datasets import COMP6\ndataset = COMP6()\n

    References

    https://aip.scitation.org/doi/abs/10.1063/1.5023802

    https://github.com/isayev/COMP6

    S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d

    GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/

    GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/

    DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h

    Source code in openqdc/datasets/potential/comp6.py
    class COMP6(BaseDataset):\n    \"\"\"\n    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the\n    ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and\n    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using\n    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and\n    molecular dipoles.\n\n    Details of the benchmark sets are as follows:\n        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and\n    mixed influence interactions.\\n\n        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular\n    dynamics with a 0.25fs time step at 300K using the Langevin thermostat of 14 well-known drug molecules and 2 small\n    proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point\n    calculations are performed to calculate energies and forces.\\n\n        GDB7to9: Consists of 1500 molecules where 500 per 7, 8 and 9 heavy atoms subsampled from the GDB-11 dataset.\n    The intial structure are randomly embedded into 3D space using RDKit and are optimized with tight convergence\n    criteria. Normal modes/force constants are computer using the reference DFT model. Finally, Diverse normal\n    mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\\n\n        GDB10to13: Consists of 3000 molecules where 500 molecules per 10 and 11 heavy atoms are subsampled from GDB-11\n    and 1000 molecules per 12 and 13 heavy atom are subsampled from GDB-13. Non-equilibrium conformations are\n    generated via DNMS.\\n\n        Tripeptide: Consists of 248 random tripeptides. 
Structures are optimized similar to GDB7to9.\\n\n        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.\n    Structures are optimized similar to GDB7to9.\n\n    Usage:\n    ```python\n    from openqdc.datasets import COMP6\n    dataset = COMP6()\n    ```\n\n    References:\n        https://aip.scitation.org/doi/abs/10.1063/1.5023802\\n\n        https://github.com/isayev/COMP6\\n\n        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\\n\n        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\\n\n        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\\n\n        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h\n    \"\"\"\n\n    __name__ = \"comp6\"\n\n    # watchout that forces are stored as -grad(E)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"  # angstorm\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g*\",\n        PotentialMethod.B3LYP_D3_BJ_DEF2_TZVP,  # \"b3lyp-d3(bj)/def2-tzvp\",\n        PotentialMethod.B3LYP_DEF2_TZVP,  # \"b3lyp/def2-tzvp\",\n        PotentialMethod.HF_DEF2_TZVP,  # \"hf/def2-tzvp\",\n        PotentialMethod.PBE_D3_BJ_DEF2_TZVP,  # \"pbe-d3(bj)/def2-tzvp\",\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n        PotentialMethod.SVWN_DEF2_TZVP,  # \"svwn/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"Energy\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP:def2-tzvp\",\n        \"HF:def2-tzvp\",\n        \"PBE-D3M(BJ):def2-tzvp\",\n        \"PBE:def2-tzvp\",\n        \"SVWN:def2-tzvp\",\n    ]\n    __force_mask__ = [True, False, False, False, False, False, False]\n\n    force_target_names = [\n        \"Gradient\",\n    ]\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n      
  \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        samples = []\n        for subset in [\"ani_md\", \"drugbank\", \"gdb7_9\", \"gdb10_13\", \"s66x8\", \"tripeptides\"]:\n            raw_path = p_join(self.root, f\"{subset}.h5.gz\")\n            samples += read_qc_archive_h5(raw_path, subset, self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/comp6.html#openqdc.datasets.potential.comp6.COMP6.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/comp6.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/des.html","title":"DES","text":""},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES370K","title":"DES370K","text":"

    Bases: BaseInteractionDataset, IDES

    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules and ions) including water and functional groups found in proteins. Dimer geometries are generated using QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.

    Usage:

    from openqdc.datasets import DES370K\ndataset = DES370K()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES370K(BaseInteractionDataset, IDES):\n    \"\"\"\n    DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies\n    computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules\n    and ions) including water and functional groups found in proteins. Dimer geometries are generated using\n    QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES370K\n    dataset = DES370K()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des370k_interaction\"\n    __filename__ = \"DES370K.csv\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVDZ,\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_CC_PVDZ,\n        InteractionMethod.CCSD_T_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        
InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"cc_MP2_all\",\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"cc_CCSD(T)_all\",\n        \"cbs_CCSD(T)_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n        \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES370K.zip\": \"https://zenodo.org/record/5676266/files/DES370K.zip\",\n    }\n\n    @property\n    def csv_path(self):\n        return os.path.join(self.root, self.__filename__)\n\n    def _create_subsets(self, **kwargs):\n        return create_subset(kwargs[\"smiles0\"], kwargs[\"smiles1\"])\n\n    def read_raw_entries(self) -> List[Dict]:\n        filepath = self.csv_path\n        logger.info(f\"Reading {self.__name__} interaction data from {filepath}\")\n        df = pd.read_csv(filepath)\n        data = []\n        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):\n            item = parse_des_df(row, self.energy_target_names)\n            item[\"subset\"] = self._create_subsets(row=row, **item)\n            item = convert_to_record(item)\n            data.append(item)\n        return data\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DES5M","title":"DES5M","text":"

    Bases: DES370K

    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to DES370K using QM-based optimization and MD simulations.

    Usage:

    from openqdc.datasets import DES5M\ndataset = DES5M()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DES5M(DES370K):\n    \"\"\"\n    DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies\n    computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using\n    QM based optimization and MD simulations.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DES5M\n    dataset = DES5M()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des5m_interaction\"\n    __filename__ = \"DES5M.csv\"\n\n    __energy_methods__ = [\n        InteractionMethod.MP2_CC_PVQZ,\n        InteractionMethod.MP2_CC_PVTZ,\n        InteractionMethod.MP2_CBS,\n        InteractionMethod.CCSD_T_NN,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.EX_S2,\n        InterEnergyType.IND,\n        InterEnergyType.EX_IND,\n        InterEnergyType.DISP,\n        InterEnergyType.EX_DISP_OS,\n        InterEnergyType.EX_DISP_SS,\n        InterEnergyType.DELTA_HF,\n    ]\n\n    energy_target_names = [\n        \"qz_MP2_all\",\n        \"tz_MP2_all\",\n        \"cbs_MP2_all\",\n        \"nn_CCSD(T)_all\",\n        \"sapt_all\",\n        \"sapt_es\",\n        \"sapt_ex\",\n        \"sapt_exs2\",\n        \"sapt_ind\",\n 
       \"sapt_exind\",\n        \"sapt_disp\",\n        \"sapt_exdisp_os\",\n        \"sapt_exdisp_ss\",\n        \"sapt_delta_HF\",\n    ]\n    __links__ = {\n        \"DES5M.zip\": \"https://zenodo.org/records/5706002/files/DESS5M.zip?download=1\",\n    }\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66","title":"DESS66","text":"

    Bases: DES370K

    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66\ndataset = DESS66()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    S66: https://pubs.acs.org/doi/10.1021/ct2002946

    Source code in openqdc/datasets/interaction/des.py
    class DESS66(DES370K):\n    \"\"\"\n    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry giving 66 conformers in total.\n    The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66\n    dataset = DESS66()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\\n\n        S66: https://pubs.acs.org/doi/10.1021/ct2002946\n    \"\"\"\n\n    __name__ = \"des_s66\"\n    __filename__ = \"DESS66.csv\"\n    __links__ = {\"DESS66.zip\": \"https://zenodo.org/records/5676284/files/DESS66.zip?download=1\"}\n\n    def _create_subsets(self, **kwargs):\n        return kwargs[\"row\"][\"system_name\"]\n
    "},{"location":"API/datasets/des.html#openqdc.datasets.interaction.des.DESS66x8","title":"DESS66x8","text":"

    Bases: DESS66

    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.

    Usage:

    from openqdc.datasets import DESS66x8\ndataset = DESS66x8()\n

    Reference

    https://www.nature.com/articles/s41597-021-00833-x

    Source code in openqdc/datasets/interaction/des.py
    class DESS66x8(DESS66):\n    \"\"\"\n    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS\n    dimer interaction energies with 1 equilibrium geometry and 8 geometries along the dissociation curve\n    giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.\n\n    Usage:\n    ```python\n    from openqdc.datasets import DESS66x8\n    dataset = DESS66x8()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/s41597-021-00833-x\n    \"\"\"\n\n    __name__ = \"des_s66x8\"\n    __filename__ = \"DESS66x8.csv\"\n    __links__ = {\"DESS66x8.zip\": \"https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1\"}\n
    "},{"location":"API/datasets/gdml.html","title":"GDML","text":""},{"location":"API/datasets/gdml.html#openqdc.datasets.potential.gdml.GDML","title":"GDML","text":"

    Bases: BaseDataset

    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of Benzene (627000 conformations), Uracil (133000 conformations), Naphthalene (326000 conformations), Aspirin (211000 conformations), Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations), Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for each conformation are computed using the PBE + vdW-TS electronic structure method.

    The dataset consists of the following trajectories

    Benzene: 627000 samples

    Uracil: 133000 samples

    Naphthalene: 326000 samples

    Aspirin: 211000 samples

    Salicylic Acid: 320000 samples

    Malonaldehyde: 993000 samples

    Ethanol: 555000 samples

    Toluene: 100000 samples

    Usage:

    from openqdc.datasets import GDML\ndataset = GDML()\n

    References

    https://www.science.org/doi/10.1126/sciadv.1603015 http://www.sgdml.org/#datasets

    Source code in openqdc/datasets/potential/gdml.py
    class GDML(BaseDataset):\n    \"\"\"\n    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio\n    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. The dataset consists of, Benzene\n    (627000 conformations), Uracil (133000 conformations), Naptalene (326000 conformations), Aspirin\n    (211000 conformations) Salicylic Acid (320000 conformations), Malonaldehyde (993000 conformations),\n    Ethanol (555000 conformations) and Toluene (100000 conformations). Energy and force labels for\n    each conformation are computed using the PBE + vdW-TS electronic structure method.\n    molecular dynamics (AIMD) trajectories.\n\n    The dataset consists of the following trajectories:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import GDML\n    dataset = GDML()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.1603015\n        http://www.sgdml.org/#datasets\n    \"\"\"\n\n    __name__ = \"gdml\"\n\n    __energy_methods__ = [\n        PotentialMethod.CCSD_CC_PVDZ,  # \"ccsd/cc-pvdz\",\n        PotentialMethod.CCSD_T_CC_PVDZ,  # \"ccsd(t)/cc-pvdz\",\n        # TODO: verify if basis set vdw-ts == def2-tzvp and\n        # it is the same in ISO17 and revmd17\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",  # MD17\n    ]\n\n    energy_target_names = [\n        \"CCSD Energy\",\n        \"CCSD(T) Energy\",\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True, True, True]\n\n    force_target_names = [\n        \"CCSD Gradient\",\n        \"CCSD(T) Gradient\",\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n 
   __forces_unit__ = \"kcal/mol/ang\"\n    __links__ = {\n        \"gdb7_9.hdf5.gz\": \"https://zenodo.org/record/3588361/files/208.hdf5.gz\",\n        \"gdb10_13.hdf5.gz\": \"https://zenodo.org/record/3588364/files/209.hdf5.gz\",\n        \"drugbank.hdf5.gz\": \"https://zenodo.org/record/3588361/files/207.hdf5.gz\",\n        \"tripeptides.hdf5.gz\": \"https://zenodo.org/record/3588368/files/211.hdf5.gz\",\n        \"ani_md.hdf5.gz\": \"https://zenodo.org/record/3588341/files/205.hdf5.gz\",\n        \"s66x8.hdf5.gz\": \"https://zenodo.org/record/3588367/files/210.hdf5.gz\",\n    }\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"gdml.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"gdml\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/geom.html","title":"GEOM","text":"

    Bases: BaseDataset

    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry. For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.

    Usage:

    from openqdc.datasets import GEOM\ndataset = GEOM()\n

    References

    https://www.nature.com/articles/s41597-022-01288-4

    https://github.com/learningmatter-mit/geom

    CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d

    Source code in openqdc/datasets/potential/geom.py
    class GEOM(BaseDataset):\n    \"\"\"\n    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules\n    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.\n    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method and\n    the lowest energy conformer is fed to the CREST software. CREST software uses metadynamics for exploring the\n    conformational space for each molecule. Energies in the dataset are computed using semi-empirical method GFN2-xTB.\n\n    Usage:\n    ```python\n    from openqdc.datasets import GEOM\n    dataset = GEOM()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-022-01288-4\\n\n        https://github.com/learningmatter-mit/geom\\n\n        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d\n    \"\"\"\n\n    __name__ = \"geom\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n\n    energy_target_names = [\"gfn2_xtb.energy\"]\n    force_target_names = []\n\n    partitions = [\"qm9\", \"drugs\"]\n    __links__ = {\"rdkit_folder.tar.gz\": \"https://dataverse.harvard.edu/api/access/datafile/4327252\"}\n\n    def _read_raw_(self, partition):\n        raw_path = p_join(self.root, \"rdkit_folder\")\n\n        mols = load_json(p_join(raw_path, f\"summary_{partition}.json\"))\n        mols = list(mols.items())\n\n        fn = lambda x: read_mol(x[0], x[1], raw_path, partition)  # noqa E731\n        samples = dm.parallelized(fn, mols, n_jobs=1, progress=True)  # don't use more than 1 job\n        return samples\n\n    def read_raw_entries(self):\n        samples = sum([self._read_raw_(partition) for partition in self.partitions], [])\n        return samples\n
    "},{"location":"API/datasets/iso_17.html","title":"ISO_17","text":""},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17","title":"ISO17","text":"

    Bases: BaseDataset

    The ISO17 dataset consists of the largest set of isomers from the QM9 dataset that share a fixed composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.

    Usage:

    from openqdc.datasets import ISO17\ndataset = ISO17()\n

    References

    https://arxiv.org/abs/1706.08566

    https://arxiv.org/abs/1609.08259

    https://www.nature.com/articles/sdata201422

    https://pubmed.ncbi.nlm.nih.gov/10062328/

    https://pubmed.ncbi.nlm.nih.gov/19257665/

    Source code in openqdc/datasets/potential/iso_17.py
    class ISO17(BaseDataset):\n    \"\"\"\n    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of\n    atoms (C7O2H10) arranged in different chemically valid structures. It consist of 129 molecules, each containing\n    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics\n    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient\n    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der\n    Waals correction method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import ISO17\n    dataset = ISO17()\n    ```\n\n    References:\n        https://arxiv.org/abs/1706.08566\\n\n        https://arxiv.org/abs/1609.08259\\n\n        https://www.nature.com/articles/sdata201422\\n\n        https://pubmed.ncbi.nlm.nih.gov/10062328/\\n\n        https://pubmed.ncbi.nlm.nih.gov/19257665/\n    \"\"\"\n\n    __name__ = \"iso_17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP,  # \"pbe/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"iso_17.hdf5.gz\": \"https://zenodo.org/record/3585907/files/216.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"iso_17.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"iso_17\", self.energy_target_names, self.force_target_names)\n\n     
   return samples\n
    "},{"location":"API/datasets/iso_17.html#openqdc.datasets.potential.iso_17.ISO17.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format

    Source code in openqdc/datasets/potential/iso_17.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"-\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/l7.html","title":"L7","text":""},{"location":"API/datasets/l7.html#openqdc.datasets.interaction.l7.L7","title":"L7","text":"

    Bases: YamlDataset

    The L7 interaction energy dataset consists of 7 dispersion-stabilized non-covalent complexes with energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are taken from crystal X-ray data and optimized with a DFT method specific to the complex.

    Usage:

    from openqdc.datasets import L7\ndataset = L7()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct400036b

    Source code in openqdc/datasets/interaction/l7.py
    class L7(YamlDataset):\n    \"\"\"\n    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with\n    energies labelled using semi-empirical and quantum mechanical methods. The intial geometries are\n    taken from crystal X-ray data and optimized with a DFT method specific to the complex.\n\n    Usage:\n    ```python\n    from openqdc.datasets import L7\n    dataset = L7()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct400036b\n    \"\"\"\n\n    __name__ = \"l7\"\n    __energy_methods__ = [\n        InteractionMethod.QCISDT_CBS,  # \"QCISD(T)/CBS\",\n        InteractionMethod.DLPNO_CCSDT,  # \"DLPNO-CCSD(T)\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.MP2C_CBS,  # \"MP2C/CBS\",\n        InteractionMethod.FIXED,  # \"fixed\", TODO: we should remove this level of theory because unless we have a pro\n        InteractionMethod.DLPNO_CCSDT0,  # \"DLPNO-CCSD(T0)\",\n        InteractionMethod.LNO_CCSDT,  # \"LNO-CCSD(T)\",\n        InteractionMethod.FN_DMC,  # \"FN-DMC\",\n    ]\n    __links__ = {\n        \"l7.yaml\": \"http://cuby4.molecular.cz/download_datasets/l7.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/L7.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.geometry.split(\":\")[1]\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        return np.array([int(item.setup[\"molecule_a\"][\"selection\"].split(\"-\")[1])], dtype=np.int32)\n
    "},{"location":"API/datasets/maceoff.html","title":"MaceOFF","text":""},{"location":"API/datasets/maceoff.html#openqdc.datasets.potential.maceoff.MACEOFF","title":"MACEOFF","text":"

    Bases: BaseDataset

    The core of the MACEOFF dataset consists of the Spice V1 dataset. 95% of the data are used for training and validation under the \"train\" split, and 5% for testing. The dataset uses the Spice level of theory \u03c9B97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software. MACEOFF uses a subset of SPICE that contains the ten chemical elements H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular non-bonded interactions, the MACEOFF dataset contains larger 50\u201390 atom molecules randomly selected from the QMugs dataset. MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations of liquid water, with sizes of up to 50 water molecules, as well as part of the COMP6 tripeptide geometry dataset.

    Usage:

    from openqdc.datasets import MACEOFF\ndataset = MACEOFF()\n

    Species

    [H, C, N, O, F, P, S, Cl, Br, I]

    References

    https://arxiv.org/pdf/2312.15211

    https://doi.org/10.17863/CAM.107498

    Source code in openqdc/datasets/potential/maceoff.py
    class MACEOFF(BaseDataset):\n    \"\"\"\n    MACEOFF dataset core of the dataset consist in the Spice V1 dataset.\n    95% of the data are used for training and validation under the \"train\" split,\n    and 5% for testing. The dataset uses the Spice level of theory\n    \u03c9B97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.\n    MACEOFF uses a subset of SPICE that contains the ten chemical elements\n    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.\n    MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular\n    non-bonded interactions, MACEOFF dataset contains larger 50\u201390 atom molecules\n    randomly selected from the QMugs dataset.\n    MACEOFF contains a number of water clusters carved out of molecular dynamics simulations\n    of liquid water, with sizes of up to 50 water molecules and part of the\n    COMP6 tripeptide geometry dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MACEOFF\n    dataset = MACEOFF()\n    ```\n\n    Species:\n        [H, C, N, O, F, P, S, Cl, Br, I]\n\n    References:\n        https://arxiv.org/pdf/2312.15211\\n\n        https://doi.org/10.17863/CAM.107498\n    \"\"\"\n\n    __name__ = \"maceoff\"\n\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n\n    energy_target_names = [\"dft_total_energy\"]\n    force_target_names = [\"dft_total_gradient\"]\n\n    __links__ = {\n        \"train_large_neut_no_bad_clean.tar.gz\": \"https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content\",  # noqa: E501\n        \"test_large_neut_all.tar.gz\": \"https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content\",  # noqa: E501\n    }\n\n    def read_raw_entries(self):\n        entries = []\n        for filename in 
self.__links__:\n            filename = filename.split(\".\")[0]\n            xyzpath = p_join(self.root, f\"{filename}.xyz\")\n            split = filename.split(\"_\")[0]\n            structure_iterator = parse_mace_xyz(xyzpath)\n            func = partial(build_data_object, split=split)\n            entries.extend(dm.utils.parallelized(func, structure_iterator))\n        return entries\n\n    def __getitem__(self, idx):\n        data = super().__getitem__(idx)\n        data.__setattr__(\"split\", self._convert_array(self.data[\"split\"][idx]))\n        return data\n
    "},{"location":"API/datasets/md22.html","title":"MD22","text":""},{"location":"API/datasets/md22.html#openqdc.datasets.potential.md22.MD22","title":"MD22","text":"

    Bases: RevMD17

    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD level of theory.

    Usage:

    from openqdc.datasets import MD22\ndataset = MD22()\n

    Reference

    https://arxiv.org/abs/2209.14865

    Source code in openqdc/datasets/potential/md22.py
    class MD22(RevMD17):\n    \"\"\"\n    MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,\n    ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories\n    are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD\n    level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MD22\n    dataset = MD22()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2209.14865\n    \"\"\"\n\n    __name__ = \"md22\"\n    __links__ = {\n        f\"{x}.npz\": f\"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz\"\n        for x in [\n            \"Ac-Ala3-NHMe\",\n            \"DHA\",\n            \"stachyose\",\n            \"AT-AT\",\n            \"AT-AT-CG-CG\",\n            \"double-walled_nanotube\",\n            \"buckyball-catcher\",\n        ]\n    }\n\n    def read_raw_entries(self):\n        entries_list = []\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/metcalf.html","title":"Metcalf","text":""},{"location":"API/datasets/metcalf.html#openqdc.datasets.interaction.metcalf.Metcalf","title":"Metcalf","text":"

    Bases: BaseInteractionDataset

    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various components.

    Usage:

    from openqdc.datasets import Metcalf\ndataset = Metcalf()\n

    Reference

    https://doi.org/10.1063/1.5142636

    Source code in openqdc/datasets/interaction/metcalf.py
    class Metcalf(BaseInteractionDataset):\n    \"\"\"\n    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to\n    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and\n    the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various\n    components.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Metcalf\n    dataset = Metcalf()\n    ```\n\n    Reference:\n        https://doi.org/10.1063/1.5142636\n    \"\"\"\n\n    __name__ = \"metcalf\"\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDZ,\n    ]\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = [\n        \"total energy\",\n        \"electrostatic energy\",\n        \"exchange energy\",\n        \"induction energy\",\n        \"dispersion energy\",\n    ]\n    __links__ = {\"model-data.tar.gz\": \"https://zenodo.org/records/10934211/files/model-data.tar?download=1\"}\n\n    def read_raw_entries(self) -> List[Dict]:\n        # extract in folders\n        extract_raw_tar_gz(self.root)\n        data = []\n        for filename in glob(self.root + f\"{os.sep}*.xyz\"):\n            data.extend(read_xyz(filename, self.__name__))\n        return data\n
    "},{"location":"API/datasets/molecule3d.html","title":"Molecule3D","text":""},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.Molecule3D","title":"Molecule3D","text":"

    Bases: BaseDataset

    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, or with damaged log files.

    Usage:

    from openqdc.datasets import Molecule3D\ndataset = Molecule3D()\n

    References

    https://arxiv.org/abs/2110.01717

    https://github.com/divelab/MoleculeX

    Source code in openqdc/datasets/potential/molecule3d.py
    class Molecule3D(BaseDataset):\n    \"\"\"\n    Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the\n    B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing\n    molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems,\n    or with damaged log files.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Molecule3D\n    dataset = Molecule3D()\n    ```\n\n    References:\n        https://arxiv.org/abs/2110.01717\\n\n        https://github.com/divelab/MoleculeX\n    \"\"\"\n\n    __name__ = \"molecule3d\"\n    __energy_methods__ = [PotentialMethod.B3LYP_6_31G_D]  # \"b3lyp/6-31g*\",\n    # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY\n    __energy_unit__ = \"ev\"  # CALCULATED\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"molecule3d.zip\": \"https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy\"}\n\n    energy_target_names = [\"b3lyp/6-31g*.energy\"]\n\n    def read_raw_entries(self):\n        raw = p_join(self.root, \"data\", \"raw\")\n        sdf_paths = glob(p_join(raw, \"*.sdf\"))\n        properties_path = p_join(raw, \"properties.csv\")\n\n        fn = lambda x: _read_sdf(x, properties_path)\n        res = dm.parallelized(fn, sdf_paths, n_jobs=1)  # don't use more than 1 job\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol","title":"read_mol(mol, energy)","text":"

    Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--parameters","title":"Parameters","text":"

    mol: Chem.rdchem.Mol RDKit molecule energy: float Energy of the molecule

    "},{"location":"API/datasets/molecule3d.html#openqdc.datasets.potential.molecule3d.read_mol--returns","title":"Returns","text":"

    res: dict Dictionary containing the following keys: - name: np.ndarray of shape (N,) containing the smiles of the molecule - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions - energies: np.ndarray of shape (1,) containing the energy of the conformer - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer - subset: np.ndarray of shape (1) containing \"molecule3d\"

    Source code in openqdc/datasets/potential/molecule3d.py
    def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:\n    \"\"\"Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies\n\n    Parameters\n    ----------\n    mol: Chem.rdchem.Mol\n        RDKit molecule\n    energy: float\n        Energy of the molecule\n\n    Returns\n    -------\n    res: dict\n        Dictionary containing the following keys:\n        - name: np.ndarray of shape (N,) containing the smiles of the molecule\n        - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions\n        - energies: np.ndarray of shape (1,) containing the energy of the conformer\n        - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer\n        - subset: np.ndarray of shape (1) containing \"molecule3d\"\n    \"\"\"\n    smiles = dm.to_smiles(mol, explicit_hs=False)\n    # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)\n    x = get_atomic_number_and_charge(mol)\n    positions = mol.GetConformer().GetPositions()\n\n    res = dict(\n        name=np.array([smiles]),\n        subset=np.array([\"molecule3d\"]),\n        energies=np.array([energy]).astype(np.float64)[:, None],\n        atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32),\n        n_atoms=np.array([x.shape[0]], dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/multixcqm9.html","title":"MultixcQM9","text":""},{"location":"API/datasets/multixcqm9.html#openqdc.datasets.potential.multixcqm9.MultixcQM9","title":"MultixcQM9","text":"

    Bases: BaseDataset

    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting of 133K QM9 molecule geometries calculated with 76 different DFT functionals and three different basis sets, resulting in 228 energy values for each molecule, along with the semi-empirical method GFN2-xTB. Geometries for the molecules are taken directly from Kim et al., who use the G4MP2 method.

    Usage:

    from openqdc.datasets import MultixcQM9\ndataset = MultixcQM9()\n

    References

    https://www.nature.com/articles/s41597-023-02690-2

    https://github.com/chemsurajit/largeDFTdata

    https://www.nature.com/articles/s41597-019-0121-7

    Source code in openqdc/datasets/potential/multixcqm9.py
    class MultixcQM9(BaseDataset):\n    \"\"\"\n    MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting\n    of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets\n    resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the\n    molecules are used directly from Kim et al. which uses G4MP2 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import MultixcQM9\n    dataset = MultixcQM9()\n    ```\n\n    References:\n        https://www.nature.com/articles/s41597-023-02690-2\\n\n        https://github.com/chemsurajit/largeDFTdata\\n\n        https://www.nature.com/articles/s41597-019-0121-7\\n\n    \"\"\"\n\n    __name__ = \"multixcqm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.KCIS_MODIFIED_DZP,\n        PotentialMethod.KCIS_ORIGINAL_DZP,\n        PotentialMethod.PKZB_DZP,\n        PotentialMethod.VS98_DZP,\n        PotentialMethod.LDA_VWN_DZP,\n        PotentialMethod.PW91_DZP,\n        PotentialMethod.BLYP_DZP,\n        PotentialMethod.BP_DZP,\n        PotentialMethod.PBE_DZP,\n        PotentialMethod.RPBE_DZP,\n        PotentialMethod.REVPBE_DZP,\n        PotentialMethod.OLYP_DZP,\n        PotentialMethod.FT97_DZP,\n        PotentialMethod.BLAP3_DZP,\n        PotentialMethod.HCTH_93_DZP,\n        PotentialMethod.HCTH_120_DZP,\n        PotentialMethod.HCTH_147_DZP,\n        PotentialMethod.HCTH_407_DZP,\n        PotentialMethod.BMTAU1_DZP,\n        PotentialMethod.BOP_DZP,\n        PotentialMethod.PKZBX_KCISCOR_DZP,\n        PotentialMethod.VS98_X_XC_DZP,\n        PotentialMethod.VS98_X_ONLY_DZP,\n        PotentialMethod.BECKE00_DZP,\n        PotentialMethod.BECKE00X_XC_DZP,\n        PotentialMethod.BECKE00_X_ONLY_DZP,\n        PotentialMethod.BECKE88X_BR89C_DZP,\n        PotentialMethod.OLAP3_DZP,\n        PotentialMethod.TPSS_DZP,\n        PotentialMethod.MPBE_DZP,\n      
  PotentialMethod.OPBE_DZP,\n        PotentialMethod.OPERDEW_DZP,\n        PotentialMethod.MPBEKCIS_DZP,\n        PotentialMethod.MPW_DZP,\n        PotentialMethod.TAU_HCTH_DZP,\n        PotentialMethod.XLYP_DZP,\n        PotentialMethod.KT1_DZP,\n        PotentialMethod.KT2_DZP,\n        PotentialMethod.M06_L_DZP,\n        PotentialMethod.BLYP_D_DZP,\n        PotentialMethod.BP86_D_DZP,\n        PotentialMethod.PBE_D_DZP,\n        PotentialMethod.TPSSD_DZP,\n        PotentialMethod.B97_D_DZP,\n        PotentialMethod.REVTPSS_DZP,\n        PotentialMethod.PBESOL_DZP,\n        PotentialMethod.RGE2_DZP,\n        PotentialMethod.SSB_D_DZP,\n        PotentialMethod.MVS_DZP,\n        PotentialMethod.MVSX_DZP,\n        PotentialMethod.TMGGA_DZP,\n        PotentialMethod.TPSSH_DZP,\n        PotentialMethod.B3LYP_VWN5_DZP,\n        PotentialMethod.O3LYP_VWN5_DZP,\n        PotentialMethod.KMLYP_VWN5_DZP,\n        PotentialMethod.PBE0_DZP,\n        PotentialMethod.B3LYP_S_VWN5_DZP,\n        PotentialMethod.BHANDH_DZP,\n        PotentialMethod.BHANDHLYP_DZP,\n        PotentialMethod.B97_DZP,\n        PotentialMethod.B97_1_DZP,\n        PotentialMethod.B97_2_DZP,\n        PotentialMethod.MPBE0KCIS_DZP,\n        PotentialMethod.MPBE1KCIS_DZP,\n        PotentialMethod.B1LYP_VWN5_DZP,\n        PotentialMethod.B1PW91_VWN5_DZP,\n        PotentialMethod.MPW1PW_DZP,\n        PotentialMethod.MPW1K_DZP,\n        PotentialMethod.TAU_HCTH_HYBRID_DZP,\n        PotentialMethod.X3LYP_VWN5_DZP,\n        PotentialMethod.OPBE0_DZP,\n        PotentialMethod.M05_DZP,\n        PotentialMethod.M05_2X_DZP,\n        PotentialMethod.M06_DZP,\n        PotentialMethod.M06_2X_DZP,\n        PotentialMethod.B3LYP_D_DZP,\n        PotentialMethod.KCIS_MODIFIED_TZP,\n        PotentialMethod.KCIS_ORIGINAL_TZP,\n        PotentialMethod.PKZB_TZP,\n        PotentialMethod.VS98_TZP,\n        PotentialMethod.LDA_VWN_TZP,\n        PotentialMethod.PW91_TZP,\n        PotentialMethod.BLYP_TZP,\n        
PotentialMethod.BP_TZP,\n        PotentialMethod.PBE_TZP,\n        PotentialMethod.RPBE_TZP,\n        PotentialMethod.REVPBE_TZP,\n        PotentialMethod.OLYP_TZP,\n        PotentialMethod.FT97_TZP,\n        PotentialMethod.BLAP3_TZP,\n        PotentialMethod.HCTH_93_TZP,\n        PotentialMethod.HCTH_120_TZP,\n        PotentialMethod.HCTH_147_TZP,\n        PotentialMethod.HCTH_407_TZP,\n        PotentialMethod.BMTAU1_TZP,\n        PotentialMethod.BOP_TZP,\n        PotentialMethod.PKZBX_KCISCOR_TZP,\n        PotentialMethod.VS98_X_XC_TZP,\n        PotentialMethod.VS98_X_ONLY_TZP,\n        PotentialMethod.BECKE00_TZP,\n        PotentialMethod.BECKE00X_XC_TZP,\n        PotentialMethod.BECKE00_X_ONLY_TZP,\n        PotentialMethod.BECKE88X_BR89C_TZP,\n        PotentialMethod.OLAP3_TZP,\n        PotentialMethod.TPSS_TZP,\n        PotentialMethod.MPBE_TZP,\n        PotentialMethod.OPBE_TZP,\n        PotentialMethod.OPERDEW_TZP,\n        PotentialMethod.MPBEKCIS_TZP,\n        PotentialMethod.MPW_TZP,\n        PotentialMethod.TAU_HCTH_TZP,\n        PotentialMethod.XLYP_TZP,\n        PotentialMethod.KT1_TZP,\n        PotentialMethod.KT2_TZP,\n        PotentialMethod.M06_L_TZP,\n        PotentialMethod.BLYP_D_TZP,\n        PotentialMethod.BP86_D_TZP,\n        PotentialMethod.PBE_D_TZP,\n        PotentialMethod.TPSSD_TZP,\n        PotentialMethod.B97_D_TZP,\n        PotentialMethod.REVTPSS_TZP,\n        PotentialMethod.PBESOL_TZP,\n        PotentialMethod.RGE2_TZP,\n        PotentialMethod.SSB_D_TZP,\n        PotentialMethod.MVS_TZP,\n        PotentialMethod.MVSX_TZP,\n        PotentialMethod.TMGGA_TZP,\n        PotentialMethod.TPSSH_TZP,\n        PotentialMethod.B3LYP_VWN5_TZP,\n        PotentialMethod.O3LYP_VWN5_TZP,\n        PotentialMethod.KMLYP_VWN5_TZP,\n        PotentialMethod.PBE0_TZP,\n        PotentialMethod.B3LYP_S_VWN5_TZP,\n        PotentialMethod.BHANDH_TZP,\n        PotentialMethod.BHANDHLYP_TZP,\n        PotentialMethod.B97_TZP,\n        
PotentialMethod.B97_1_TZP,\n        PotentialMethod.B97_2_TZP,\n        PotentialMethod.MPBE0KCIS_TZP,\n        PotentialMethod.MPBE1KCIS_TZP,\n        PotentialMethod.B1LYP_VWN5_TZP,\n        PotentialMethod.B1PW91_VWN5_TZP,\n        PotentialMethod.MPW1PW_TZP,\n        PotentialMethod.MPW1K_TZP,\n        PotentialMethod.TAU_HCTH_HYBRID_TZP,\n        PotentialMethod.X3LYP_VWN5_TZP,\n        PotentialMethod.OPBE0_TZP,\n        PotentialMethod.M05_TZP,\n        PotentialMethod.M05_2X_TZP,\n        PotentialMethod.M06_TZP,\n        PotentialMethod.M06_2X_TZP,\n        PotentialMethod.B3LYP_D_TZP,\n        PotentialMethod.KCIS_MODIFIED_SZ,\n        PotentialMethod.KCIS_ORIGINAL_SZ,\n        PotentialMethod.PKZB_SZ,\n        PotentialMethod.VS98_SZ,\n        PotentialMethod.LDA_VWN_SZ,\n        PotentialMethod.PW91_SZ,\n        PotentialMethod.BLYP_SZ,\n        PotentialMethod.BP_SZ,\n        PotentialMethod.PBE_SZ,\n        PotentialMethod.RPBE_SZ,\n        PotentialMethod.REVPBE_SZ,\n        PotentialMethod.OLYP_SZ,\n        PotentialMethod.FT97_SZ,\n        PotentialMethod.BLAP3_SZ,\n        PotentialMethod.HCTH_93_SZ,\n        PotentialMethod.HCTH_120_SZ,\n        PotentialMethod.HCTH_147_SZ,\n        PotentialMethod.HCTH_407_SZ,\n        PotentialMethod.BMTAU1_SZ,\n        PotentialMethod.BOP_SZ,\n        PotentialMethod.PKZBX_KCISCOR_SZ,\n        PotentialMethod.VS98_X_XC_SZ,\n        PotentialMethod.VS98_X_ONLY_SZ,\n        PotentialMethod.BECKE00_SZ,\n        PotentialMethod.BECKE00X_XC_SZ,\n        PotentialMethod.BECKE00_X_ONLY_SZ,\n        PotentialMethod.BECKE88X_BR89C_SZ,\n        PotentialMethod.OLAP3_SZ,\n        PotentialMethod.TPSS_SZ,\n        PotentialMethod.MPBE_SZ,\n        PotentialMethod.OPBE_SZ,\n        PotentialMethod.OPERDEW_SZ,\n        PotentialMethod.MPBEKCIS_SZ,\n        PotentialMethod.MPW_SZ,\n        PotentialMethod.TAU_HCTH_SZ,\n        PotentialMethod.XLYP_SZ,\n        PotentialMethod.KT1_SZ,\n        PotentialMethod.KT2_SZ,\n        
PotentialMethod.M06_L_SZ,\n        PotentialMethod.BLYP_D_SZ,\n        PotentialMethod.BP86_D_SZ,\n        PotentialMethod.PBE_D_SZ,\n        PotentialMethod.TPSSD_SZ,\n        PotentialMethod.B97_D_SZ,\n        PotentialMethod.REVTPSS_SZ,\n        PotentialMethod.PBESOL_SZ,\n        PotentialMethod.RGE2_SZ,\n        PotentialMethod.SSB_D_SZ,\n        PotentialMethod.MVS_SZ,\n        PotentialMethod.MVSX_SZ,\n        PotentialMethod.TMGGA_SZ,\n        PotentialMethod.TPSSH_SZ,\n        PotentialMethod.B3LYP_VWN5_SZ,\n        PotentialMethod.O3LYP_VWN5_SZ,\n        PotentialMethod.KMLYP_VWN5_SZ,\n        PotentialMethod.PBE0_SZ,\n        PotentialMethod.B3LYP_S_VWN5_SZ,\n        PotentialMethod.BHANDH_SZ,\n        PotentialMethod.BHANDHLYP_SZ,\n        PotentialMethod.B97_SZ,\n        PotentialMethod.B97_1_SZ,\n        PotentialMethod.B97_2_SZ,\n        PotentialMethod.MPBE0KCIS_SZ,\n        PotentialMethod.MPBE1KCIS_SZ,\n        PotentialMethod.B1LYP_VWN5_SZ,\n        PotentialMethod.B1PW91_VWN5_SZ,\n        PotentialMethod.MPW1PW_SZ,\n        PotentialMethod.MPW1K_SZ,\n        PotentialMethod.TAU_HCTH_HYBRID_SZ,\n        PotentialMethod.X3LYP_VWN5_SZ,\n        PotentialMethod.OPBE0_SZ,\n        PotentialMethod.M05_SZ,\n        PotentialMethod.M05_2X_SZ,\n        PotentialMethod.M06_SZ,\n        PotentialMethod.M06_2X_SZ,\n        PotentialMethod.B3LYP_D_SZ,\n        PotentialMethod.GFN2_XTB,\n    ]\n\n    energy_target_names = [\n        \"KCIS-MODIFIED/DZP\",\n        \"KCIS-ORIGINAL/DZP\",\n        \"PKZB/DZP\",\n        \"VS98/DZP\",\n        \"LDA(VWN)/DZP\",\n        \"PW91/DZP\",\n        \"BLYP/DZP\",\n        \"BP/DZP\",\n        \"PBE/DZP\",\n        \"RPBE/DZP\",\n        \"REVPBE/DZP\",\n        \"OLYP/DZP\",\n        \"FT97/DZP\",\n        \"BLAP3/DZP\",\n        \"HCTH/93/DZP\",\n        \"HCTH/120/DZP\",\n        \"HCTH/147/DZP\",\n        \"HCTH/407/DZP\",\n        \"BMTAU1/DZP\",\n        \"BOP/DZP\",\n        \"PKZBX-KCISCOR/DZP\",\n        
\"VS98-X(XC)/DZP\",\n        \"VS98-X-ONLY/DZP\",\n        \"BECKE00/DZP\",\n        \"BECKE00X(XC)/DZP\",\n        \"BECKE00-X-ONLY/DZP\",\n        \"BECKE88X+BR89C/DZP\",\n        \"OLAP3/DZP\",\n        \"TPSS/DZP\",\n        \"MPBE/DZP\",\n        \"OPBE/DZP\",\n        \"OPERDEW/DZP\",\n        \"MPBEKCIS/DZP\",\n        \"MPW/DZP\",\n        \"TAU-HCTH/DZP\",\n        \"XLYP/DZP\",\n        \"KT1/DZP\",\n        \"KT2/DZP\",\n        \"M06-L/DZP\",\n        \"BLYP-D/DZP\",\n        \"BP86-D/DZP\",\n        \"PBE-D/DZP\",\n        \"TPSS-D/DZP\",\n        \"B97-D/DZP\",\n        \"REVTPSS/DZP\",\n        \"PBESOL/DZP\",\n        \"RGE2/DZP\",\n        \"SSB-D/DZP\",\n        \"MVS/DZP\",\n        \"MVSX/DZP\",\n        \"T-MGGA/DZP\",\n        \"TPSSH/DZP\",\n        \"B3LYP(VWN5)/DZP\",\n        \"O3LYP(VWN5)/DZP\",\n        \"KMLYP(VWN5)/DZP\",\n        \"PBE0/DZP\",\n        \"B3LYP*(VWN5)/DZP\",\n        \"BHANDH/DZP\",\n        \"BHANDHLYP/DZP\",\n        \"B97/DZP\",\n        \"B97-1/DZP\",\n        \"B97-2/DZP\",\n        \"MPBE0KCIS/DZP\",\n        \"MPBE1KCIS/DZP\",\n        \"B1LYP(VWN5)/DZP\",\n        \"B1PW91(VWN5)/DZP\",\n        \"MPW1PW/DZP\",\n        \"MPW1K/DZP\",\n        \"TAU-HCTH-HYBRID/DZP\",\n        \"X3LYP(VWN5)/DZP\",\n        \"OPBE0/DZP\",\n        \"M05/DZP\",\n        \"M05-2X/DZP\",\n        \"M06/DZP\",\n        \"M06-2X/DZP\",\n        \"B3LYP-D/DZP\",\n        \"KCIS-MODIFIED/TZP\",\n        \"KCIS-ORIGINAL/TZP\",\n        \"PKZB/TZP\",\n        \"VS98/TZP\",\n        \"LDA(VWN)/TZP\",\n        \"PW91/TZP\",\n        \"BLYP/TZP\",\n        \"BP/TZP\",\n        \"PBE/TZP\",\n        \"RPBE/TZP\",\n        \"REVPBE/TZP\",\n        \"OLYP/TZP\",\n        \"FT97/TZP\",\n        \"BLAP3/TZP\",\n        \"HCTH/93/TZP\",\n        \"HCTH/120/TZP\",\n        \"HCTH/147/TZP\",\n        \"HCTH/407/TZP\",\n        \"BMTAU1/TZP\",\n        \"BOP/TZP\",\n        \"PKZBX-KCISCOR/TZP\",\n        \"VS98-X(XC)/TZP\",\n        
\"VS98-X-ONLY/TZP\",\n        \"BECKE00/TZP\",\n        \"BECKE00X(XC)/TZP\",\n        \"BECKE00-X-ONLY/TZP\",\n        \"BECKE88X+BR89C/TZP\",\n        \"OLAP3/TZP\",\n        \"TPSS/TZP\",\n        \"MPBE/TZP\",\n        \"OPBE/TZP\",\n        \"OPERDEW/TZP\",\n        \"MPBEKCIS/TZP\",\n        \"MPW/TZP\",\n        \"TAU-HCTH/TZP\",\n        \"XLYP/TZP\",\n        \"KT1/TZP\",\n        \"KT2/TZP\",\n        \"M06-L/TZP\",\n        \"BLYP-D/TZP\",\n        \"BP86-D/TZP\",\n        \"PBE-D/TZP\",\n        \"TPSS-D/TZP\",\n        \"B97-D/TZP\",\n        \"REVTPSS/TZP\",\n        \"PBESOL/TZP\",\n        \"RGE2/TZP\",\n        \"SSB-D/TZP\",\n        \"MVS/TZP\",\n        \"MVSX/TZP\",\n        \"T-MGGA/TZP\",\n        \"TPSSH/TZP\",\n        \"B3LYP(VWN5)/TZP\",\n        \"O3LYP(VWN5)/TZP\",\n        \"KMLYP(VWN5)/TZP\",\n        \"PBE0/TZP\",\n        \"B3LYP*(VWN5)/TZP\",\n        \"BHANDH/TZP\",\n        \"BHANDHLYP/TZP\",\n        \"B97/TZP\",\n        \"B97-1/TZP\",\n        \"B97-2/TZP\",\n        \"MPBE0KCIS/TZP\",\n        \"MPBE1KCIS/TZP\",\n        \"B1LYP(VWN5)/TZP\",\n        \"B1PW91(VWN5)/TZP\",\n        \"MPW1PW/TZP\",\n        \"MPW1K/TZP\",\n        \"TAU-HCTH-HYBRID/TZP\",\n        \"X3LYP(VWN5)/TZP\",\n        \"OPBE0/TZP\",\n        \"M05/TZP\",\n        \"M05-2X/TZP\",\n        \"M06/TZP\",\n        \"M06-2X/TZP\",\n        \"B3LYP-D/TZP\",\n        \"KCIS-MODIFIED/SZ\",\n        \"KCIS-ORIGINAL/SZ\",\n        \"PKZB/SZ\",\n        \"VS98/SZ\",\n        \"LDA(VWN)/SZ\",\n        \"PW91/SZ\",\n        \"BLYP/SZ\",\n        \"BP/SZ\",\n        \"PBE/SZ\",\n        \"RPBE/SZ\",\n        \"REVPBE/SZ\",\n        \"OLYP/SZ\",\n        \"FT97/SZ\",\n        \"BLAP3/SZ\",\n        \"HCTH/93/SZ\",\n        \"HCTH/120/SZ\",\n        \"HCTH/147/SZ\",\n        \"HCTH/407/SZ\",\n        \"BMTAU1/SZ\",\n        \"BOP/SZ\",\n        \"PKZBX-KCISCOR/SZ\",\n        \"VS98-X(XC)/SZ\",\n        \"VS98-X-ONLY/SZ\",\n        \"BECKE00/SZ\",\n        
\"BECKE00X(XC)/SZ\",\n        \"BECKE00-X-ONLY/SZ\",\n        \"BECKE88X+BR89C/SZ\",\n        \"OLAP3/SZ\",\n        \"TPSS/SZ\",\n        \"MPBE/SZ\",\n        \"OPBE/SZ\",\n        \"OPERDEW/SZ\",\n        \"MPBEKCIS/SZ\",\n        \"MPW/SZ\",\n        \"TAU-HCTH/SZ\",\n        \"XLYP/SZ\",\n        \"KT1/SZ\",\n        \"KT2/SZ\",\n        \"M06-L/SZ\",\n        \"BLYP-D/SZ\",\n        \"BP86-D/SZ\",\n        \"PBE-D/SZ\",\n        \"TPSS-D/SZ\",\n        \"B97-D/SZ\",\n        \"REVTPSS/SZ\",\n        \"PBESOL/SZ\",\n        \"RGE2/SZ\",\n        \"SSB-D/SZ\",\n        \"MVS/SZ\",\n        \"MVSX/SZ\",\n        \"T-MGGA/SZ\",\n        \"TPSSH/SZ\",\n        \"B3LYP(VWN5)/SZ\",\n        \"O3LYP(VWN5)/SZ\",\n        \"KMLYP(VWN5)/SZ\",\n        \"PBE0/SZ\",\n        \"B3LYP*(VWN5)/SZ\",\n        \"BHANDH/SZ\",\n        \"BHANDHLYP/SZ\",\n        \"B97/SZ\",\n        \"B97-1/SZ\",\n        \"B97-2/SZ\",\n        \"MPBE0KCIS/SZ\",\n        \"MPBE1KCIS/SZ\",\n        \"B1LYP(VWN5)/SZ\",\n        \"B1PW91(VWN5)/SZ\",\n        \"MPW1PW/SZ\",\n        \"MPW1K/SZ\",\n        \"TAU-HCTH-HYBRID/SZ\",\n        \"X3LYP(VWN5)/SZ\",\n        \"OPBE0/SZ\",\n        \"M05/SZ\",\n        \"M05-2X/SZ\",\n        \"M06/SZ\",\n        \"M06-2X/SZ\",\n        \"B3LYP-D/SZ\",\n        \"GFNXTB\",\n    ]\n\n    __energy_unit__ = \"ev\"  # to fix\n    __distance_unit__ = \"ang\"  # to fix\n    __forces_unit__ = \"ev/ang\"  # to fix\n    __links__ = {\n        \"xyz.zip\": \"https://data.dtu.dk/ndownloader/files/35143624\",\n        \"xtb.zip\": \"https://data.dtu.dk/ndownloader/files/42444300\",\n        \"dzp.zip\": \"https://data.dtu.dk/ndownloader/files/42443925\",\n        \"tzp.zip\": \"https://data.dtu.dk/ndownloader/files/42444129\",\n        \"sz.zip\": \"https://data.dtu.dk/ndownloader/files/42441345\",\n        \"failed_indices.dat\": \"https://data.dtu.dk/ndownloader/files/37337677\",\n    }\n\n    def _read_molecules_energies(self):\n        d = {\"DZP\": None, \"TZP\": 
None, \"SZ\": None, \"XTB\": None}\n        for basis in d.keys():\n            d[basis] = pd.read_csv(p_join(self.root, basis, \"molecules/molecules.csv\"), index_col=False).drop(\n                columns=[\"index\"]\n            )\n        return pd.concat([d[\"DZP\"], d[\"TZP\"], d[\"SZ\"], d[\"XTB\"]], axis=1, ignore_index=False)\n\n    def _read_all_xyzs(self):\n        xyz_list = read_xyz_files(self.root)\n        return pd.DataFrame(xyz_list)\n\n    def read_raw_entries(self):\n        df_energies = self._read_molecules_energies()\n        df_xyz = self._read_all_xyzs()\n        return [\n            {\"energies\": np.atleast_2d(en), **xyz_dict}\n            for xyz_dict, en in zip(df_xyz.to_dict(\"records\"), df_energies.values.astype(np.float64))\n        ]\n
    "},{"location":"API/datasets/nabladft.html","title":"NablaDFT","text":""},{"location":"API/datasets/nabladft.html#openqdc.datasets.potential.nabladft.NablaDFT","title":"NablaDFT","text":"

    Bases: BaseDataset

    NablaDFT is a dataset constructed from a subset of the Molecular Sets (MOSES) dataset consisting of 1 million molecules with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of conformations is generated using RDKit. Second, using the Butina clustering method on the conformations, clusters that cover 95% of the conformations are selected and the centroids of those clusters are taken as the final set. This results in 1-62 conformations per molecule. For generating quantum properties, the Kohn-Sham method at the wB97X-D/def2-SVP level of theory is used to compute the energy.

    Usage:

    from openqdc.datasets import NablaDFT\ndataset = NablaDFT()\n

    References

    https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D

    https://github.com/AIRI-Institute/nablaDFT

    Source code in openqdc/datasets/potential/nabladft.py
    class NablaDFT(BaseDataset):\n    \"\"\"\n    NablaDFT is a dataset constructed from a subset of the\n    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules\n    with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of\n    conformations are generated using RDKit. Second, using Butina Clustering Method on conformations, clusters that\n    cover 95% of the conformations are selected and the centroids of those clusters are selected as the final set.\n    This results in 1-62 conformations per molecule. For generating quantum properties, Kohn-Sham method at\n    wB97X-D/def2-XVP levels are used to generate the energy.\n\n    Usage:\n    ```python\n    from openqdc.datasets import NablaDFT\n    dataset = NablaDFT()\n    ```\n\n    References:\n        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\\n\n        https://github.com/AIRI-Institute/nablaDFT\n    \"\"\"\n\n    __name__ = \"nabladft\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D_DEF2_SVP,\n    ]  # \"wb97x-d/def2-svp\"\n\n    energy_target_names = [\"wb97x-d/def2-svp\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {\"nabladft.db\": \"https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db\"}\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    @requires_package(\"nablaDFT\")\n    def read_raw_entries(self):\n        from nablaDFT.dataset import HamiltonianDatabase\n\n        label_path = p_join(self.root, \"summary.csv\")\n        df = pd.read_csv(label_path, usecols=[\"MOSES id\", \"CONFORMER id\", \"SMILES\", \"DFT TOTAL ENERGY\"])\n        labels = 
df.set_index(keys=[\"MOSES id\", \"CONFORMER id\"]).to_dict(\"index\")\n\n        raw_path = p_join(self.root, \"dataset_full.db\")\n        train = HamiltonianDatabase(raw_path)\n        n, c = len(train), 20\n        step_size = int(np.ceil(n / os.cpu_count()))\n\n        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)\n        samples = dm.parallelized(\n            fn, list(range(c)), n_jobs=c, progress=False, scheduler=\"threads\"\n        )  # don't use more than 1 job\n\n        return sum(samples, [])\n
    "},{"location":"API/datasets/orbnet_denali.html","title":"Orbnet Denali","text":""},{"location":"API/datasets/orbnet_denali.html#openqdc.datasets.potential.orbnet_denali.OrbnetDenali","title":"OrbnetDenali","text":"

    Bases: BaseDataset

    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using normal mode sampling at 300 K or ab initio molecular dynamics (AIMD) for 200 fs at 500 K, using the GFN1-xTB level of theory. Energies are calculated using the DFT method wB97X-D3/def2-TZVP and the semi-empirical method GFN1-xTB.

    Usage:

    from openqdc.datasets import OrbnetDenali\ndataset = OrbnetDenali()\n

    References

    https://arxiv.org/abs/2107.00299

    https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867

    Source code in openqdc/datasets/potential/orbnet_denali.py
    class OrbnetDenali(BaseDataset):\n    \"\"\"\n    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range\n    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and\n    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.\n    First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer\n    generator. Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using\n    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200fs at 500K; using GFN1-xTB level of\n    theory. Energies are calculated using DFT method wB97X-D3/def2-TZVP and semi-empirical method GFN1-xTB level of\n    theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import OrbnetDenali\n    dataset = OrbnetDenali()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00299\\n\n        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867\n    \"\"\"\n\n    __name__ = \"orbnet_denali\"\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_DEF2_TZVP,\n        PotentialMethod.GFN1_XTB,\n    ]  # [\"wb97x-d3/def2-tzvp\", \"gfn1_xtb\"]\n    energy_target_names = [\"dft_energy\", \"xtb1_energy\"]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"orbnet_denali.tar.gz\": \"https://figshare.com/ndownloader/files/28672287\",\n        \"orbnet_denali_targets.tar.gz\": \"https://figshare.com/ndownloader/files/28672248\",\n    }\n\n    def read_raw_entries(self):\n        label_path = p_join(self.root, \"denali_labels.csv\")\n        df = pd.read_csv(label_path, usecols=[\"sample_id\", \"mol_id\", \"subset\", \"dft_energy\", \"xtb1_energy\"])\n        labels = {\n            mol_id: 
group.drop([\"mol_id\"], axis=1).drop_duplicates(\"sample_id\").set_index(\"sample_id\").to_dict(\"index\")\n            for mol_id, group in df.groupby(\"mol_id\")\n        }\n\n        fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)\n        res = dm.parallelized(fn, list(labels.items()), scheduler=\"threads\", n_jobs=-1, progress=True)\n        samples = sum(res, [])\n        return samples\n
    "},{"location":"API/datasets/pcqm.html","title":"PCQM","text":""},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_B3LYP","title":"PCQM_B3LYP","text":"

    Bases: PCQM_PM6

    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to biomolecules. The geometries of the molecules are optimized using PM6. Using the optimized geometries, the electronic structure and properties are calculated using the B3LYP/6-31G* method.

    Usage:

    from openqdc.datasets import PCQM_B3LYP\ndataset = PCQM_B3LYP()\n

    References

    https://arxiv.org/abs/2305.18454

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_B3LYP(PCQM_PM6):\n    \"\"\"\n    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises of 85 million molecules ranging from essential compounds to\n    biomolecules. The geometries for the molecule are optimized using PM6. Using the optimized geometry,\n    the electronic structure and properties are calculated using B3LIP/6-31G* method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_B3LYP\n    dataset = PCQM_B3LYP()\n    ```\n\n    References:\n        https://arxiv.org/abs/2305.18454\n    \"\"\"\n\n    __name__ = \"pubchemqc_b3lyp\"\n    __energy_methods__ = [\"b3lyp/6-31g*\"]\n    energy_target_names = [\"b3lyp\"]\n
    "},{"location":"API/datasets/pcqm.html#openqdc.datasets.potential.pcqm.PCQM_PM6","title":"PCQM_PM6","text":"

    Bases: BaseDataset

    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized molecular geometries and electronic properties. To generate the dataset, only molecules with weights less than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also computed using the PM6 method.

    Usage:

    from openqdc.datasets import PCQM_PM6\ndataset = PCQM_PM6()\n

    References

    https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740

    Source code in openqdc/datasets/potential/pcqm.py
    class PCQM_PM6(BaseDataset):\n    \"\"\"\n    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized\n    molecular geometries and electronic properties. To generate the dataset, only molecules with weights less\n    than 1000g/mol are considered from the PubChem ftp site. The initial structure is generated using OpenBabel\n    and then is optimized using geometry optimization with the semi-empirical method PM6. The energies are also\n    computed using the PM6 method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import PCQM_PM6\n    dataset = PCQM_PM6()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740\n    \"\"\"\n\n    __name__ = \"pubchemqc_pm6\"\n    __energy_methods__ = [PotentialMethod.PM6]\n\n    energy_target_names = [\"pm6\"]\n\n    __force_methods__ = []\n    force_target_names = []\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"pubchemqc\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def collate_list(self, list_entries):\n        predicat = list_entries is not None and len(list_entries) > 0\n        list_entries = [x for x in list_entries if x is not None]\n        if predicat:\n            res = super().collate_list(list_entries)\n        else:\n            res = None\n        return res\n\n    @property\n    def data_types(self):\n        return {\n            \"atomic_inputs\": np.float32,\n            \"position_idx_range\": np.int32,\n            \"energies\": np.float32,\n            \"forces\": np.float32,\n        }\n\n    def read_raw_entries(self):\n        arxiv_paths = glob(p_join(self.root, f\"{self.__energy_methods__[0]}\", \"*.pkl\"))\n        f = lambda x: self.collate_list(read_preprocessed_archive(x))\n        samples = dm.parallelized(f, arxiv_paths, 
n_jobs=1, progress=True)\n        samples = [x for x in samples if x is not None]\n        return samples\n\n    def preprocess(self, overwrite=False):\n        if overwrite or not self.is_preprocessed():\n            logger.info(\"Preprocessing data and saving it to cache.\")\n            logger.info(\n                f\"Dataset {self.__name__} data with the following units:\\n\"\n                f\"Energy: {self.energy_unit}, Distance: {self.distance_unit}, \"\n                f\"Forces: {self.force_unit if self.__force_methods__ else 'None'}\"\n            )\n            entries = self.read_raw_entries()\n            self.collate_and_save_list(entries)\n\n    def collate_and_save_list(self, list_entries):\n        n_molecules, n_atoms = 0, 0\n        for i in range(len(list_entries)):\n            list_entries[i][\"position_idx_range\"] += n_atoms\n            n_atoms += list_entries[i][\"position_idx_range\"].max()\n            n_molecules += list_entries[i][\"position_idx_range\"].shape[0]\n\n        for key in self.data_keys:\n            first = list_entries[0][key]\n            shape = (n_molecules, *first.shape[1:])\n            local_path = p_join(self.preprocess_path, f\"{key}.mmap\")\n            out = np.memmap(local_path, mode=\"w+\", dtype=first.dtype, shape=shape)\n\n            start = 0\n            for i in range(len(list_entries)):\n                x = list_entries[i].pop(key)\n                n = x.shape[0]\n                out[start : start + n] = x\n                out.flush()\n            push_remote(local_path, overwrite=True)\n\n        # save smiles and subset\n        tmp, n = dict(name=[]), len(list_entries)\n        local_path = p_join(self.preprocess_path, \"props.pkl\")\n        names = [list_entries[i].pop(\"name\") for i in range(n)]\n        f = lambda xs: [dm.to_inchikey(x) for x in xs]\n        res = dm.parallelized(f, names, n_jobs=-1, progress=False)\n        for x in res:\n            tmp[\"name\"] += x\n        for key in 
[\"subset\", \"n_atoms\"]:\n            tmp[key] = []\n            for i in range(n):\n                tmp[key] += list(list_entries[i].pop(key))\n        with open(local_path, \"wb\") as f:\n            pkl.dump(tmp, f)\n        push_remote(local_path, overwrite=True)\n
    "},{"location":"API/datasets/proteinfragments.html","title":"Protein Fragments","text":""},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.MDDataset","title":"MDDataset","text":"

    Bases: ProteinFragments

    MDDataset is a subset of the proteinfragments dataset that was generated from molecular dynamics with their model. The sampling was done with Molecular Dynamics at room temperature (300 K) in various solvent phases:

    Subsets

    Polyalanine: All polyalanine peptides are sampled in the gas phase. AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group and a protonated lysine residue at the C-terminus; AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group and a C-terminal N-methyl amide group.

    Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)

    Usage:

    from openqdc.datasets import MDDataset\ndataset = MDDataset()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class MDDataset(ProteinFragments):\n    \"\"\"\n    MDDataset is a subset of the proteinfragments dataset that\n    generated from the molecular dynamics with their model.\n    The sampling was done with Molecular Dynamics\n    at room temperature 300K in various solvent phase:\n\n    Subsets:\n        Polyalanine:\n            All the polyalanine are sampled in gas phase. AceAla15Lys is\n            a polyalanine peptides capped with an N-terminal acetyl group\n            and a protonated lysine residue at the C-terminus,\n            Acela15nme is polyalanine peptide capped with an N-terminal acetyl group\n            and a C-terminal N-methyl amide group\\n\n        Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)\n\n    Usage:\n    ```python\n    from openqdc.datasets import MDDataset\n    dataset = MDDataset()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"mddataset\"\n\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"acala15nme_folding_clusters\", \"crambin\", \"minimahopping_acala15lysh\", \"minimahopping_acala15nme\"]\n    }\n
    "},{"location":"API/datasets/proteinfragments.html#openqdc.datasets.potential.proteinfragments.ProteinFragments","title":"ProteinFragments","text":"

    Bases: BaseDataset

    ProteinFragments is a dataset whose data was generated from a top-down and a bottom-up approach:

    Top-down

    Fragments are generated by cutting out a spherical region around an atom (including solvent molecules) and saturating all dangling bonds. Sampling was done with the Molecular Dynamics (MD) method using a conventional force field (FF) at room temperature.

    Bottom-up

    Fragments are generated by constructing chemical graphs of one to eight nonhydrogen atoms. Sampling of multiple conformers per fragment was done with MD simulations at high temperatures or normal mode sampling.

    Usage:

    from openqdc.datasets import ProteinFragments\ndataset = ProteinFragments()\n

    References

    https://www.science.org/doi/10.1126/sciadv.adn4397

    Source code in openqdc/datasets/potential/proteinfragments.py
    class ProteinFragments(BaseDataset):\n    \"\"\"\n    ProteinFragments is a dataset constructed from a subset of the\n    the data was generated from a top-down and bottom-up approach:\n\n    Top-down:\n        Fragments are generated by cutting out a spherical\n        region around an atom (including solvent molecules)\n        and saturating all dangling bonds.\n        Sampling was done with the Molecular Dynamics (MD) method from\n        conventional FF at room temperature.\n\n    Bottom-up:\n        Fragments are generated by constructing chemical graphs\n        of one to eight nonhydrogen atoms.\n        Sampling of multiple conformers per fragments was done with\n        MD simulations at high temperatures or normal mode sampling.\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import ProteinFragments\n    dataset = ProteinFragments()\n    ```\n\n    References:\n        https://www.science.org/doi/10.1126/sciadv.adn4397\n    \"\"\"\n\n    __name__ = \"proteinfragments\"\n    # PBE0/def2-TZVPP+MBD\n    __energy_methods__ = [\n        PotentialMethod.PBE0_MBD_DEF2_TZVPP,\n    ]\n\n    energy_target_names = [\n        \"PBE0+MBD/def2-TZVPP\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\n        f\"{name}.db\": f\"https://zenodo.org/records/10720941/files/{name}.db?download=1\"\n        for name in [\"general_protein_fragments\"]\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"proteinfragments\")\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"proteinfragments\", links=self.__links__)\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        samples = []\n      
  for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.extend(read_db(raw_path))\n        return samples\n
    "},{"location":"API/datasets/qm1b.html","title":"QM1B","text":""},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B","title":"QM1B","text":"

    Bases: BaseDataset

    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit. Electronic properties for each conformation are then calculated using the density functional B3LYP and the basis set STO-3G.

    Usage:

    from openqdc.datasets import QM1B\ndataset = QM1B()\n

    References

    https://arxiv.org/pdf/2311.01135

    https://github.com/graphcore-research/qm1b-dataset/

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B(BaseDataset):\n    \"\"\"\n    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom\n    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are\n    subsampled from the Generated Data Bank (GDB). For each molecule, 1000 geometries are generated using RDKit.\n    Electronic properties for each conformation are then calculated using the density functional B3LYP\n    and the basis set STO-3G.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B\n    dataset = QM1B()\n    ```\n\n    References:\n        https://arxiv.org/pdf/2311.01135\\n\n        https://github.com/graphcore-research/qm1b-dataset/\n    \"\"\"\n\n    __name__ = \"qm1b\"\n\n    __energy_methods__ = [PotentialMethod.B3LYP_STO3G]\n    __force_methods__ = []\n\n    energy_target_names = [\"b3lyp/sto-3g\"]\n    force_target_names = []\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"ev/bohr\"\n    __links__ = {\n        \"qm1b_validation.parquet\": \"https://ndownloader.figshare.com/files/43005175\",\n        **{f\"part_{i:03d}.parquet\": f\"https://ndownloader.figshare.com/files/{FILE_NUM[i]}\" for i in range(0, 256)},\n    }\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qm1b\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    def read_raw_entries(self):\n        filenames = list(map(lambda x: p_join(self.root, f\"part_{x:03d}.parquet\"), list(range(0, 256)))) + [\n            p_join(self.root, \"qm1b_validation.parquet\")\n        ]\n\n        def read_entries_parallel(filename):\n            df = pd.read_parquet(filename)\n\n            def extract_parallel(df, i):\n                return extract_from_row(df.iloc[i])\n\n            fn = 
partial(extract_parallel, df)\n            list_of_idxs = list(range(len(df)))\n            results = dm.utils.parallelized(fn, list_of_idxs, scheduler=\"threads\", progress=False)\n            return results\n\n        list_of_list = dm.utils.parallelized(read_entries_parallel, filenames, scheduler=\"processes\", progress=True)\n\n        return [x for xs in list_of_list for x in xs]\n
    "},{"location":"API/datasets/qm1b.html#openqdc.datasets.potential.qm1b.QM1B_SMALL","title":"QM1B_SMALL","text":"

    Bases: QM1B

    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

    Usage:

    from openqdc.datasets import QM1B_SMALL\ndataset = QM1B_SMALL()\n

    Source code in openqdc/datasets/potential/qm1b.py
    class QM1B_SMALL(QM1B):\n    \"\"\"\n    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM1B_SMALL\n    dataset = QM1B_SMALL()\n    ```\n    \"\"\"\n\n    __name__ = \"qm1b_small\"\n
    "},{"location":"API/datasets/qm7x.html","title":"QM7X","text":""},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X","title":"QM7X","text":"

    Bases: BaseDataset

    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD) interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has energy values for each geometry computed with the PBE0-MBD and DFTB3-MBD methods.

    Usage:

    from openqdc.datasets import QM7X\ndataset = QM7X()\n

    References

    https://arxiv.org/abs/2006.15139

    https://zenodo.org/records/4288677

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X(BaseDataset):\n    \"\"\"\n    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with\n    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database. For generating conformations,\n    OpenBabel is utilized to get an initial structure using the MMFF94 force field. Using the initial structure, meta-\n    stable conformational isomers are generated using the Confab tool along with the MMFF94 force field. The structure\n    is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)\n    interactions. The lowest energy structure is then considered as the final equilibrium conformer. Additionally, non\n    -equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination of\n    normal mode coordinates computed at the DFTB3-MBD level within the harmonic approximation. The dataset has\n    energy values for each geometry computed at PBE0-MBD and DFTB3-MBD method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X\n    dataset = QM7X()\n    ```\n\n    References:\n        https://arxiv.org/abs/2006.15139\\n\n        https://zenodo.org/records/4288677\n    \"\"\"\n\n    __name__ = \"qm7x\"\n\n    __energy_methods__ = [PotentialMethod.PBE0_DEF2_TZVP, PotentialMethod.DFT3B]  # \"pbe0/def2-tzvp\", \"dft3b\"]\n\n    energy_target_names = [\"ePBE0+MBD\", \"eDFTB+MBD\"]\n\n    __force_mask__ = [True, False]\n\n    force_target_names = [\"pbe0FOR\"]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {f\"{i}000.xz\": f\"https://zenodo.org/record/4288677/files/{i}000.xz\" for i in range(1, 9)}\n\n    def read_raw_entries(self):\n        samples = []\n        for i in range(1, 9):\n            raw_path = p_join(self.root, f\"{i}000\")\n            data = load_hdf5_file(raw_path)\n            samples += [\n                
read_mol(data[k], k, self.energy_target_names, self.force_target_names) for k in tqdm(data.keys())\n            ]\n\n        return samples\n
    "},{"location":"API/datasets/qm7x.html#openqdc.datasets.potential.qm7x.QM7X_V2","title":"QM7X_V2","text":"

    Bases: QM7X

    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.

    Usage:

    from openqdc.datasets import QM7X_V2\ndataset = QM7X_V2()\n

    Source code in openqdc/datasets/potential/qm7x.py
    class QM7X_V2(QM7X):\n    \"\"\"\n    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7X_V2\n    dataset = QM7X_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qm7x_v2\"\n    __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]\n    __force_mask__ = QM7X.__force_mask__ + [False]\n    energy_target_names = QM7X.energy_target_names + [\"PM6\"]\n    force_target_names = QM7X.force_target_names\n
    "},{"location":"API/datasets/qmugs.html","title":"Qmugs","text":""},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs","title":"QMugs","text":"

    Bases: BaseDataset

    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB method. Using the optimized geometries, the atomic and molecular properties are calculated using both a semi-empirical method (GFN2-xTB) and a DFT method (\u03c9B97X-D/def2-SVP).

    Usage:

    from openqdc.datasets import QMugs\ndataset = QMugs()\n

    References

    https://arxiv.org/abs/2107.00367

    https://www.nature.com/articles/s41597-022-01390-7#ethics

    https://www.research-collection.ethz.ch/handle/20.500.11850/482129

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs(BaseDataset):\n    \"\"\"\n    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules\n    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB\n    method. Using the optimized geometry, the atomic and molecular properties are calculated using both, semi-empirical\n    method (GFN2-xTB) and DFT method (\u03c9B97X-D/def2-SVP).\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs\n    dataset = QMugs()\n    ```\n\n    References:\n        https://arxiv.org/abs/2107.00367\\n\n        https://www.nature.com/articles/s41597-022-01390-7#ethics\\n\n        https://www.research-collection.ethz.ch/handle/20.500.11850/482129\n    \"\"\"\n\n    __name__ = \"qmugs\"\n    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP]  # \"gfn2_xtb\", \"wb97x-d/def2-svp\"\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        \"summary.csv\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=summary.csv\",\n        \"structures.tar.gz\": \"https://libdrive.ethz.ch/index.php/s/X5vOBNSITAG5vzM/download?path=%2F&files=structures.tar.gz\",  # noqa\n    }\n\n    energy_target_names = [\n        \"GFN2:TOTAL_ENERGY\",\n        \"DFT:TOTAL_ENERGY\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"structures\")\n        mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)]\n\n        samples = dm.parallelized(read_mol, mol_dirs, n_jobs=-1, progress=True, scheduler=\"threads\")\n        return samples\n
    "},{"location":"API/datasets/qmugs.html#openqdc.datasets.potential.qmugs.QMugs_V2","title":"QMugs_V2","text":"

    Bases: QMugs

    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.

    Usage:

    from openqdc.datasets import QMugs_V2\ndataset = QMugs_V2()\n

    Source code in openqdc/datasets/potential/qmugs.py
    class QMugs_V2(QMugs):\n    \"\"\"\n    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 4.2M geometries.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QMugs_V2\n    dataset = QMugs_V2()\n    ```\n    \"\"\"\n\n    __name__ = \"qmugs_v2\"\n    __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]\n    energy_target_names = QMugs.energy_target_names + [\"PM6\"]\n    __force_mask__ = QMugs.__force_mask__ + [False]\n
    "},{"location":"API/datasets/qmx.html","title":"QMX","text":""},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7","title":"QM7","text":"

    Bases: QMX

    QM7 is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, H]

    Usage:

    from openqdc.datasets import QM7\ndataset = QM7()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7(QMX):\n    \"\"\"\n    QM7 is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7\n    dataset = QM7()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7.hdf5.gz\": \"https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1\"}\n    __name__ = \"qm7\"\n\n    energy_target_names = [\n        \"B2PLYP-D3(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3(BJ):def2-svp\",\n        \"B2PLYP-D3(BJ):def2-tzvp\",\n        \"B2PLYP-D3(BJ):sto-3g\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        
\"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        \"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"MP2:aug-cc-pvdz\",\n        \"MP2:aug-cc-pvtz\",\n        \"MP2:def2-svp\",\n        \"MP2:def2-tzvp\",\n        \"MP2:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM7b","title":"QM7b","text":"

    Bases: QMX

    QM7b is a dataset constructed from subsets of the GDB-13 database (stable and synthetically accessible organic molecules), containing up to seven \u201cheavy\u201d atoms. The molecular conformations are optimized using DFT at the PBE0/def2-TZVP level of theory.

    Chemical species

    [C, N, O, S, Cl, H]

    Usage:

    from openqdc.datasets import QM7b\ndataset = QM7b()\n

    References

    https://arxiv.org/pdf/1703.00564

    Source code in openqdc/datasets/potential/qmx.py
    class QM7b(QMX):\n    \"\"\"\n    QM7b is a dataset constructed from subsets of the GDB-13 database (\n    stable and synthetically accessible organic molecules),\n    containing up to seven \u201cheavy\u201d atoms.\n    The molecules conformation are optimized using DFT at the\n    PBE0/def2-TZVP level of theory.\n\n    Chemical species:\n        [C, N, O, S, Cl, H]\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM7b\n    dataset = QM7b()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1703.00564\n    \"\"\"\n\n    __links__ = {\"qm7b.hdf5.gz\": \"https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1\"}\n    __name__ = \"qm7b\"\n    energy_target_names = [\n        \"CCSD(T0):cc-pVDZ\",\n        \"HF:cc-pVDZ\",\n        \"HF:cc-pVTZ\",\n        \"MP2:cc-pVTZ\",\n        \"B2PLYP-D3:aug-cc-pvdz\",\n        \"B2PLYP-D3:aug-cc-pvtz\",\n        \"B2PLYP-D3:def2-svp\",\n        \"B2PLYP-D3:def2-tzvp\",\n        \"B2PLYP-D3:sto-3g\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvdz\",\n        \"B2PLYP-D3M(BJ):aug-cc-pvtz\",\n        \"B2PLYP-D3M(BJ):def2-svp\",\n        \"B2PLYP-D3M(BJ):def2-tzvp\",\n        \"B2PLYP-D3M(BJ):sto-3g\",\n        \"B2PLYP-D3M:aug-cc-pvdz\",\n        \"B2PLYP-D3M:aug-cc-pvtz\",\n        \"B2PLYP-D3M:def2-svp\",\n        \"B2PLYP-D3M:def2-tzvp\",\n        \"B2PLYP-D3M:sto-3g\",\n        \"B2PLYP:aug-cc-pvdz\",\n        \"B2PLYP:aug-cc-pvtz\",\n        \"B2PLYP:def2-svp\",\n        \"B2PLYP:def2-tzvp\",\n        \"B2PLYP:sto-3g\",\n        \"B3LYP-D3(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3(BJ):def2-svp\",\n        \"B3LYP-D3(BJ):def2-tzvp\",\n        \"B3LYP-D3(BJ):sto-3g\",\n        \"B3LYP-D3:aug-cc-pvdz\",\n        \"B3LYP-D3:aug-cc-pvtz\",\n        \"B3LYP-D3:def2-svp\",\n        \"B3LYP-D3:def2-tzvp\",\n        \"B3LYP-D3:sto-3g\",\n        \"B3LYP-D3M(BJ):aug-cc-pvdz\",\n        \"B3LYP-D3M(BJ):aug-cc-pvtz\",\n        \"B3LYP-D3M(BJ):def2-svp\",\n        
\"B3LYP-D3M(BJ):def2-tzvp\",\n        \"B3LYP-D3M(BJ):sto-3g\",\n        \"B3LYP-D3M:aug-cc-pvdz\",\n        \"B3LYP-D3M:aug-cc-pvtz\",\n        \"B3LYP-D3M:def2-svp\",\n        \"B3LYP-D3M:def2-tzvp\",\n        \"B3LYP-D3M:sto-3g\",\n        \"B3LYP:aug-cc-pvdz\",\n        \"B3LYP:aug-cc-pvtz\",\n        \"B3LYP:def2-svp\",\n        \"B3LYP:def2-tzvp\",\n        \"B3LYP:sto-3g\",\n        \"HF:aug-cc-pvdz\",\n        \"HF:aug-cc-pvtz\",\n        \"HF:cc-pvtz\",\n        \"HF:def2-svp\",\n        \"HF:def2-tzvp\",\n        \"HF:sto-3g\",\n        \"PBE0:aug-cc-pvdz\",\n        \"PBE0:aug-cc-pvtz\",\n        \"PBE0:def2-svp\",\n        \"PBE0:def2-tzvp\",\n        \"PBE0:sto-3g\",\n        \"PBE:aug-cc-pvdz\",\n        \"PBE:aug-cc-pvtz\",\n        \"PBE:def2-svp\",\n        \"PBE:def2-tzvp\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97M-V:aug-cc-pvdz\",\n        \"WB97M-V:aug-cc-pvtz\",\n        \"WB97M-V:def2-svp\",\n        \"WB97M-V:def2-tzvp\",\n        \"WB97M-V:sto-3g\",\n        \"WB97X-D:aug-cc-pvdz\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n        \"WB97X-D:sto-3g\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # \"wb97x/6-31g(d)\"]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM8","title":"QM8","text":"

    Bases: QMX

    QM8 is the subset of QM9 used in a study on modeling quantum mechanical calculations of electronic spectra and excited state energy (an increase in energy from the ground state) of small molecules with up to eight heavy atoms. Multiple methods were used, including time-dependent density functional theory (TDDFT) and second-order approximate coupled-cluster (CC2). The molecular conformations are relaxed geometries computed using DFT B3LYP with the 6-31G(2df,p) basis set. For more information about the sampling, see the QM9 dataset.

    Usage:

    from openqdc.datasets import QM8\ndataset = QM8()\n

    References

    https://arxiv.org/pdf/1504.01966

    Source code in openqdc/datasets/potential/qmx.py
    class QM8(QMX):\n    \"\"\"QM8 is the subset of QM9 used in a study on modeling quantum\n    mechanical calculations of electronic spectra and excited\n    state energy (a increase of energy from the ground states) of small molecules\n    up to eight heavy atoms.\n    Multiple methods were used, including\n    time-dependent density functional theories (TDDFT) and\n    second-order approximate coupled-cluster (CC2).\n    The molecules conformations are relaxed geometries computed using\n    the DFT B3LYP with basis set 6-31G(2df,p).\n    For more information about the sampling, check QM9 dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM8\n    dataset = QM8()\n    ```\n\n    References:\n        https://arxiv.org/pdf/1504.01966\n    \"\"\"\n\n    __name__ = \"qm8\"\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n\n    __links__ = {\n        \"qm8.csv\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv\",\n        \"qm8.tar.gz\": \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz\",\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"qm8.csv\"))\n        mols = dm.read_sdf(p_join(self.root, \"qm8.sdf\"), sanitize=False, remove_hs=False)\n        samples = []\n        for idx_row, mol in zip(df.iterrows(), mols):\n            _, row = idx_row\n            positions = mol.GetConformer().GetPositions()\n            x = get_atomic_number_and_charge(mol)\n            n_atoms = positions.shape[0]\n            samples.append(\n                dict(\n                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),\n                    name=np.array([row[\"smiles\"]]),\n        
            energies=np.array(\n                        [\n                            row[\n                                [\"E1-CC2\", \"E2-CC2\", \"E1-PBE0\", \"E2-PBE0\", \"E1-PBE0.1\", \"E2-PBE0.1\", \"E1-CAM\", \"E2-CAM\"]\n                            ].tolist()\n                        ],\n                        dtype=np.float64,\n                    ).reshape(1, -1),\n                    n_atoms=np.array([n_atoms], dtype=np.int32),\n                    subset=np.array([f\"{self.__name__}\"]),\n                )\n            )\n        return samples\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QM9","title":"QM9","text":"

    Bases: QMX

    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database, containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed by relaxing geometries with the quantum mechanical method B3LYP.

    Usage:

    from openqdc.datasets import QM9\ndataset = QM9()\n

    Reference

    https://www.nature.com/articles/sdata201422

    Source code in openqdc/datasets/potential/qmx.py
    class QM9(QMX):\n    \"\"\"\n    QM7b is a dataset constructed containing 134k molecules from subsets of the GDB-17 database,\n    containing up to 9 \u201cheavy\u201d atoms. All molecular properties are calculated at B3LUP/6-31G(2df,p)\n    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed\n    by relaxing geometries with quantum mechanical method B3LYP.\n\n    Usage:\n    ```python\n    from openqdc.datasets import QM9\n    dataset = QM9()\n    ```\n\n    Reference:\n        https://www.nature.com/articles/sdata201422\n    \"\"\"\n\n    __links__ = {\"qm9.hdf5.gz\": \"https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1\"}\n    __name__ = \"qm9\"\n    energy_target_names = [\n        \"Internal energy at 0 K\",\n        \"B3LYP:def2-svp\",\n        \"HF:cc-pvtz\",\n        \"HF:sto-3g\",\n        \"PBE:sto-3g\",\n        \"SVWN:sto-3g\",\n        \"WB97X-D:aug-cc-pvtz\",\n        \"WB97X-D:def2-svp\",\n        \"WB97X-D:def2-tzvp\",\n    ]\n\n    __energy_methods__ = [\n        PotentialMethod.NONE,  # \"wb97x/6-31g(d)\"\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n        PotentialMethod.NONE,\n    ]\n
    "},{"location":"API/datasets/qmx.html#openqdc.datasets.potential.qmx.QMX","title":"QMX","text":"

    Bases: ABC, BaseDataset

    QMX dataset base abstract class

    Source code in openqdc/datasets/potential/qmx.py
    class QMX(ABC, BaseDataset):\n    \"\"\"\n    QMX dataset base abstract class\n    \"\"\"\n\n    __name__ = \"qm9\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D,  # \"wb97x/6-31g(d)\"\n    ]\n\n    energy_target_names = [\n        \"\u03c9B97x:6-31G(d) Energy\",\n    ]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n    __links__ = {}\n\n    @property\n    def root(self):\n        return p_join(get_local_cache(), \"qmx\")\n\n    @property\n    def preprocess_path(self):\n        path = p_join(self.root, \"preprocessed\", self.__name__)\n        os.makedirs(path, exist_ok=True)\n        return path\n\n    @property\n    def config(self):\n        assert len(self.__links__) > 0, \"No links provided for fetching\"\n        return dict(dataset_name=\"qmx\", links=self.__links__)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, f\"{self.__name__}.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)\n        return samples\n
    "},{"location":"API/datasets/revmd17.html","title":"RevMD17","text":""},{"location":"API/datasets/revmd17.html#openqdc.datasets.potential.revmd17.RevMD17","title":"RevMD17","text":"

    Bases: BaseDataset

    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT integration grid. The dataset contains the following molecules: Benzene: 627000 samples

    Uracil: 133000 samples\n\nNaphthalene: 326000 samples\n\nAspirin: 211000 samples\n\nSalicylic Acid: 320000 samples\n\nMalonaldehyde: 993000 samples\n\nEthanol: 555000 samples\n\nToluene: 100000 samples\n

    Usage:

    from openqdc.datasets import RevMD17\ndataset = RevMD17()\n

    References

    https://arxiv.org/abs/2007.09593

    Source code in openqdc/datasets/potential/revmd17.py
    class RevMD17(BaseDataset):\n    \"\"\"\n    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original\n    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies\n    are computed at the PBE/def2-SVP level of theory using very tigh SCF convergence and very dense DFT integration\n    grid. The dataset contains the following molecules:\n        Benzene: 627000 samples\\n\n        Uracil: 133000 samples\\n\n        Naptalene: 326000 samples\\n\n        Aspirin: 211000 samples\\n\n        Salicylic Acid: 320000 samples\\n\n        Malonaldehyde: 993000 samples\\n\n        Ethanol: 555000 samples\\n\n        Toluene: 100000 samples\\n\n\n    Usage:\n    ```python\n    from openqdc.datasets import RevMD17\n    dataset = RevMD17()\n    ```\n\n    References:\n        https://arxiv.org/abs/2007.09593\n    \"\"\"\n\n    __name__ = \"revmd17\"\n\n    __energy_methods__ = [\n        PotentialMethod.PBE_DEF2_TZVP\n        # \"pbe/def2-tzvp\",\n    ]\n    __force_mask__ = [True]\n\n    energy_target_names = [\n        \"PBE-TS Energy\",\n    ]\n\n    __force_methods__ = [\n        \"pbe/def2-tzvp\",\n    ]\n\n    force_target_names = [\n        \"PBE-TS Gradient\",\n    ]\n    __links__ = {\"revmd17.zip\": \"https://figshare.com/ndownloader/articles/12672038/versions/3\"}\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    def read_raw_entries(self):\n        entries_list = []\n        decompress_tar_gz(p_join(self.root, \"rmd17.tar.bz2\"))\n        for trajectory in trajectories:\n            entries_list.append(read_npz_entry(trajectory, self.root))\n        return entries_list\n
    "},{"location":"API/datasets/sn2_rxn.html","title":"SN2 RXN","text":""},{"location":"API/datasets/sn2_rxn.html#openqdc.datasets.potential.sn2_rxn.SN2RXN","title":"SN2RXN","text":"

    Bases: BaseDataset

    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset contains 452,709 structures along with the energy, force and dipole moments.

    Usage:

    from openqdc.datasets import SN2RXN\ndataset = SN2RXN()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605341

    Source code in openqdc/datasets/potential/sn2_rxn.py
    class SN2RXN(BaseDataset):\n    \"\"\"\n    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X +  Y-, and\n    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by\n    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using Atomic Simulation Environment\n    (ASE). The forces are derived using semi-empirical method PM7 and the structures are saved every 10 steps, and\n    for each of them, energy and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory. The dataset\n    contains 452,709 structures along with the energy, force and dipole moments.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SN2RXN\n    dataset = SN2RXN()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605341\n    \"\"\"\n\n    __name__ = \"sn2_rxn\"\n\n    __energy_methods__ = [\n        PotentialMethod.DSD_BLYP_D3_BJ_DEF2_TZVP\n        # \"dsd-blyp-d3(bj)/def2-tzvp\",\n    ]\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"sn2_rxn.npz\": \"https://zenodo.org/records/2605341/files/sn2_reactions.npz\"}\n\n    energy_target_names = [\n        # TODO: We need to revalidate this to make sure that is not atomization energies.\n        \"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"DSD-BLYP-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"sn2_rxn.npz\")\n        data = np.load(raw_path)\n        samples = extract_npz_entry(data)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html","title":"Solvated Peptides","text":""},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides","title":"SolvatedPeptides","text":"

    Bases: BaseDataset

    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\" and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are run at 1000K with a time-step of 0.1fs using Atomic Simulation Environment (ASE). Structures are saved every 10 steps, where energies, forces and dipole moments are calculated at the revPBE-D3(BJ)/def2-TZVP level of theory.

    Usage:

    from openqdc.datasets import SolvatedPeptides\ndataset = SolvatedPeptides()\n

    References

    https://doi.org/10.1021/acs.jctc.9b00181

    https://zenodo.org/records/2605372

    Source code in openqdc/datasets/potential/solvated_peptides.py
    class SolvatedPeptides(BaseDataset):\n    \"\"\"\n    The solvated protein fragments dataset probes many-body intermolecular interactions between \"protein fragments\"\n    and water molecules. Geometries are first optimized with the semi-empirical method PM7 and then MD simulations are\n    run at 1000K with a time-step of 0.1fs using Atomic Simulations Environment (ASE). Structures are saved every 10\n    steps, where energies, forces and dipole moments are calculated at revPBE-D3(BJ)/def2-TZVP level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SolvatedPeptides\n    dataset = SolvatedPeptides()\n    ```\n\n    References:\n        https://doi.org/10.1021/acs.jctc.9b00181\\n\n        https://zenodo.org/records/2605372\n    \"\"\"\n\n    __name__ = \"solvated_peptides\"\n\n    __energy_methods__ = [\n        PotentialMethod.REVPBE_D3_BJ_DEF2_TZVP\n        # \"revpbe-d3(bj)/def2-tzvp\",\n    ]\n\n    energy_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Atomization Energy\",\n    ]\n\n    __force_mask__ = [True]\n\n    force_target_names = [\n        \"revPBE-D3(BJ):def2-TZVP Gradient\",\n    ]\n\n    # TO CHECK\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"solvated_peptides.hdf5.gz\": \"https://zenodo.org/record/3585804/files/213.hdf5.gz\"}\n\n    def __smiles_converter__(self, x):\n        \"\"\"util function to convert string to smiles: useful if the smiles is\n        encoded in a different format than its display format\n        \"\"\"\n        return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"solvated_peptides.h5.gz\")\n        samples = read_qc_archive_h5(raw_path, \"solvated_peptides\", self.energy_target_names, self.force_target_names)\n\n        return samples\n
    "},{"location":"API/datasets/solvated_peptides.html#openqdc.datasets.potential.solvated_peptides.SolvatedPeptides.__smiles_converter__","title":"__smiles_converter__(x)","text":"

    Utility function to convert a string to SMILES; useful when the SMILES is encoded in a format different from its display format.

    Source code in openqdc/datasets/potential/solvated_peptides.py
    def __smiles_converter__(self, x):\n    \"\"\"util function to convert string to smiles: useful if the smiles is\n    encoded in a different format than its display format\n    \"\"\"\n    return \"_\".join(x.decode(\"ascii\").split(\"_\")[:-1])\n
    "},{"location":"API/datasets/spice.html","title":"Spice","text":""},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.Spice","title":"Spice","text":"

    Bases: BaseDataset

    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit, and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate 100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

    Usage:

    from openqdc.datasets import Spice\ndataset = Spice()\n

    References

    https://arxiv.org/abs/2209.10702

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class Spice(BaseDataset):\n    \"\"\"\n    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of\n    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,\n    and then molecular dynamics simulations at 100ps and 500K using OpenMM and Amber force field are used to generate\n    100 high energy conformations. Low-energy conformations are then generated by L-BFGS energy minimization and\n    molecular dynamics at 1ps and 100K. Forces and energies for conformations are calculated at the\n    wB97M-D3(BJ)/def2-TZVPPD level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Spice\n    dataset = Spice()\n    ```\n\n    References:\n        https://arxiv.org/abs/2209.10702\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice\"\n    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]\n    __force_mask__ = [True]\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"bohr\"\n    __forces_unit__ = \"hartree/bohr\"\n\n    energy_target_names = [\"dft_total_energy\"]\n\n    force_target_names = [\"dft_total_gradient\"]\n\n    subset_mapping = {\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Dipeptides Single Points Dataset v1.2\": \"Dipeptides\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.0\": \"DES370K Dimers\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset 
v1.2\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.2\": \"PubChem\",\n        \"SPICE Ion Pairs Single Points Dataset v1.1\": \"Ion Pairs\",\n    }\n    __links__ = {\"SPICE-1.1.4.hdf5\": \"https://zenodo.org/record/8222043/files/SPICE-1.1.4.hdf5\"}\n\n    def convert_forces(self, x):\n        return (-1.0) * super().convert_forces(x)\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"SPICE-1.1.4.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        tmp = [read_record(data[mol_name], self) for mol_name in tqdm(data)]  # don't use parallelized here\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceV2","title":"SpiceV2","text":"

    Bases: Spice

    The SpiceV2 dataset augments the Spice data with amino acid complexes, water boxes, and PubChem solvated molecules. The main changes include: (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and (5) fixes for bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.

    Usage:

    from openqdc.datasets import SpiceV2\ndataset = SpiceV2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceV2(Spice):\n    \"\"\"\n    SpiceV2 dataset augments the Spice data with amino acids complexes, water boxes, pubchem solvated molecules.\n    The main changes include, (1) over 13,000 new PubChem molecules, out of which 1500 contain boron and 1900 contain\n    silicon, (2) 194,000 conformations of dimers containing amino acid and ligands, (3) 1000 water clusters to improve\n    sampling interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water molecules, and\n    (5) Fixing bad calculations from the Spice dataset. The data generation process is the same as the Spice dataset.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceV2\n    dataset = SpiceV2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spicev2\"\n\n    subset_mapping = {\n        \"SPICE Dipeptides Single Points Dataset v1.3\": \"Dipeptides\",\n        \"SPICE Solvated Amino Acids Single Points Dataset v1.1\": \"Solvated Amino Acids\",\n        \"SPICE Water Clusters v1.0\": \"Water Clusters\",\n        \"SPICE Solvated PubChem Set 1 v1.0\": \"Solvated PubChem\",\n        \"SPICE Amino Acid Ligand v1.0\": \"Amino Acid Ligand\",\n        \"SPICE PubChem Set 1 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 2 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 3 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 4 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 5 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 6 Single Points Dataset v1.3\": \"PubChem\",\n        \"SPICE PubChem Set 7 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 8 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem Set 9 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE PubChem 
Set 10 Single Points Dataset v1.0\": \"PubChemv2\",\n        \"SPICE DES Monomers Single Points Dataset v1.1\": \"DES370K Monomers\",\n        \"SPICE DES370K Single Points Dataset v1.0\": \"DES370K Dimers\",\n        \"SPICE DES370K Single Points Dataset Supplement v1.1\": \"DES370K Dimers\",\n        \"SPICE PubChem Boron Silicon v1.0\": \"PubChem Boron Silicon\",\n        \"SPICE Ion Pairs Single Points Dataset v1.2\": \"Ion Pairs\",\n    }\n    __links__ = {\"spice-2.0.0.hdf5\": \"https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"spice-2.0.0.hdf5\")\n\n        data = load_hdf5_file(raw_path)\n        # Entry 40132 without positions, skip it\n        # don't use parallelized here\n        tmp = [read_record(data[mol_name], self) for i, mol_name in enumerate(tqdm(data)) if i != 40132]\n\n        return tmp\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.SpiceVL2","title":"SpiceVL2","text":"

    Bases: SpiceV2

    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.

    Usage:

    from openqdc.datasets import SpiceVL2\ndataset = SpiceVL2()\n

    References

    https://github.com/openmm/spice-dataset/releases/tag/2.0.0

    https://github.com/openmm/spice-dataset

    Source code in openqdc/datasets/potential/spice.py
    class SpiceVL2(SpiceV2):\n    \"\"\"\n    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.\n\n    Usage:\n    ```python\n    from openqdc.datasets import SpiceVL2\n    dataset = SpiceVL2()\n    ```\n\n    References:\n        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\\n\n        https://github.com/openmm/spice-dataset\n    \"\"\"\n\n    __name__ = \"spice_vl2\"\n\n    __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]\n    energy_target_names = SpiceV2.energy_target_names + [\"GFN2,\" \"PM6\"]\n    __force_mask__ = SpiceV2.__force_mask__ + [False, False]\n
    "},{"location":"API/datasets/spice.html#openqdc.datasets.potential.spice.read_record","title":"read_record(r, obj)","text":"

    Read a record from an HDF5 file. Parameters: r is the HDF5 record; obj is the Spice class object used to grab the subset and names.

    Source code in openqdc/datasets/potential/spice.py
    def read_record(r, obj):\n    \"\"\"\n    Read record from hdf5 file.\n        r : hdf5 record\n        obj : Spice class object used to grab subset and names\n    \"\"\"\n    smiles = r[\"smiles\"].asstr()[0]\n    subset = r[\"subset\"][0].decode(\"utf-8\")\n    n_confs = r[\"conformations\"].shape[0]\n    x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))\n    positions = r[\"conformations\"][:]\n\n    res = dict(\n        name=np.array([smiles] * n_confs),\n        subset=np.array([obj.subset_mapping[subset]] * n_confs),\n        energies=r[obj.energy_target_names[0]][:][:, None].astype(np.float64),\n        forces=r[obj.force_target_names[0]][:].reshape(\n            -1, 3, 1\n        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method\n        atomic_inputs=np.concatenate(\n            (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32\n        ).reshape(-1, 5),\n        n_atoms=np.array([x.shape[0]] * n_confs, dtype=np.int32),\n    )\n\n    return res\n
    "},{"location":"API/datasets/splinter.html","title":"Splinter","text":""},{"location":"API/datasets/splinter.html#openqdc.datasets.interaction.splinter.Splinter","title":"Splinter","text":"

    Bases: BaseInteractionDataset

    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies and their various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.

    Usage:

    from openqdc.datasets import Splinter\ndataset = Splinter()\n

    Reference

    https://doi.org/10.1038/s41597-023-02443-1

    Source code in openqdc/datasets/interaction/splinter.py
    class Splinter(BaseInteractionDataset):\n    \"\"\"\n    Splinter consists of 30,416A dimer pairs with over 1.5 million geometries. The geometries are generated\n    by quantum mechanical optimization with B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies\n    and the various components are computed using SAPT0/qug-cc-pV(D=d)Z method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Splinter\n    dataset = Splinter()\n    ```\n\n    Reference:\n        https://doi.org/10.1038/s41597-023-02443-1\n    \"\"\"\n\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __name__ = \"splinter\"\n    __energy_methods__ = [\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_JUN_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        InteractionMethod.SAPT0_AUG_CC_PVDDZ,\n        # \"sapt0/jun-cc-pV(D+d)Z_unscaled\", #TODO: we need to pick the unscaled version only here\n        # \"sapt0/jun-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_unscaled\",\n        # 
\"sapt0/jun-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/jun-cc-pV(D+d)Z_disp_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_unscaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_es_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ex_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_ind_scaled\",\n        # \"sapt0/aug-cc-pV(D+d)Z_disp_scaled\",\n    ]\n\n    __energy_type__ = [\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n        InterEnergyType.TOTAL,\n        InterEnergyType.ES,\n        InterEnergyType.EX,\n        InterEnergyType.IND,\n        InterEnergyType.DISP,\n    ]\n    energy_target_names = []\n    __links__ = {\n        \"dimerpairs.0.tar.gz\": \"https://figshare.com/ndownloader/files/39449167\",\n        \"dimerpairs.1.tar.gz\": \"https://figshare.com/ndownloader/files/40271983\",\n        \"dimerpairs.2.tar.gz\": \"https://figshare.com/ndownloader/files/40271989\",\n        \"dimerpairs.3.tar.gz\": \"https://figshare.com/ndownloader/files/40272001\",\n        \"dimerpairs.4.tar.gz\": \"https://figshare.com/ndownloader/files/40272022\",\n        \"dimerpairs.5.tar.gz\": \"https://figshare.com/ndownloader/files/40552931\",\n        \"dimerpairs.6.tar.gz\": 
\"https://figshare.com/ndownloader/files/40272040\",\n        \"dimerpairs.7.tar.gz\": \"https://figshare.com/ndownloader/files/40272052\",\n        \"dimerpairs.8.tar.gz\": \"https://figshare.com/ndownloader/files/40272061\",\n        \"dimerpairs.9.tar.gz\": \"https://figshare.com/ndownloader/files/40272064\",\n        \"dimerpairs_nonstandard.tar.gz\": \"https://figshare.com/ndownloader/files/40272067\",\n        \"lig_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272070\",\n        \"lig_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272073\",\n        \"prot_interaction_sites.sdf\": \"https://figshare.com/ndownloader/files/40272076\",\n        \"prot_monomers.sdf\": \"https://figshare.com/ndownloader/files/40272079\",\n        \"merge_monomers.py\": \"https://figshare.com/ndownloader/files/41807682\",\n    }\n\n    def read_raw_entries(self) -> List[Dict]:\n        logger.info(f\"Reading Splinter interaction data from {self.root}\")\n        data = []\n        i = 0\n        with tqdm(total=1680022) as progress_bar:\n            for root, dirs, files in os.walk(self.root):  # total is currently an approximation\n                for filename in files:\n                    if not filename.endswith(\".xyz\"):\n                        continue\n                    i += 1\n                    filepath = os.path.join(root, filename)\n                    filein = open(filepath, \"r\")\n                    lines = list(map(lambda x: x.strip(), filein.readlines()))\n                    n_atoms = np.array([int(lines[0])], dtype=np.int32)\n                    metadata = lines[1].split(\",\")\n                    try:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            r,\n           
                 theta_P,\n                            tau_P,\n                            theta_L,\n                            tau_L,\n                            tau_PL,\n                        ) = metadata[0].split(\"_\")\n                        index, r, theta_P, tau_P, theta_L, tau_L, tau_PL = list(\n                            map(float, [index, r, theta_P, tau_P, theta_L, tau_L, tau_PL])\n                        )\n                    except ValueError:\n                        (\n                            protein_monomer_name,\n                            protein_interaction_site_type,\n                            ligand_monomer_name,\n                            ligand_interaction_site_type,\n                            index,\n                            _,\n                        ) = metadata[0].split(\"_\")\n                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6\n                    energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)\n                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)\n                    total_charge, charge0, charge1 = list(map(int, metadata[1:4]))\n                    lines = list(map(lambda x: x.split(), lines[2:]))\n                    pos = np.array(lines)[:, 1:].astype(np.float32)\n                    elems = np.array(lines)[:, 0]\n                    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)\n                    natoms0 = n_atoms_ptr[0]\n                    natoms1 = n_atoms[0] - natoms0\n                    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)\n                    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)\n                    subset = np.array([root.split(\"/\")[-1]])\n\n                    item = dict(\n                        energies=energies,\n                        subset=subset,\n               
         n_atoms=n_atoms,\n                        n_atoms_ptr=n_atoms_ptr,\n                        atomic_inputs=atomic_inputs,\n                        protein_monomer_name=np.array([protein_monomer_name]),\n                        protein_interaction_site_type=np.array([protein_interaction_site_type]),\n                        ligand_monomer_name=np.array([ligand_monomer_name]),\n                        ligand_interaction_site_type=np.array([ligand_interaction_site_type]),\n                        index=np.array([index], dtype=np.float32),\n                        r=np.array([r], dtype=np.float32),\n                        theta_P=np.array([theta_P], dtype=np.float32),\n                        tau_P=np.array([tau_P], dtype=np.float32),\n                        theta_L=np.array([theta_L], dtype=np.float32),\n                        tau_L=np.array([tau_L], dtype=np.float32),\n                        tau_PL=np.array([tau_PL], dtype=np.float32),\n                        name=np.array([protein_monomer_name + \".\" + ligand_monomer_name]),\n                    )\n                    data.append(item)\n                    progress_bar.update(1)\n        logger.info(f\"Processed {i} files in total\")\n        return data\n
    "},{"location":"API/datasets/tmqm.html","title":"TMQM","text":""},{"location":"API/datasets/tmqm.html#openqdc.datasets.potential.tmqm.TMQM","title":"TMQM","text":"

    Bases: BaseDataset

    The tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from the Cambridge Structural Database and then optimized in the gas phase with the extended tight-binding GFN2-xTB method.

    Usage:

    from openqdc.datasets import TMQM\ndataset = TMQM()\n

    References

    https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041

    https://github.com/bbskjelstad/tmqm

    Source code in openqdc/datasets/potential/tmqm.py
    class TMQM(BaseDataset):\n    \"\"\"\n    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of\n    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated\n    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from Cambridge Structure Database\n    and then optimized in gas phase with the extended tight-binding GFN2-xTB method.\n\n    Usage:\n    ```python\n    from openqdc.datasets import TMQM\n    dataset = TMQM()\n    ```\n\n    References:\n        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\\n\n        https://github.com/bbskjelstad/tmqm\n    \"\"\"\n\n    __name__ = \"tmqm\"\n\n    __energy_methods__ = [PotentialMethod.TPSSH_DEF2_TZVP]  # \"tpssh/def2-tzvp\"]\n\n    energy_target_names = [\"TPSSh/def2TZVP level\"]\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        x: f\"https://raw.githubusercontent.com/bbskjelstad/tmqm/master/data/{x}\"\n        for x in [\"tmQM_X1.xyz.gz\", \"tmQM_X2.xyz.gz\", \"tmQM_y.csv\", \"Benchmark2_TPSSh_Opt.xyz\"]\n    }\n\n    def read_raw_entries(self):\n        df = pd.read_csv(p_join(self.root, \"tmQM_y.csv\"), sep=\";\", usecols=[\"CSD_code\", \"Electronic_E\"])\n        e_map = dict(zip(df[\"CSD_code\"], df[\"Electronic_E\"]))\n        raw_fnames = [\"tmQM_X1.xyz\", \"tmQM_X2.xyz\", \"Benchmark2_TPSSh_Opt.xyz\"]\n        samples = []\n        for fname in raw_fnames:\n            data = read_xyz(p_join(self.root, fname), e_map)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/transition1x.html","title":"Transition1X","text":""},{"location":"API/datasets/transition1x.html#openqdc.datasets.potential.transition1x.Transition1X","title":"Transition1X","text":"

    Bases: BaseDataset

    The Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and the transition states are generated by running Nudged Elastic Band (NEB) calculations with DFT.

    Usage:

    from openqdc.datasets import Transition1X\ndataset = Transition1X()\n

    References: - https://www.nature.com/articles/s41597-022-01870-w

    Source code in openqdc/datasets/potential/transition1x.py
    class Transition1X(BaseDataset):\n    \"\"\"\n    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy\n    and force labels for 9.6 mio. conformers calculated at the wB97x/6-31-G(d) level of theory. The geometries and\n    the transition states are generated by running Nudged Elastic Band (NEB) with DFT.\n\n    Usage:\n    ```python\n    from openqdc.datasets import Transition1X\n    dataset = Transition1X()\n    ```\n\n    References:\n    - https://www.nature.com/articles/s41597-022-01870-w\\n\n    - https://gitlab.com/matschreiner/Transition1x\\n\n    \"\"\"\n\n    __name__ = \"transition1x\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_6_31G_D\n        # \"wb97x/6-31G(d)\",\n    ]\n\n    energy_target_names = [\n        \"wB97x_6-31G(d).energy\",\n    ]\n\n    __force_mask__ = [True]\n    force_target_names = [\n        \"wB97x_6-31G(d).forces\",\n    ]\n\n    __energy_unit__ = \"ev\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"ev/ang\"\n    __links__ = {\"Transition1x.h5\": \"https://figshare.com/ndownloader/files/36035789\"}\n\n    def read_raw_entries(self):\n        raw_path = p_join(self.root, \"Transition1x.h5\")\n        f = load_hdf5_file(raw_path)[\"data\"]\n\n        res = sum([read_record(f[g], group=g) for g in tqdm(f)], [])  # don't use parallelized here\n        return res\n
    "},{"location":"API/datasets/vqm24.html","title":"VQM24","text":""},{"location":"API/datasets/vqm24.html#openqdc.datasets.potential.vqm24.VQM24","title":"VQM24","text":"

    Bases: BaseDataset

    The Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical properties calculated at the wB97x-D3/cc-pVDZ level of theory. It comprises 258,242 unique constitutional isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB and relaxed with the DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with the same wB97x-D3/cc-pVDZ method.

    Usage:

    from openqdc.datasets import VQM24\ndataset = VQM24()\n

    Reference

    https://arxiv.org/abs/2405.05961

    Source code in openqdc/datasets/potential/vqm24.py
    class VQM24(BaseDataset):\n    \"\"\"\n    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical\n    properties calculated at wB97x-D3//cc-pVDZ level of theory. This leads to 258,242 unique constitutional\n    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB, and\n    relaxed with DFT method wB97x-D3/cc-pVDZ. The energy values are calculated with DFT method wB97x-D3/cc-pVDZ.\n\n    Usage:\n    ```python\n    from openqdc.datasets import VQM24\n    dataset = VQM24()\n    ```\n\n    Reference:\n        https://arxiv.org/abs/2405.05961\n    \"\"\"\n\n    __name__ = \"vqm24\"\n\n    __energy_methods__ = [\n        PotentialMethod.WB97X_D3_CC_PVDZ,  # \"wB97x-D3/cc-pVDZ.\"\n    ]\n\n    energy_target_names = [\n        \"wB97x-D3/cc-pVDZ\",\n    ]\n    # \u03c9B97X-D3/cc-pVDZ\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    __links__ = {\n        f\"{name}.npz\": f\"https://zenodo.org/records/11164951/files/{name}.npz?download=1\"\n        for name in [\"DFT_all\", \"DFT_saddles\", \"DFT_uniques\", \"DMC\"]\n    }\n\n    def read_raw_entries(self):\n        samples = []\n        for name in self.__links__:\n            raw_path = p_join(self.root, f\"{name}\")\n            samples.append(read_npz_entry(raw_path))\n        return samples\n
    "},{"location":"API/datasets/waterclusters.html","title":"SCAN Waterclusters","text":""},{"location":"API/datasets/waterclusters.html#openqdc.datasets.potential.waterclusters.SCANWaterClusters","title":"SCANWaterClusters","text":"

    Bases: BaseDataset

    The SCAN Water Clusters dataset contains conformations of neutral water clusters containing up to 20 monomers, charged water clusters, and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters: the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14 neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212. Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics simulations using AMBER 9 and then optimized; the lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.

    Chemical Species

    [H, O, Li, Na, K, F, Cl, Br]

    Usage:

    from openqdc.datasets import SCANWaterClusters\ndataset = SCANWaterClusters()\n

    References

    https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec

    https://github.com/esoteric-ephemera/water_cluster_density_errors

    Source code in openqdc/datasets/potential/waterclusters.py
    class SCANWaterClusters(BaseDataset):\n    \"\"\"\n    The SCAN Water Clusters dataset contains conformations of\n    neutral water clusters containing up to 20 monomers, charged water clusters,\n    and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters:\n    the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14\n    neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of\n    ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F\u2212, Cl\u2212, or Br\u2212.\n    Water clusters were obtained from  10 nanosecond gas-phase molecular dynamics\n    simulations using AMBER 9 and optimized to obtain\n    lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.\n\n\n    Chemical Species:\n        [H, O, Li, Na, K, F, Cl, Br]\n\n    Usage:\n    ```python\n    from openqdc.datasets import SCANWaterClusters\n    dataset = SCANWaterClusters()\n    ```\n\n    References:\n        https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\\n\n        https://github.com/esoteric-ephemera/water_cluster_density_errors\n    \"\"\"\n\n    __name__ = \"scanwaterclusters\"\n\n    __energy_unit__ = \"hartree\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"hartree/ang\"\n    energy_target_names = [\n        \"HF\",\n        \"HF-r2SCAN-DC4\",\n        \"SCAN\",\n        \"SCAN@HF\",\n        \"SCAN@r2SCAN50\",\n        \"r2SCAN\",\n        \"r2SCAN@HF\",\n        \"r2SCAN@r2SCAN50\",\n        \"r2SCAN50\",\n        \"r2SCAN100\",\n        \"r2SCAN10\",\n        \"r2SCAN20\",\n        \"r2SCAN25\",\n        \"r2SCAN30\",\n        \"r2SCAN40\",\n        \"r2SCAN60\",\n        \"r2SCAN70\",\n        \"r2SCAN80\",\n        \"r2SCAN90\",\n    ]\n    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]\n    force_target_names = []\n    # 27            # 9 
level\n    subsets = [\"BEGDB_H2O\", \"WATER27\", \"H2O_alkali_clusters\", \"H2O_halide_clusters\"]\n    __links__ = {\n        \"geometries.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True\",  # noqa\n        \"total_energies.json.gz\": \"https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True\",  # noqa\n    }\n\n    def read_raw_entries(self):\n        entries = []  # noqa\n        for i, subset in enumerate(self.subsets):\n            geometries = read_geometries(p_join(self.root, \"geometries.json.gz\"), subset)\n            energies = read_energies(p_join(self.root, \"total_energies.json.gz\"), subset)\n            datum = {}\n            for k in energies:\n                _ = energies[k].pop(\"metadata\")\n                datum[k] = energies[k][\"total_energies\"]\n            entries.extend(format_geometry_and_entries(geometries, datum, subset))\n        return entries\n
    "},{"location":"API/datasets/waterclusters3_30.html","title":"Waterclusters3_30","text":""},{"location":"API/datasets/waterclusters3_30.html#openqdc.datasets.potential.waterclusters3_30.WaterClusters","title":"WaterClusters","text":"

    Bases: BaseDataset

    The WaterClusters dataset contains putative minima and low energy networks for water clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. It contains approximately 4.5 million structures. Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.

    Chemical Species

    [\"H\", \"O\"]

    Usage:

    from openqdc.datasets import WaterClusters\ndataset = WaterClusters()\n

    References

    https://doi.org/10.1063/1.5128378

    https://sites.uw.edu/wdbase/database-of-water-clusters/

    Source code in openqdc/datasets/potential/waterclusters3_30.py
    class WaterClusters(BaseDataset):\n    \"\"\"\n    The WaterClusters dataset contains putative minima and low energy networks for water\n    clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with\n    the TTM2.1-F ab-initio based interaction potential for water.\n    It contains approximately 4.5 mil. structures.\n    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.\n\n    Chemical Species:\n        [\"H\", \"O\"]\n\n    Usage:\n    ```python\n    from openqdc.datasets import WaterClusters\n    dataset = WaterClusters()\n    ```\n\n    References:\n        https://doi.org/10.1063/1.5128378\\n\n        https://sites.uw.edu/wdbase/database-of-water-clusters/\\n\n    \"\"\"\n\n    __name__ = \"waterclusters3_30\"\n\n    # Energy in hartree, all zeros by default\n    atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32)\n    __energy_unit__ = \"kcal/mol\"\n    __distance_unit__ = \"ang\"\n    __forces_unit__ = \"kcal/mol/ang\"\n\n    __energy_methods__ = [PotentialMethod.TTM2_1_F]  # \"ttm2.1-f\"\n    energy_target_names = [\"TTM2.1-F Potential\"]\n    __links__ = {\"W3-W30_all_geoms_TTM2.1-F.zip\": \"https://drive.google.com/uc?id=18Y7OiZXSCTsHrQ83GCc4fyE_abbL6E_n\"}\n\n    def read_raw_entries(self):\n        samples = []\n        parent_folder = p_join(self.root, \"W3-W30_all_geoms_TTM2.1-F/\")\n        for i in range(3, 31):\n            name = f\"W{i}_geoms_all\"\n            zip_path = p_join(parent_folder, f\"{name}.zip\")\n            xyz_path = p_join(parent_folder, f\"{name}.xyz\")\n            with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n                zip_ref.extractall(parent_folder)\n\n            data = read_xyz(xyz_path, i)\n            samples += data\n\n        return samples\n
    "},{"location":"API/datasets/x40.html","title":"X40","text":""},{"location":"API/datasets/x40.html#openqdc.datasets.interaction.x40.X40","title":"X40","text":"

    Bases: YamlDataset

    X40 is an interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules in which the halogens participate in various interaction types such as electrostatic interactions, London dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions, and stacking of halogenated aromatic molecules. For each complex 10 geometries are generated, resulting in 400 geometries in the dataset. The geometries are optimized at the MP2 level of theory with the cc-pVTZ basis set, whereas the interaction energies are computed at the CCSD(T)/CBS level of theory.

    Usage:

    from openqdc.datasets import X40\ndataset = X40()\n

    Reference

    https://pubs.acs.org/doi/10.1021/ct300647k

    Source code in openqdc/datasets/interaction/x40.py
    class X40(YamlDataset):\n    \"\"\"\n    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules\n    where the halogens participate in various interaction types such as electrostatic interactions, london\n    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic\n    molecules. For each complex 10 geometries are generated resulting in 400 geometries in the dataset. The geometries\n    are optimized using the MP2 level of theory with cc-pVTZ basis set whereas the interaction energies are\n    computed with CCSD(T)/CBS level of theory.\n\n    Usage:\n    ```python\n    from openqdc.datasets import X40\n    dataset = X40()\n    ```\n\n    Reference:\n        https://pubs.acs.org/doi/10.1021/ct300647k\n    \"\"\"\n\n    __name__ = \"x40\"\n    __energy_methods__ = [\n        InteractionMethod.CCSD_T_CBS,  # \"CCSD(T)/CBS\",\n        InteractionMethod.MP2_CBS,  # \"MP2/CBS\",\n        InteractionMethod.DCCSDT_HA_DZ,  # \"dCCSD(T)/haDZ\",\n        InteractionMethod.DCCSDT_HA_TZ,  # \"dCCSD(T)/haTZ\",\n        InteractionMethod.MP2_5_CBS_ADZ,  # \"MP2.5/CBS(aDZ)\",\n    ]\n    __links__ = {\n        \"x40.yaml\": \"http://cuby4.molecular.cz/download_datasets/x40.yaml\",\n        \"geometries.tar.gz\": \"http://cuby4.molecular.cz/download_geometries/X40.tar\",\n    }\n\n    def _process_name(self, item):\n        return item.shortname\n\n    def get_n_atoms_ptr(self, item, root, filename):\n        xyz_path = p_join(root, f\"{filename}.xyz\")\n        with open(xyz_path, \"r\") as xyz_file:  # avoid not closing the file\n            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))\n            setup = lines.pop(1)\n            n_atoms_first = setup[0].split(\"-\")[1]\n            n_atoms_ptr = np.array([int(n_atoms_first)], dtype=np.int32)\n            return n_atoms_ptr\n
    "},{"location":"tutorials/usage.html","title":"OpenQDC Hands-on Tutorial","text":"In\u00a0[31]: Copied!
    from openqdc.datasets import Spice\nds = Spice(\n    energy_unit=\"kcal/mol\",\n    distance_unit=\"ang\",\n)\n
    from openqdc.datasets import Spice ds = Spice( energy_unit=\"kcal/mol\", distance_unit=\"ang\", )
    2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:381 - Reading preprocessed data.\n2024-02-29 12:17:13.349 | INFO     | openqdc.datasets.base:read_preprocess:382 - Dataset spice with the following units:\n                     Energy: hartree,\n                     Distance: bohr,\n                     Forces: hartree/bohr\n2024-02-29 12:17:13.978 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded atomic_inputs with shape (33175288, 5), dtype float32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded position_idx_range with shape (1110165, 2), dtype int32\n2024-02-29 12:17:13.979 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded energies with shape (1110165, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded forces with shape (33175288, 3, 1), dtype float32\n2024-02-29 12:17:13.980 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded name with shape (1110165,), dtype <U632\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded subset with shape (1110165,), dtype <U20\n2024-02-29 12:17:13.981 | INFO     | openqdc.datasets.base:read_preprocess:406 - Loaded n_atoms with shape (1110165,), dtype int32\n2024-02-29 12:17:13.983 | INFO     | openqdc.datasets.base:_precompute_statistics:154 - Loaded precomputed statistics\n2024-02-29 12:17:13.985 | INFO     | openqdc.datasets.base:_convert_data:141 - Converting spice data to the following units:\n                     Energy: kcal/mol,\n                     Distance: ang,\n                     Forces: kcal/mol/ang\n
    In\u00a0[39]: Copied!
    ds[0]\n
    ds[0] Out[39]:
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n        [ 0.06135919,  2.6528177 , -0.4163168 ],\n        [ 1.762424  ,  1.0939031 , -1.4321265 ],\n        [-0.22598556,  1.6802124 ,  0.5978407 ],\n        [ 1.1740401 , -0.04154727, -0.512898  ],\n        [-0.41957757, -0.24454471,  3.0900123 ],\n        [ 0.7238282 ,  0.52511275,  0.8248042 ],\n        [ 0.05533566, -0.6713925 ,  1.6488242 ],\n        [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n        [-0.0657557 ,  1.8550861 , -2.3939755 ],\n        [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n        [-0.8098082 ,  3.201651  , -0.6507186 ],\n        [ 0.792407  ,  3.368585  ,  0.01799216],\n        [ 2.558414  ,  1.5826052 , -0.9704587 ],\n        [ 2.166226  ,  0.64460325, -2.384977  ],\n        [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n        [-1.1792994 ,  1.1978384 ,  0.34465855],\n        [ 1.8563557 , -0.90775317, -0.5115611 ],\n        [ 0.31435642, -0.42179283, -1.0628686 ],\n        [ 0.42152542,  0.25200853,  3.627957  ],\n        [-0.5416419 , -1.1152233 ,  3.7040234 ],\n        [-1.1868238 ,  0.46580845,  3.0541756 ],\n        [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n        [-0.7720179 , -0.9603249 ,  0.994841  ],\n        [ 1.7518724 , -1.5571898 ,  2.560223  ],\n        [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n        [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32),\n 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n        1, 1, 1, 1, 1], dtype=int32),\n 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n        0, 0, 0, 0, 0], dtype=int32),\n 'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  
-312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526 
 ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[40]: Copied!
    ds.get_ase_atoms(0)\n
    ds.get_ase_atoms(0) Out[40]:
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)
    In\u00a0[53]: Copied!
    ds.get_ase_atoms(0).info\n
    ds.get_ase_atoms(0).info Out[53]:
    {'e0': array([[-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-23765.42563669],\n        [-33939.41501837],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ],\n        [  -312.9767089 ]]),\n 'energies': array([-232450.64], dtype=float32),\n 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]',\n 'subset': 'PubChem',\n 'forces': array([[[  2.1335483 ],\n         [-37.241825  ],\n         [ 22.830988  ]],\n \n        [[ 68.235725  ],\n         [ 59.30573   ],\n         [-27.672606  ]],\n \n        [[-34.137283  ],\n         [-30.504696  ],\n         [-33.670048  ]],\n \n        [[-49.57814   ],\n         [-75.2747    ],\n         [ 32.80194   ]],\n \n        [[  8.196513  ],\n         [ 17.132149  ],\n         [-36.84995   ]],\n \n        [[ 67.39872   ],\n         [ -8.923976  ],\n         [-20.772083  ]],\n \n        [[ 45.424217  ],\n         [-33.559574  ],\n         [ 20.30243   ]],\n \n        [[-13.522426  ],\n         [ 79.690094  ],\n         [ 15.531546  ]],\n \n        [[ 35.77895   ],\n         [  1.9324436 ],\n         [ -8.205132  ]],\n \n        [[ -3.3487453 ],\n         [ -7.991125  ],\n         [ -9.71156   ]],\n \n        [[  1.4049193 ],\n         [ 13.497365  ],\n         [ -5.981079  ]],\n \n        [[-21.196207  ],\n         [ 
16.861713  ],\n         [ -1.7730864 ]],\n \n        [[-10.805695  ],\n         [ -2.033095  ],\n         [ -4.2524548 ]],\n \n        [[ 35.204765  ],\n         [ 12.971134  ],\n         [ 22.815577  ]],\n \n        [[-11.87403   ],\n         [ 10.404548  ],\n         [ 23.009806  ]],\n \n        [[  2.3782759 ],\n         [ 19.309696  ],\n         [ 15.546526  ]],\n \n        [[ -2.5732849 ],\n         [ -4.098344  ],\n         [ -5.087256  ]],\n \n        [[  3.5987573 ],\n         [ 10.469024  ],\n         [  9.869113  ]],\n \n        [[ -8.646548  ],\n         [ -0.35554707],\n         [  1.7650104 ]],\n \n        [[ -6.6712875 ],\n         [ -0.7742697 ],\n         [-15.672442  ]],\n \n        [[-25.453985  ],\n         [ -9.350726  ],\n         [  6.0056353 ]],\n \n        [[-32.657543  ],\n         [ 10.617167  ],\n         [  2.516469  ]],\n \n        [[-23.541552  ],\n         [ -9.305013  ],\n         [ -9.855984  ]],\n \n        [[  2.8105662 ],\n         [-13.78966   ],\n         [ 10.141727  ]],\n \n        [[-29.951014  ],\n         [ -9.25683   ],\n         [-23.69946   ]],\n \n        [[ -3.412568  ],\n         [  4.13157   ],\n         [ 12.421117  ]],\n \n        [[  4.77353   ],\n         [-13.841051  ],\n         [  7.6428723 ]]], dtype=float32)}
    In\u00a0[41]: Copied!
    for i in ds.as_iter():\n    print(i)\n    break\n
    for i in ds.as_iter(): print(i) break
    {'positions': array([[ 0.71034044,  2.1993854 , -1.7317094 ],\n       [ 0.06135919,  2.6528177 , -0.4163168 ],\n       [ 1.762424  ,  1.0939031 , -1.4321265 ],\n       [-0.22598556,  1.6802124 ,  0.5978407 ],\n       [ 1.1740401 , -0.04154727, -0.512898  ],\n       [-0.41957757, -0.24454471,  3.0900123 ],\n       [ 0.7238282 ,  0.52511275,  0.8248042 ],\n       [ 0.05533566, -0.6713925 ,  1.6488242 ],\n       [ 0.9663853 , -1.8097109 ,  1.8863406 ],\n       [-0.0657557 ,  1.8550861 , -2.3939755 ],\n       [ 1.2260683 ,  3.0082219 , -2.2036319 ],\n       [-0.8098082 ,  3.201651  , -0.6507186 ],\n       [ 0.792407  ,  3.368585  ,  0.01799216],\n       [ 2.558414  ,  1.5826052 , -0.9704587 ],\n       [ 2.166226  ,  0.64460325, -2.384977  ],\n       [-0.4735094 ,  2.0926695 ,  1.5486747 ],\n       [-1.1792994 ,  1.1978384 ,  0.34465855],\n       [ 1.8563557 , -0.90775317, -0.5115611 ],\n       [ 0.31435642, -0.42179283, -1.0628686 ],\n       [ 0.42152542,  0.25200853,  3.627957  ],\n       [-0.5416419 , -1.1152233 ,  3.7040234 ],\n       [-1.1868238 ,  0.46580845,  3.0541756 ],\n       [ 1.6525911 ,  0.8830018 ,  1.3779446 ],\n       [-0.7720179 , -0.9603249 ,  0.994841  ],\n       [ 1.7518724 , -1.5571898 ,  2.560223  ],\n       [ 1.3855549 , -2.1521344 ,  1.0039169 ],\n       [ 0.38311973, -2.5341127 ,  2.2767966 ]], dtype=float32), 'atomic_numbers': array([6, 6, 6, 6, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n       1, 1, 1, 1, 1], dtype=int32), 'charges': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0], dtype=int32), 'e0': array([[-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-23765.42563669],\n       [-33939.41501837],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       
[  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ],\n       [  -312.9767089 ]]), 'energies': array([-232450.64], dtype=float32), 'name': '[H:10][C:1]1([C:2]([C:4]([C:7]([C:5]([C:3]1([H:14])[H:15])([H:18])[H:19])([H:23])[C@:8]([H:24])([C:6]([H:20])([H:21])[H:22])[N+:9]([H:25])([H:26])[H:27])([H:16])[H:17])([H:12])[H:13])[H:11]', 'subset': 'PubChem', 'forces': array([[[  2.1335483 ],\n        [-37.241825  ],\n        [ 22.830988  ]],\n\n       [[ 68.235725  ],\n        [ 59.30573   ],\n        [-27.672606  ]],\n\n       [[-34.137283  ],\n        [-30.504696  ],\n        [-33.670048  ]],\n\n       [[-49.57814   ],\n        [-75.2747    ],\n        [ 32.80194   ]],\n\n       [[  8.196513  ],\n        [ 17.132149  ],\n        [-36.84995   ]],\n\n       [[ 67.39872   ],\n        [ -8.923976  ],\n        [-20.772083  ]],\n\n       [[ 45.424217  ],\n        [-33.559574  ],\n        [ 20.30243   ]],\n\n       [[-13.522426  ],\n        [ 79.690094  ],\n        [ 15.531546  ]],\n\n       [[ 35.77895   ],\n        [  1.9324436 ],\n        [ -8.205132  ]],\n\n       [[ -3.3487453 ],\n        [ -7.991125  ],\n        [ -9.71156   ]],\n\n       [[  1.4049193 ],\n        [ 13.497365  ],\n        [ -5.981079  ]],\n\n       [[-21.196207  ],\n        [ 16.861713  ],\n        [ -1.7730864 ]],\n\n       [[-10.805695  ],\n        [ -2.033095  ],\n        [ -4.2524548 ]],\n\n       [[ 35.204765  ],\n        [ 12.971134  ],\n        [ 22.815577  ]],\n\n       [[-11.87403   ],\n        [ 10.404548  ],\n        [ 23.009806  ]],\n\n       [[  2.3782759 ],\n        [ 19.309696  ],\n        [ 15.546526  ]],\n\n       [[ -2.5732849 ],\n        [ -4.098344  ],\n        [ -5.087256  ]],\n\n       [[  3.5987573 ],\n        [ 10.469024  
],\n        [  9.869113  ]],\n\n       [[ -8.646548  ],\n        [ -0.35554707],\n        [  1.7650104 ]],\n\n       [[ -6.6712875 ],\n        [ -0.7742697 ],\n        [-15.672442  ]],\n\n       [[-25.453985  ],\n        [ -9.350726  ],\n        [  6.0056353 ]],\n\n       [[-32.657543  ],\n        [ 10.617167  ],\n        [  2.516469  ]],\n\n       [[-23.541552  ],\n        [ -9.305013  ],\n        [ -9.855984  ]],\n\n       [[  2.8105662 ],\n        [-13.78966   ],\n        [ 10.141727  ]],\n\n       [[-29.951014  ],\n        [ -9.25683   ],\n        [-23.69946   ]],\n\n       [[ -3.412568  ],\n        [  4.13157   ],\n        [ 12.421117  ]],\n\n       [[  4.77353   ],\n        [-13.841051  ],\n        [  7.6428723 ]]], dtype=float32)}\n
    In\u00a0[42]: Copied!
    for i in ds.as_iter(atoms=True):\n    print(i)\n    break\n
    for i in ds.as_iter(atoms=True): print(i) break
    Atoms(symbols='C8NH18', pbc=False, initial_charges=...)\n
    In\u00a0[43]: Copied!
    from openqdc.methods import QmMethod\n\n# Get the b3lyp/6-31g* method\nmethod = QmMethod.B3LYP_6_31G_D\nmethod.atom_energies_dict\n
    from openqdc.methods import QmMethod # Get the b3lyp/6-31g* method method = QmMethod.B3LYP_6_31G_D method.atom_energies_dict Out[43]:
    {('H', -1): -0.4618190740256503,\n ('H', 0): -0.5002733301377901,\n ('H', 1): 0.0,\n ('Li', 1): -7.284546111273075,\n ('B', -3): -23.577268753399462,\n ('B', -1): -24.614577395156598,\n ('B', 0): -24.65435524492553,\n ('B', 3): -22.018169862974275,\n ('C', -1): -37.844269871879376,\n ('C', 0): -37.84628033285479,\n ('C', 1): -37.42731164237431,\n ('N', -1): -54.52864356359092,\n ('N', 0): -54.584488815424095,\n ('N', 1): -54.0458621835885,\n ('O', -1): -75.05272792994404,\n ('O', 0): -75.06062109946738,\n ('O', 1): -74.54659271939704,\n ('F', -1): -99.75408410035712,\n ('F', 0): -99.71553471526475,\n ('Na', 1): -162.081235395777,\n ('Mg', 2): -199.22734695613283,\n ('Si', 4): -285.5564410277949,\n ('Si', 0): -289.3717359984153,\n ('Si', -4): -288.02795351148654,\n ('P', 0): -341.2580911838578,\n ('P', 1): -340.8765976669208,\n ('S', -1): -398.16568433994024,\n ('S', 0): -398.1049932797066,\n ('S', 1): -397.7199808615457,\n ('Cl', -2): -459.5066184980746,\n ('Cl', -1): -460.25223446009306,\n ('Cl', 0): -460.13624346967765,\n ('Cl', 2): -458.6740467177361,\n ('K', 1): -599.7247062673807,\n ('Ca', 2): -676.8667395990246,\n ('Br', -1): -2573.824201570383,\n ('Br', 0): -2573.705283744811,\n ('I', -1): None,\n ('I', 0): None}
    In\u00a0[44]: Copied!
    # Get the matrix of atomization energies for the b3lyp/6-31g* method\nmethod.atom_energies_matrix\n
    # Get the matrix of atomization energies for the b3lyp/6-31g* method method.atom_energies_matrix Out[44]:
    array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])
    In\u00a0[45]: Copied!
    import matplotlib.pyplot as plt \nfrom sklearn.decomposition import PCA\ndatum = ds.soap_descriptors(n_samples=500, progress=True)\nreducer = PCA()\nembedding = reducer.fit_transform(datum[\"soap\"])\n
    import matplotlib.pyplot as plt from sklearn.decomposition import PCA datum = ds.soap_descriptors(n_samples=500, progress=True) reducer = PCA() embedding = reducer.fit_transform(datum[\"soap\"])
    100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 500/500 [00:01<00:00, 459.21it/s]\n
    In\u00a0[46]: Copied!
    plt.scatter(\n    embedding[:, 0],\n    embedding[:, 1],\n    c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\nplt.colorbar()\n
    plt.scatter( embedding[:, 0], embedding[:, 1], c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]]) plt.colorbar() Out[46]:
    <matplotlib.colorbar.Colorbar at 0x1554aa7bd820>
    "},{"location":"tutorials/usage.html#openqdc-hands-on-tutorial","title":"OpenQDC Hands-on Tutorial\u00b6","text":""},{"location":"tutorials/usage.html#instantiate-and-go","title":"Instantiate and GO!\u00b6","text":"

    If you don't have the dataset downloaded, it will be downloaded automatically and cached. You just instantiate the class and you are ready to go. Change of units is done automatically upon loading based on the units of the dataset.

    Supported energy units: [\"kcal/mol\", \"kj/mol\", \"hartree\", \"ev\"]

    Supported distance units: [\"ang\", \"nm\", \"bohr\"]

    "},{"location":"tutorials/usage.html#items-from-the-dataset-object-class-are-obtained-through-the-get-method","title":"Items from the dataset object class are obtained through the \"get\" method.\u00b6","text":"

    The dictionary of the item contains different important keys:

    "},{"location":"tutorials/usage.html#alternatively-we-can-also-retrieve-the-data-from-the-dataset-object-class-as-aseatoms-using-the-get_ase_atoms","title":"Alternatively, we can also retrieve the data from the dataset object class as ase.Atoms using the get_ase_atoms!\u00b6","text":""},{"location":"tutorials/usage.html#iterators","title":"Iterators\u00b6","text":"

    The method as_iter(atoms=False) returns an iterator over the dataset. If atoms is True, the iterator returns the data as ase.Atoms objects. Otherwise, it returns the dictionary of the item.

    "},{"location":"tutorials/usage.html#isolated-atoms-energies-e0s","title":"Isolated atoms energies [e0s]\u00b6","text":"

    The potential energy of the system can be decomposed into the sum of isolated atom energies and the formation energy.

    $U(A_1, A_2, ...) = \\sum_{i=1}^N e_0(A_i) + e(A_1, A_2, ...)$

    The isolated atom energies are automatically associated with the correct level of theory, and you can access them as follows:

    "},{"location":"tutorials/usage.html#chemical-space-from-soap-descriptors","title":"Chemical space from SOAP descriptors\u00b6","text":"

    OpenQDC offers a simple way to calculate the Smooth Overlap of Atomic Positions (SOAP) descriptors for the molecules in the dataset. The method soap_descriptors returns the SOAP descriptors for the molecules in the dataset.

    "}]} \ No newline at end of file diff --git a/main/sitemap.xml b/main/sitemap.xml index 9076776..d211d1b 100644 --- a/main/sitemap.xml +++ b/main/sitemap.xml @@ -90,6 +90,11 @@ 2024-08-30 daily + + https://github.com/valence-labs/openQDC/main/API/datasets/3bpa.html + 2024-08-30 + daily + https://github.com/valence-labs/openQDC/main/API/datasets/alchemy.html 2024-08-30 @@ -130,6 +135,11 @@ 2024-08-30 daily + + https://github.com/valence-labs/openQDC/main/API/datasets/maceoff.html + 2024-08-30 + daily + https://github.com/valence-labs/openQDC/main/API/datasets/md22.html 2024-08-30 diff --git a/main/sitemap.xml.gz b/main/sitemap.xml.gz index 3f81275..c1330a6 100644 Binary files a/main/sitemap.xml.gz and b/main/sitemap.xml.gz differ diff --git a/main/tutorials/usage.html b/main/tutorials/usage.html index b1ede83..4bc3c46 100644 --- a/main/tutorials/usage.html +++ b/main/tutorials/usage.html @@ -1049,6 +1049,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1112,6 +1133,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +
  • diff --git a/main/usage.html b/main/usage.html index ed38163..2b17c27 100644 --- a/main/usage.html +++ b/main/usage.html @@ -1016,6 +1016,27 @@ +
  • + + + + + 3BPA + + + + +
  • + + + + + + + + + +
  • @@ -1079,6 +1100,27 @@ +
  • + + + + + MaceOFF + + + + +
  • + + + + + + + + + +