From 09c75a7e19275ca66ab2dc6f84023c5de18ad81f Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Mon, 29 Jul 2024 09:47:40 -0600
Subject: [PATCH 1/8] MaceOff dataset

---
 openqdc/__init__.py                    |  2 +
 openqdc/datasets/potential/__init__.py |  2 +
 openqdc/datasets/potential/maceoff.py  | 95 ++++++++++++++++++++++++++
 openqdc/methods/enums.py               |  2 -
 4 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 openqdc/datasets/potential/maceoff.py

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
index c6be72d4..051aeefb 100644
--- a/openqdc/__init__.py
+++ b/openqdc/__init__.py
@@ -31,6 +31,7 @@ def get_project_root():
     "COMP6": "openqdc.datasets.potential.comp6",
     "GDML": "openqdc.datasets.potential.gdml",
     "Molecule3D": "openqdc.datasets.potential.molecule3d",
+    "MACEOFF": "openqdc.datasets.potential.maceoff",
     "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
     "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
     "QM7X": "openqdc.datasets.potential.qm7x",
@@ -122,6 +123,7 @@ def __dir__():
     from .datasets.potential.gdml import GDML
     from .datasets.potential.geom import GEOM
     from .datasets.potential.iso_17 import ISO17
+    from .datasets.potential.maceoff import MACEOFF
     from .datasets.potential.md22 import MD22
     from .datasets.potential.molecule3d import Molecule3D
     from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2
diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index 35721dde..e0207b7f 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -5,6 +5,7 @@
 from .gdml import GDML
 from .geom import GEOM
 from .iso_17 import ISO17
+from .maceoff import MACEOFF
 from .md22 import MD22
 from .molecule3d import Molecule3D
 from .multixcqm9 import MultixcQM9, MultixcQM9_V2
@@ -38,6 +39,7 @@
     "GEOM": GEOM,
     "ISO17": ISO17,
     "Molecule3D": Molecule3D,
+    "MACEOFF": MACEOFF,
     "NablaDFT": NablaDFT,
     "OrbnetDenali": OrbnetDenali,
     "PCQM_B3LYP": PCQM_B3LYP,
diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py
new file mode 100644
index 00000000..c8fb06b0
--- /dev/null
+++ b/openqdc/datasets/potential/maceoff.py
@@ -0,0 +1,95 @@
+import re
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.constants import ATOMIC_NUMBERS
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+
+def parse_mace_xyz(xyzpath):
+    energy_re = re.compile(r"energy=(\S+)")
+    smiles_re = re.compile(r"smiles=(\S+)")
+    subset_re = re.compile(r"config_type=(\S+)")
+    with open(xyzpath, "r") as f:
+        n_atoms = None
+        counter = 0
+        positions = []
+        numbers = []
+        forces = []
+        energy = None
+        for line in f:
+            if n_atoms is None:
+                n_atoms = int(line)
+                positions = []
+                numbers = []
+                forces = []
+                energy = None
+                counter = 1
+                continue
+            if counter == 1:
+                props = line
+                energy = float(energy_re.search(props).group(1))
+                subset = subset_re.search(props).group(1)
+                try:
+                    smiles = smiles_re.search(props).group(1)
+                except AttributeError:  # water and qmugs subsets do not have smiles
+                    smiles = ""
+                counter = 2
+                continue
+            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
+            numbers.append(ATOMIC_NUMBERS[el])
+            positions.append([float(x), float(y), float(z)])
+            forces.append([float(fx), float(fy), float(fz)])
+            smiles = smiles.replace('"', "")
+            subset = subset.replace('"', "")
+            counter += 1
+            if counter == n_atoms + 2:
+                n_atoms = None
+                yield energy, numbers, positions, forces, smiles, subset
+
+
+def build_data_object(data):
+    energy, numbers, positions, forces, smiles, subset = data
+    if smiles == "":
+        x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1)
+    else:
+        x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
+    res = dict(
+        name=np.array([smiles]),
+        subset=np.array([subset]),
+        energies=np.array([[energy]], dtype=np.float64),
+        forces=np.array(forces, dtype=np.float32).reshape(
+            -1, 3, 1
+        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method
+        atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
+        n_atoms=np.array([x.shape[0]], dtype=np.int32),
+    )
+    return res
+
+
+class MACEOFF(BaseDataset):
+    __name__ = "maceoff"
+
+    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
+    __force_mask__ = [True]
+    __energy_unit__ = "ev"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
+
+    energy_target_names = ["dft_total_energy"]
+    force_target_names = ["dft_total_gradient"]
+
+    __links__ = {
+        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content"  # noqa: E501
+    }
+
+    def read_raw_entries(self):
+        filepath = p_join(self.root, "train_large_neut_no_bad_clean.xyz")
+        xyzpath = p_join(self.root, filepath)
+        structure_iterator = parse_mace_xyz(xyzpath)
+        res = dm.utils.parallelized(build_data_object, structure_iterator)
+        return res
diff --git a/openqdc/methods/enums.py b/openqdc/methods/enums.py
index 9dff4a15..a4b958ba 100644
--- a/openqdc/methods/enums.py
+++ b/openqdc/methods/enums.py
@@ -224,7 +224,6 @@ def atom_energies_dict(self):
         raise NotImplementedError()
 
 
-@unique
 class PotentialMethod(QmMethod):  # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1
     B1LYP_VWN5_DZP = Functional.B1LYP_VWN5, BasisSet.DZP
     B1LYP_VWN5_SZ = Functional.B1LYP_VWN5, BasisSet.SZ
@@ -501,7 +500,6 @@ def atom_energies_dict(self):
         return energies
 
 
-@unique
 class InteractionMethod(QmMethod):
     CCSD_T_NN = Functional.CCSDT, BasisSet.NN
     CCSD_T_CBS = Functional.CCSDT, BasisSet.CBS

From b16a410c9b51f0d1ec8cc42f5b93d336314e59a1 Mon Sep 17 00:00:00 2001
From: Hatem Helal <hatem@valencelabs.com>
Date: Fri, 2 Aug 2024 07:00:53 -0600
Subject: [PATCH 2/8] initial scaffolding for BPA dataset

---
 openqdc/__init__.py                    |  2 ++
 openqdc/datasets/potential/__init__.py |  2 ++
 openqdc/datasets/potential/bpa.py      | 26 ++++++++++++++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 openqdc/datasets/potential/bpa.py

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
index c6be72d4..63d5558b 100644
--- a/openqdc/__init__.py
+++ b/openqdc/__init__.py
@@ -21,6 +21,7 @@ def get_project_root():
     "ANI1CCX_V2": "openqdc.datasets.potential.ani",
     "ANI1X": "openqdc.datasets.potential.ani",
     "ANI2X": "openqdc.datasets.potential.ani",
+    "BPA": "openqdc.datasets.potential.bpa",
     "Spice": "openqdc.datasets.potential.spice",
     "SpiceV2": "openqdc.datasets.potential.spice",
     "SpiceVL2": "openqdc.datasets.potential.spice",
@@ -117,6 +118,7 @@ def __dir__():
     # POTENTIAL
     from .datasets.potential.alchemy import Alchemy
     from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
+    from .datasets.potential.bpa import BPA
     from .datasets.potential.comp6 import COMP6
     from .datasets.potential.dummy import Dummy, PredefinedDataset
     from .datasets.potential.gdml import GDML
diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index 35721dde..86671792 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -1,5 +1,6 @@
 from .alchemy import Alchemy
 from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
+from .bpa import BPA
 from .comp6 import COMP6
 from .dummy import Dummy, PredefinedDataset
 from .gdml import GDML
@@ -33,6 +34,7 @@
     "ANI1CCX_V2": ANI1CCX_V2,
     "ANI1X": ANI1X,
     "ANI2X": ANI2X,
+    "BPA": BPA,
     "COMP6": COMP6,
     "GDML": GDML,
     "GEOM": GEOM,
diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py
new file mode 100644
index 00000000..a6aa5df1
--- /dev/null
+++ b/openqdc/datasets/potential/bpa.py
@@ -0,0 +1,26 @@
+from openqdc import BaseDataset
+from openqdc.methods import PotentialMethod
+
+
+class BPA(BaseDataset):
+    """
+    _summary_
+
+
+    Usage:
+    ```python
+    from openqdc.datasets import BPA
+    dataset = BPA()
+    ```
+
+
+    References:
+        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647
+    """
+
+    __name__ = "BPA"
+    __energy_unit__ = "ev"
+    __forces_unit__ = "eV/ang"
+    __distance_unit__ = "ang"
+    __energy_methods__ = ([PotentialMethod.WB97X_6_31G_D],)
+    __links__ = {"BPA.zip": "https://pubs.acs.org/doi/suppl/10.1021/acs.jctc.1c00647/suppl_file/ct1c00647_si_002.zip"}

From b613fb018c6b1e0d98dbf03bedeced8a8da7f67a Mon Sep 17 00:00:00 2001
From: Hatem Helal <hatem@valencelabs.com>
Date: Fri, 2 Aug 2024 10:33:04 -0600
Subject: [PATCH 3/8] fix download and parsing

---
 openqdc/datasets/potential/bpa.py | 41 ++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py
index a6aa5df1..464c2759 100644
--- a/openqdc/datasets/potential/bpa.py
+++ b/openqdc/datasets/potential/bpa.py
@@ -1,7 +1,23 @@
+from typing import Any, Dict, List
+
+import numpy as np
+from ase.atoms import Atoms
+
 from openqdc import BaseDataset
 from openqdc.methods import PotentialMethod
 
 
+def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]:
+    return dict(
+        name=np.array([str(atoms.symbols)]),
+        subset=subset,
+        energies=np.array([atoms.get_potential_energy()], dtype=np.float32),
+        forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
+        atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
+        n_atoms=np.array([len(atoms)], dtype=np.int32),
+    )
+
+
 class BPA(BaseDataset):
     """
     _summary_
@@ -20,7 +36,26 @@ class BPA(BaseDataset):
 
     __name__ = "BPA"
     __energy_unit__ = "ev"
-    __forces_unit__ = "eV/ang"
+    __forces_unit__ = "ev/ang"
     __distance_unit__ = "ang"
-    __energy_methods__ = ([PotentialMethod.WB97X_6_31G_D],)
-    __links__ = {"BPA.zip": "https://pubs.acs.org/doi/suppl/10.1021/acs.jctc.1c00647/suppl_file/ct1c00647_si_002.zip"}
+    __force_mask__ = [True]
+    __energy_methods__ = (PotentialMethod.WB97X_6_31G_D,)
+    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}
+
+    def read_raw_entries(self) -> List[Dict]:
+        import os.path as osp
+        from glob import glob
+
+        from ase.io import iread
+
+        files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz"))
+        files = [f for f in files if "iso_atoms.xyz" not in f]
+        all_records = []
+
+        for file in files:
+            subset = np.array([osp.basename(file).split(".")[0]])
+
+            for atoms in iread(file, format="extxyz"):
+                all_records.append(read_bpa_record(subset, atoms))
+
+        return all_records

From d63cb552b1c12262effd6fc7e471efae52c6a5b6 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 2 Aug 2024 12:03:31 -0600
Subject: [PATCH 4/8] Splits in MACEOFF

---
 openqdc/datasets/potential/maceoff.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py
index c8fb06b0..2552970e 100644
--- a/openqdc/datasets/potential/maceoff.py
+++ b/openqdc/datasets/potential/maceoff.py
@@ -1,4 +1,5 @@
 import re
+from functools import partial
 from os.path import join as p_join
 
 import datamol as dm
@@ -52,7 +53,7 @@ def parse_mace_xyz(xyzpath):
                 yield energy, numbers, positions, forces, smiles, subset
 
 
-def build_data_object(data):
+def build_data_object(data, split):
     energy, numbers, positions, forces, smiles, subset = data
     if smiles == "":
         x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1)
@@ -67,6 +68,7 @@ def build_data_object(data):
         ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method
         atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
         n_atoms=np.array([x.shape[0]], dtype=np.int32),
+        split=np.array([split]),
     )
     return res
 
@@ -84,12 +86,22 @@ class MACEOFF(BaseDataset):
     force_target_names = ["dft_total_gradient"]
 
     __links__ = {
-        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content"  # noqa: E501
+        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
+        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
     }
 
     def read_raw_entries(self):
-        filepath = p_join(self.root, "train_large_neut_no_bad_clean.xyz")
-        xyzpath = p_join(self.root, filepath)
-        structure_iterator = parse_mace_xyz(xyzpath)
-        res = dm.utils.parallelized(build_data_object, structure_iterator)
-        return res
+        entries = []
+        for filename in self.__links__:
+            filename = filename.split(".")[0]
+            xyzpath = p_join(self.root, f"{filename}.xyz")
+            split = filename.split("_")[0]
+            structure_iterator = parse_mace_xyz(xyzpath)
+            func = partial(build_data_object, split=split)
+            entries.extend(dm.utils.parallelized(func, structure_iterator))
+        return entries
+
+    def __getitem__(self, idx):
+        data = super().__getitem__(idx)
+        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
+        return data

From 737b81e14f778214f905f4876bbf8c7c6b32c875 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 2 Aug 2024 12:51:14 -0600
Subject: [PATCH 5/8] WIP

---
 openqdc/datasets/potential/bpa.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py
index 464c2759..78a00ad5 100644
--- a/openqdc/datasets/potential/bpa.py
+++ b/openqdc/datasets/potential/bpa.py
@@ -11,10 +11,11 @@ def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]:
     return dict(
         name=np.array([str(atoms.symbols)]),
         subset=subset,
-        energies=np.array([atoms.get_potential_energy()], dtype=np.float32),
+        energies=np.array([atoms.get_potential_energy()], dtype=np.float64),
         forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
         atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
         n_atoms=np.array([len(atoms)], dtype=np.int32),
+        split = np.array([subset.item().split("_")[0]])
     )
 
 
@@ -59,3 +60,8 @@ def read_raw_entries(self) -> List[Dict]:
                 all_records.append(read_bpa_record(subset, atoms))
 
         return all_records
+    
+    def __getitem__(self, idx):
+        data = super().__getitem__(idx)
+        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
+        return data

From a1061a8f75264164d2b54a0c277da01eb92e209b Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 2 Aug 2024 13:02:48 -0600
Subject: [PATCH 6/8] MACEOFF docstrings

---
 openqdc/datasets/potential/maceoff.py | 32 ++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py
index 2552970e..ed184eb6 100644
--- a/openqdc/datasets/potential/maceoff.py
+++ b/openqdc/datasets/potential/maceoff.py
@@ -63,9 +63,7 @@ def build_data_object(data, split):
         name=np.array([smiles]),
         subset=np.array([subset]),
         energies=np.array([[energy]], dtype=np.float64),
-        forces=np.array(forces, dtype=np.float32).reshape(
-            -1, 3, 1
-        ),  # forces -ve of energy gradient but the -1.0 is done in the convert_forces method
+        forces=np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
         atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
         n_atoms=np.array([x.shape[0]], dtype=np.int32),
         split=np.array([split]),
@@ -74,6 +72,34 @@ def build_data_object(data, split):
 
 
 class MACEOFF(BaseDataset):
+    """
+    The core of the MACEOFF dataset consists of the Spice V1 dataset.
+    95% of the data are used for training and validation under the "train" split,
+    and 5% for testing. The dataset uses the Spice level of theory
+    ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.
+    MACEOFF uses a subset of SPICE that contains the ten chemical elements
+    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
+    MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular
+    non-bonded interactions, MACEOFF dataset contains larger 50–90 atom molecules
+    randomly selected from the QMugs dataset.
+    MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations
+    of liquid water, with sizes of up to 50 water molecules, as well as part of the
+    COMP6 tripeptide geometry dataset.
+
+    Usage:
+    ```python
+    from openqdc.datasets import MACEOFF
+    dataset = MACEOFF()
+    ```
+
+    Species:
+        [H, C, N, O, F, P, S, Cl, Br, I]
+
+    References:
+        https://arxiv.org/pdf/2312.15211\n
+        https://doi.org/10.17863/CAM.107498
+    """
+
     __name__ = "maceoff"
 
     __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]

From dab04efae3d6c10f91e9279704337a9001193988 Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Thu, 8 Aug 2024 14:08:19 -0600
Subject: [PATCH 7/8] Correct regex parsing + binary strings dec

---
 openqdc/datasets/potential/ani.py     | 7 ++-----
 openqdc/datasets/potential/maceoff.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
index aac35635..81f106f9 100644
--- a/openqdc/datasets/potential/ani.py
+++ b/openqdc/datasets/potential/ani.py
@@ -154,7 +154,7 @@ def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
 
     def __smiles_converter__(self, x):
-        return x
+        return "-".join(x.decode("ascii").split("-")[:-1])
 
 
 class ANI1CCX(ANI1):
@@ -195,10 +195,7 @@ class ANI1CCX(ANI1):
     __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}
 
     def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
-        return x
+        return x.decode("ascii")
 
 
 class ANI1CCX_V2(ANI1CCX):
diff --git a/openqdc/datasets/potential/maceoff.py b/openqdc/datasets/potential/maceoff.py
index ed184eb6..f90a3c4f 100644
--- a/openqdc/datasets/potential/maceoff.py
+++ b/openqdc/datasets/potential/maceoff.py
@@ -14,7 +14,7 @@
 def parse_mace_xyz(xyzpath):
     energy_re = re.compile(r"energy=(\S+)")
     smiles_re = re.compile(r"smiles=(\S+)")
-    subset_re = re.compile(r"config_type=(\S+)")
+    subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy")
     with open(xyzpath, "r") as f:
         n_atoms = None
         counter = 0

From 3aa2796feb146f9e249a18ca09b49e041c9938cc Mon Sep 17 00:00:00 2001
From: FNTwin <cristian@valencelabs.com>
Date: Fri, 30 Aug 2024 09:38:30 -0600
Subject: [PATCH 8/8] BPA docstrings

---
 openqdc/datasets/potential/bpa.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/openqdc/datasets/potential/bpa.py b/openqdc/datasets/potential/bpa.py
index 78a00ad5..16817105 100644
--- a/openqdc/datasets/potential/bpa.py
+++ b/openqdc/datasets/potential/bpa.py
@@ -15,14 +15,21 @@ def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]:
         forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
         atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
         n_atoms=np.array([len(atoms)], dtype=np.int32),
-        split = np.array([subset.item().split("_")[0]])
+        split=np.array([subset.item().split("_")[0]]),
     )
 
 
 class BPA(BaseDataset):
     """
-    _summary_
-
+    BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike
+    molecule 3-(benzyloxy)pyridin-2-amine. This dataset features
+    complex dihedral potential energy surface with many local minima,
+    which can be challenging to approximate using classical or ML force fields.
+    The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to
+    perturb the structures toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at
+    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.
+    The final configurations were re-evaluated using ORCA at the DFT level of
+    theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set.
 
     Usage:
     ```python
@@ -40,7 +47,7 @@ class BPA(BaseDataset):
     __forces_unit__ = "ev/ang"
     __distance_unit__ = "ang"
     __force_mask__ = [True]
-    __energy_methods__ = (PotentialMethod.WB97X_6_31G_D,)
+    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
     __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}
 
     def read_raw_entries(self) -> List[Dict]:
@@ -60,7 +67,7 @@ def read_raw_entries(self) -> List[Dict]:
                 all_records.append(read_bpa_record(subset, atoms))
 
         return all_records
-    
+
     def __getitem__(self, idx):
         data = super().__getitem__(idx)
         data.__setattr__("split", self._convert_array(self.data["split"][idx]))