Improved docs

valence-labs · Jul 24, 2024 · 1c1077d · 1c1077d
1 parent e5960e9
commit 1c1077d
Show file tree

Hide file tree

Showing 10 changed files with 175 additions and 84 deletions.
diff --git a/docs/API/e0_dispatcher.md b/docs/API/e0_dispatcher.md
@@ -0,0 +1 @@
+::: openqdc.datasets.energies
diff --git a/docs/API/properties.md b/docs/API/properties.md
@@ -0,0 +1,3 @@
+# Defined properties for datasets
+
+::: openqdc.datasets.properties
diff --git a/docs/API/statistics.md b/docs/API/statistics.md
@@ -0,0 +1 @@
+::: openqdc.datasets.statistics
diff --git a/docs/index.md b/docs/index.md
@@ -9,17 +9,17 @@ OpenQDC is a python library to work with quantum datasets. It's a package aimed
 - 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
 - 📈 Data: have access to 1.5+ billion datapoints
 
-Visit our website at TOFILL <IDK>.
+Visit our website at <https://openqdc.io>.
 
 ## Installation
 
 Use conda:
 
 ```bash
-mamba install -c conda-forge openqdc
+conda install -c conda-forge openqdc
 ```
 
-_**Tips:** You can replace `mamba` by `conda`._
+_**Tip:** You can replace `conda` with `mamba`._
 
 _**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._
 
@@ -58,7 +58,7 @@ dataset.calculate_descriptors(
 
 ## How to cite
 
-Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
+Please cite OpenQDC if you use it in your research (publication pending).
 
 ## Compatibilities
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -37,6 +37,14 @@ for data in dataset.as_iter(atoms=True):
     break
 ```
 
+or if you want to just iterate over the data:
+
+```python
+for data in dataset:
+    print(data) # dict of arrays
+    break
+```
+
 ## Lazy loading
 
 OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -25,7 +25,11 @@ nav:
   - API:
     - QM methods: API/methods.md
     - Normalization regressor: API/regressor.md
-    - Main class: API/basedataset.md
+    - Main classes:
+      - BaseDataset: API/basedataset.md
+      - Available Properties: API/properties.md
+      - e0 Dispatcher: API/e0_dispatcher.md
+      - Statistics: API/statistics.md
     - Format loading: API/formats.md
     - Datasets:
       - Potential Energy:

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
@@ -237,7 +237,13 @@ def force_methods(self):
         return list(compress(self.energy_methods, self.force_mask))
 
     @property
-    def e0s_dispatcher(self):
+    def e0s_dispatcher(self) -> AtomEnergies:
+        """
+        Property to get the object that dispatches the isolated atom energies of the QM methods.
+
+        Returns:
+            Object wrapping the isolated atom energies of the QM methods.
+        """
         if not hasattr(self, "_e0s_dispatcher"):
             # Automatically fetch/compute formation or regression energies
             self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)

diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from os.path import join as p_join
-from typing import Dict, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 from loguru import logger
@@ -14,22 +14,24 @@
 POSSIBLE_ENERGIES = ["formation", "regression", "null"]
 
 
-def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface":
+def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface":
     """
     Factory function that selects the correct
     energy class for the fetching/calculation
     of isolated atom energies.
 
-    Parameters
-    ----------
-    data : openqdc.datasets.Dataset
-        Dataset object that contains the information
-        about the isolated atom energies. Info will be passed
-        by references
-    kwargs : dict
-        Additional arguments that will be passed to the
-        selected energy class. Mostly used for regression
-        to pass the regressor_kwargs.
+    Parameters:
+        data : openqdc.datasets.Dataset
+            Dataset object that contains the information
+            about the isolated atom energies. Info will be passed
+            by reference
+        kwargs : dict
+            Additional arguments that will be passed to the
+            selected energy class. Mostly used for regression
+            to pass the regressor_kwargs.
+
+    Returns:
+        Initialized IsolatedEnergyInterface-like object
     """
     if data.energy_type == "formation":
         return PhysicalEnergy(data, **kwargs)
@@ -100,33 +102,26 @@ class AtomEnergies:
     """
 
     def __init__(self, data, **kwargs) -> None:
-        """
-        Parameters
-        ----------
-        data : openqdc.datasets.Dataset
-            Dataset object that contains the information
-            about the isolated atom energies. Info will be passed
-            by references
-        kwargs : dict
-            Additional arguments that will be passed to the
-            selected energy class. Mostly used for regression
-            to pass the regressor_kwargs.
-        """
-
         self.atom_energies = data.energy_type
         self.factory = dispatch_factory(data, **kwargs)
 
     @property
     def e0s_matrix(self) -> np.ndarray:
         """
-        Returns the isolated atom energies matrixes
+        Return the isolated atom energies matrix
+
+        Returns:
+            Matrix Array with the isolated atom energies
         """
         return self.factory.e0_matrix
 
     @property
     def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:
         """
         Return the isolated atom energies dictionary
+
+        Returns:
+            Dictionary with the isolated atom energies
         """
         return self.factory.e0_dict
 
@@ -142,10 +137,18 @@ def __getitem__(self, item: AtomSpecies) -> AtomEnergy:
         Item can be written as tuple(Symbol, charge),
         tuple(Chemical number, charge). If no charge is passed,
         it will be automatically set to 0.
+
         Examples:
-            AtomEnergies[6], AtomEnergies[6,1],
-            AtomEnergies["C",1], AtomEnergies[(6,1)]
+            AtomEnergies[6], AtomEnergies[6,1], \n
+            AtomEnergies["C",1], AtomEnergies[(6,1)], \n
             AtomEnergies[("C",1)]
+
+        Parameters:
+            item:
+                AtomSpecies object or tuple with the atom symbol and charge
+
+        Returns:
+            AtomEnergy object with the isolated atom energy
         """
         try:
             atom, charge = item[0], item[1]
@@ -168,16 +171,15 @@ class IsolatedEnergyInterface(ABC):
 
     def __init__(self, data, **kwargs):
         """
-        Parameters
-        ----------
-        data : openqdc.datasets.Dataset
-            Dataset object that contains the information
-            about the isolated atom energies. Info will be passed
-            by references
-        kwargs : dict
-            Additional arguments that will be passed to the
-            selected energy class. Mostly used for regression
-            to pass the regressor_kwargs.
+        Parameters:
+            data : openqdc.datasets.Dataset
+                Dataset object that contains the information
+                about the isolated atom energies. Info will be passed
+                by reference
+            kwargs : dict
+                Additional arguments that will be passed to the
+                selected energy class. Mostly used for regression
+                to pass the regressor_kwargs.
         """
         self._e0_matrixs = []
         self._e0_dict = None
@@ -204,13 +206,19 @@ def __len__(self):
     def e0_matrix(self) -> np.ndarray:
         """
         Return the isolated atom energies matrices
+
+        Returns:
+            Matrix Array with the isolated atom energies
         """
         return np.array(self._e0_matrixs)
 
     @property
     def e0_dict(self) -> Dict:
         """
         Return the isolated atom energies dict
+
+        Returns:
+            Dictionary with the isolated atom energies
         """
 
         return self._e0s_dict
@@ -276,11 +284,15 @@ def _post_init(self):
             self._set_lin_atom_species_dict(E0s, cov)
         self._set_linear_e0s()
 
-    def _compute_regression_e0s(self):
+    def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:
         """
         Try to compute the regressed isolated atom energies.
         Raise an error if the regression fails.
         Return the regressed isolated atom energies and the uncertainty values.
+
+        Returns:
+            Tuple with the regressed isolated atom energies and the uncertainty values of the regression
+            if available.
         """
         try:
             E0s, cov = self.regressor.solve()
@@ -305,7 +317,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None:
     def _set_linear_e0s(self) -> None:
         """
         Transform the e0s dictionary into the correct e0s
-        matrix format
+        matrix format.
         """
         new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]
         for z, e0 in self._e0s_dict.items():

diff --git a/openqdc/datasets/properties.py b/openqdc/datasets/properties.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 import numpy as np
 import pandas as pd
 
@@ -29,35 +31,62 @@ def _compute_average_nb_atoms(self):
         self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])
 
     @property
-    def average_n_atoms(self):
+    def average_n_atoms(self) -> int:
         """
         Average number of atoms in a molecule in the dataset.
+
+        Returns:
+            Average number of atoms in a molecule in the dataset.
         """
         if self.__average_nb_atoms__ is None:
             raise StatisticsNotAvailableError(self.__name__)
         return self.__average_nb_atoms__
 
     @property
-    def numbers(self):
+    def numbers(self) -> np.ndarray:
+        """
+        Unique atomic numbers in the dataset
+
+        Returns:
+            Array of the unique atomic numbers in the dataset
+        """
         if hasattr(self, "_numbers"):
             return self._numbers
         self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
         return self._numbers
 
     @property
-    def charges(self):
+    def charges(self) -> np.ndarray:
+        """
+        Unique charges in the dataset
+
+        Returns:
+            Array of the unique charges in the dataset
+        """
         if hasattr(self, "_charges"):
             return self._charges
         self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32)
         return self._charges
 
     @property
-    def min_max_charges(self):
+    def min_max_charges(self) -> Tuple[int, int]:
+        """
+        Minimum and maximum charges in the dataset
+
+        Returns:
+            (min_charge, max_charge)
+        """
         if hasattr(self, "_min_max_charges"):
             return self._min_max_charges
         self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])
         return self._min_max_charges
 
     @property
-    def chemical_species(self):
+    def chemical_species(self) -> np.ndarray:
+        """
+        Chemical symbols in the dataset
+
+        Returns:
+            Array of the chemical symbols in the dataset
+        """
         return np.array(ATOM_SYMBOLS)[self.numbers]