Skip to content

Commit

Permalink
Improved docs
Browse files Browse the repository at this point in the history
  • Loading branch information
FNTwin committed Jul 24, 2024
1 parent e5960e9 commit 1c1077d
Show file tree
Hide file tree
Showing 10 changed files with 175 additions and 84 deletions.
1 change: 1 addition & 0 deletions docs/API/e0_dispatcher.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: openqdc.datasets.energies
3 changes: 3 additions & 0 deletions docs/API/properties.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Defined properties for datasets

::: openqdc.datasets.properties
1 change: 1 addition & 0 deletions docs/API/statistics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: openqdc.datasets.statistics
8 changes: 4 additions & 4 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ OpenQDC is a python library to work with quantum datasets. It's a package aimed
- 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
- 📈 Data: have access to 1.5+ billion datapoints

Visit our website at TOFILL <IDK>.
Visit our website at https://openqdc.io .

## Installation

Use conda:

```bash
mamba install -c conda-forge openqdc
conda install -c conda-forge openqdc
```

_**Tips:** You can replace `mamba` by `conda`._
_**Tip:** You can replace `conda` with `mamba`._

_**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._

Expand Down Expand Up @@ -58,7 +58,7 @@ dataset.calculate_descriptors(

## How to cite

Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
Please cite OpenQDC if you use it in your research (publication pending).

## Compatibilities

Expand Down
8 changes: 8 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ for data in dataset.as_iter(atoms=True):
break
```

or if you want to just iterate over the data:

```python
for data in dataset:
print(data) # dict of arrays
break
```

## Lazy loading

OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
6 changes: 5 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ nav:
- API:
- QM methods: API/methods.md
- Normalization regressor: API/regressor.md
- Main class: API/basedataset.md
- Main classes:
- BaseDataset: API/basedataset.md
- Available Properties: API/properties.md
- e0 Dispatcher: API/e0_dispatcher.md
- Statistics: API/statistics.md
- Format loading: API/formats.md
- Datasets:
- Potential Energy:
Expand Down
8 changes: 7 additions & 1 deletion openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,13 @@ def force_methods(self):
return list(compress(self.energy_methods, self.force_mask))

@property
def e0s_dispatcher(self):
def e0s_dispatcher(self) -> AtomEnergies:
"""
Property to get the object that dispatches the isolated atom energies of the QM methods.
Returns:
Object wrapping the isolated atom energies of the QM methods.
"""
if not hasattr(self, "_e0s_dispatcher"):
# Automatically fetch/compute formation or regression energies
self._e0s_dispatcher = AtomEnergies(self, **self.regressor_kwargs)
Expand Down
92 changes: 52 additions & 40 deletions openqdc/datasets/energies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from os.path import join as p_join
from typing import Dict, Union
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
from loguru import logger
Expand All @@ -14,22 +14,24 @@
POSSIBLE_ENERGIES = ["formation", "regression", "null"]


def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface":
def dispatch_factory(data: Any, **kwargs: Dict) -> "IsolatedEnergyInterface":
"""
Factory function that selects the correct
energy class for fetching/calculating
the isolated atom energies.
Parameters
----------
data : openqdc.datasets.Dataset
Dataset object that contains the information
about the isolated atom energies. Info will be passed
by references
kwargs : dict
Additional arguments that will be passed to the
selected energy class. Mostly used for regression
to pass the regressor_kwargs.
Parameters:
data : openqdc.datasets.Dataset
Dataset object that contains the information
about the isolated atom energies. Info will be passed
by reference
kwargs : dict
Additional arguments that will be passed to the
selected energy class. Mostly used for regression
to pass the regressor_kwargs.
Returns:
Initialized IsolatedEnergyInterface-like object
"""
if data.energy_type == "formation":
return PhysicalEnergy(data, **kwargs)
Expand Down Expand Up @@ -100,33 +102,26 @@ class AtomEnergies:
"""

def __init__(self, data, **kwargs) -> None:
"""
Parameters
----------
data : openqdc.datasets.Dataset
Dataset object that contains the information
about the isolated atom energies. Info will be passed
by references
kwargs : dict
Additional arguments that will be passed to the
selected energy class. Mostly used for regression
to pass the regressor_kwargs.
"""

self.atom_energies = data.energy_type
self.factory = dispatch_factory(data, **kwargs)

@property
def e0s_matrix(self) -> np.ndarray:
"""
Returns the isolated atom energies matrixes
Return the isolated atom energies matrix
Returns:
Matrix Array with the isolated atom energies
"""
return self.factory.e0_matrix

@property
def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]:
"""
Return the isolated atom energies dictionary
Returns:
Dictionary with the isolated atom energies
"""
return self.factory.e0_dict

Expand All @@ -142,10 +137,18 @@ def __getitem__(self, item: AtomSpecies) -> AtomEnergy:
Item can be written as tuple(Symbol, charge),
tuple(Chemical number, charge). If no charge is passed,
it will be automatically set to 0.
Examples:
AtomEnergies[6], AtomEnergies[6,1],
AtomEnergies["C",1], AtomEnergies[(6,1)]
AtomEnergies[6], AtomEnergies[6,1], \n
AtomEnergies["C",1], AtomEnergies[(6,1)], \n
AtomEnergies[("C",1)]
Parameters:
item:
AtomSpecies object or tuple with the atom symbol and charge
Returns:
AtomEnergy object with the isolated atom energy
"""
try:
atom, charge = item[0], item[1]
Expand All @@ -168,16 +171,15 @@ class IsolatedEnergyInterface(ABC):

def __init__(self, data, **kwargs):
"""
Parameters
----------
data : openqdc.datasets.Dataset
Dataset object that contains the information
about the isolated atom energies. Info will be passed
by references
kwargs : dict
Additional arguments that will be passed to the
selected energy class. Mostly used for regression
to pass the regressor_kwargs.
Parameters:
data : openqdc.datasets.Dataset
Dataset object that contains the information
about the isolated atom energies. Info will be passed
by reference
kwargs : dict
Additional arguments that will be passed to the
selected energy class. Mostly used for regression
to pass the regressor_kwargs.
"""
self._e0_matrixs = []
self._e0_dict = None
Expand All @@ -204,13 +206,19 @@ def __len__(self):
def e0_matrix(self) -> np.ndarray:
"""
Return the isolated atom energies matrices
Returns:
Matrix Array with the isolated atom energies
"""
return np.array(self._e0_matrixs)

@property
def e0_dict(self) -> Dict:
"""
Return the isolated atom energies dict
Returns:
Dictionary with the isolated atom energies
"""

return self._e0s_dict
Expand Down Expand Up @@ -276,11 +284,15 @@ def _post_init(self):
self._set_lin_atom_species_dict(E0s, cov)
self._set_linear_e0s()

def _compute_regression_e0s(self):
def _compute_regression_e0s(self) -> Tuple[np.ndarray, Optional[np.ndarray]]:
"""
Try to compute the regressed isolated atom energies.
Raise an error if the regression fails.
Return the regressed isolated atom energies and the uncertainty values.
Returns:
Tuple with the regressed isolated atom energies and the uncertainty values of the regression
if available.
"""
try:
E0s, cov = self.regressor.solve()
Expand All @@ -305,7 +317,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None:
def _set_linear_e0s(self) -> None:
"""
Transform the e0s dictionary into the correct e0s
matrix format
matrix format.
"""
new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))]
for z, e0 in self._e0s_dict.items():
Expand Down
39 changes: 34 additions & 5 deletions openqdc/datasets/properties.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -29,35 +31,62 @@ def _compute_average_nb_atoms(self):
self.__average_nb_atoms__ = np.mean(self.data["n_atoms"])

@property
def average_n_atoms(self):
def average_n_atoms(self) -> int:
"""
Average number of atoms in a molecule in the dataset.
Returns:
Average number of atoms in a molecule in the dataset.
"""
if self.__average_nb_atoms__ is None:
raise StatisticsNotAvailableError(self.__name__)
return self.__average_nb_atoms__

@property
def numbers(self):
def numbers(self) -> np.ndarray:
"""
Unique atomic numbers in the dataset
Returns:
Array of the unique atomic numbers in the dataset
"""
if hasattr(self, "_numbers"):
return self._numbers
self._numbers = pd.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
return self._numbers

@property
def charges(self):
def charges(self) -> np.ndarray:
"""
Unique charges in the dataset
Returns:
Array of the unique charges in the dataset
"""
if hasattr(self, "_charges"):
return self._charges
self._charges = np.unique(self.data["atomic_inputs"][..., :2], axis=0).astype(np.int32)
return self._charges

@property
def min_max_charges(self):
def min_max_charges(self) -> Tuple[int, int]:
"""
Minimum and maximum charges in the dataset
Returns:
(min_charge, max_charge)
"""
if hasattr(self, "_min_max_charges"):
return self._min_max_charges
self._min_max_charges = np.min(self.charges[:, 1]), np.max(self.charges[:, 1])
return self._min_max_charges

@property
def chemical_species(self):
def chemical_species(self) -> np.ndarray:
"""
Chemical symbols in the dataset
Returns:
Array of the chemical symbols in the dataset
"""
return np.array(ATOM_SYMBOLS)[self.numbers]
Loading

0 comments on commit 1c1077d

Please sign in to comment.