From d551211e9e2e3f4a36bd7494f03e01d876842670 Mon Sep 17 00:00:00 2001
From: Pol Febrer <pfebrer96@gmail.com>
Date: Fri, 15 Mar 2024 21:53:00 +0100
Subject: [PATCH 1/5] Pre-computation of orbital values.

There is now the possibility to pre-compute orbital values to calculate the density.
This results in much faster calculations, although it uses more memory.
---
 src/sisl/CMakeLists.txt                       |   2 +-
 src/sisl/_core/geometry.py                    | 210 +++-
 src/sisl/_sparse_grid.py                      | 455 +++++++++
 src/sisl/_sparse_grid_ops.pyx                 | 904 ++++++++++++++++++
 src/sisl/physics/densitymatrix.py             | 109 ++-
 src/sisl/physics/tests/test_density_matrix.py |  51 +-
 src/sisl/tests/test_sparse_grid.py            | 312 ++++++
 7 files changed, 1983 insertions(+), 60 deletions(-)
 create mode 100644 src/sisl/_sparse_grid.py
 create mode 100644 src/sisl/_sparse_grid_ops.pyx
 create mode 100644 src/sisl/tests/test_sparse_grid.py

diff --git a/src/sisl/CMakeLists.txt b/src/sisl/CMakeLists.txt
index 271f69c4b3..55f75ca404 100644
--- a/src/sisl/CMakeLists.txt
+++ b/src/sisl/CMakeLists.txt
@@ -1,4 +1,4 @@
-foreach(source _indices _math_small)
+foreach(source _indices _math_small _sparse_grid_ops)
   add_cython_library(
     SOURCE ${source}.pyx
     LIBRARY ${source}
diff --git a/src/sisl/_core/geometry.py b/src/sisl/_core/geometry.py
index 02d8db6c37..9d76d3c831 100644
--- a/src/sisl/_core/geometry.py
+++ b/src/sisl/_core/geometry.py
@@ -32,6 +32,7 @@
     tile,
     unique,
 )
+from scipy.sparse import csr_array
 
 import sisl._array as _a
 from sisl._category import Category, GenericCategory
@@ -45,7 +46,7 @@
     list_index_le,
 )
 from sisl._internal import set_module
-from sisl._math_small import cross3, is_ascending
+from sisl._math_small import cross3, is_ascending, xyz_to_spherical_cos_phi
 from sisl._namedindex import NamedIndex
 from sisl.messages import SislError, deprecate_argument, deprecation, info, warn
 from sisl.shape import Cube, Shape, Sphere
@@ -3610,6 +3611,213 @@ def within_inf(
         # infinite supercell indices
         return self.asc2uc(idx), xyz, isc
 
+    def _orbital_values(self, grid_shape: Tuple[int, int, int]):
+        r"""Calculates orbital values for a given grid.
+
+        Parameters
+        ----------
+        grid_shape:
+           the grid shape (i.e. resolution) in which to calculate the orbital values.
+        """
+        # We need to import these here to avoid circular imports.
+        from sisl import Grid
+        from sisl._sparse_grid import SparseGridOrbitalBZ
+
+        # In the following we don't care about division
+        # So 1) save error state, 2) turn off divide by 0, 3) calculate, 4) turn on old error state
+        old_err = np.seterr(divide="ignore", invalid="ignore")
+
+        # Instead of looping all atoms in the supercell we find the exact atoms
+        # and their supercell indices.
+        add_R = _a.fulld(3, self.maxR())
+        # Calculate the required additional vectors required to increase the fictitious
+        # supercell by add_R in each direction.
+        # For extremely skewed lattices this will be way too much, hence we make
+        # them square.
+        o = self.lattice.to.Cuboid(True)
+        lattice = Lattice(o._v + np.diag(2 * add_R), origin=o.origin - add_R)
+
+        # Retrieve all atoms within the grid supercell
+        # (and the neighbours that connect into the cell)
+        IA, XYZ, ISC = self.within_inf(lattice, periodic=self.pbc)
+        XYZ -= self.lattice.origin.reshape(1, 3)
+
+        # within_inf translates atoms to the unit cell to compute
+        # supercell indices. Here we revert that
+        ISC -= np.floor(self.fxyz[IA]).astype(int32)
+
+        def xyz2spherical(xyz, offset):
+            """Calculate the spherical coordinates from indices"""
+            rx = xyz[:, 0] - offset[0]
+            ry = xyz[:, 1] - offset[1]
+            rz = xyz[:, 2] - offset[2]
+
+            xyz_to_spherical_cos_phi(rx, ry, rz)
+            return rx, ry, rz
+
+        def sphere_grid_index(grid, center, R):
+
+            corners = np.mgrid[-1:2:2, -1:2:2, -1:2:2].T * R + center
+            corners = corners.reshape(-1, 3)
+
+            corners_i = grid.index(corners)
+
+            ix = np.arange(
+                max(corners_i[:, 0].min(), 0),
+                min(corners_i[:, 0].max() + 1, grid.shape[0]),
+            )
+            iy = np.arange(
+                max(corners_i[:, 1].min(), 0),
+                min(corners_i[:, 1].max() + 1, grid.shape[1]),
+            )
+            iz = np.arange(
+                max(corners_i[:, 2].min(), 0),
+                min(corners_i[:, 2].max() + 1, grid.shape[2]),
+            )
+
+            indices = np.meshgrid(ix, iy, iz)
+            indices = np.array(indices).T.reshape(-1, 3)
+
+            return indices
+
+        # Get the size of the auxiliary supercell needed to store orbital values.
+        nsc = abs(ISC).max(axis=0) * 2 + 1
+        sp_grid_geom = self.copy()
+        sp_grid_geom.set_nsc(nsc)
+
+        # Initialize a fake grid to compute some quantities related to the grid distribution
+        grid = Grid(grid_shape, geometry=self)
+
+        # Estimate a top limit on how many values we need to store. We estimate it by expecting
+        # each orbital to fill a sphere of radius R, being R the radius of the orbital. We also
+        # add a margin of 1 voxel so that we don't underestimate because of rounding.
+        dvolume = grid.dvolume
+        margin_R = np.linalg.norm(grid.dcell.sum(axis=0))
+        vol = 0.0
+        for atom, indices in self.sub(IA).atoms.iter(species=True):
+            vol += (4 / 3 * np.pi * (atom.R + margin_R) ** 3).sum() * len(indices)
+
+        max_vals = int(vol / dvolume)
+
+        # Array storing all the grid values
+        grid_values = np.zeros(max_vals, dtype=np.float64)
+        # Orbital indices for each orbital that has a nonzero value in the grid.
+        orbital_indices = np.full(max_vals, -1, dtype=np.int32)
+        # For each value, its index of the grid. Even if the grid is 3 dimensional,
+        # we store the raveled index. That is, a single integer representing the position
+        # of the point. One can always unravel the index if needed.
+        grid_indices = np.zeros(max_vals, dtype=np.int32)
+
+        # print(
+        #     f"Estimated memory required:",
+        #     (orbital_indices.size * 32 + grid_values.size * 64 + grid_indices.size * 32) / 8 / 1024 / 1024,
+        #     "MB"
+        # )
+
+        # Temporal variables that will help us keep track of the construction of the arrays.
+        i_value = 0
+        first_orbs = self.firsto
+        isc_off = sp_grid_geom.isc_off
+
+        # Loop over all atoms in the grid-cell
+        for ia, ia_xyz, isc in zip(IA, XYZ, ISC):
+            # Get current atom
+            atom = self.atoms[ia]
+
+            # Get the index of the cell where this atom is in the auxiliary supercell
+            index_sc = isc_off[isc[0], isc[1], isc[2]]
+            # And use it to calculate the offset on the orbital index.
+            io_offset = self.no * index_sc
+
+            # Extract maximum R
+            R = atom.maxR()
+
+            if R <= 0.0:
+                warn(f"Atom '{atom}' does not have a wave-function, skipping atom.")
+                continue
+
+            idx = sphere_grid_index(grid, ia_xyz, R)
+
+            if len(idx) == 0:
+                continue
+
+            # Get real-space coordinates for the atom
+            grid_xyz = dot(idx, grid.dcell)
+            # Convert them to spherical coordinates
+            at_r, at_theta, at_cos_phi = xyz2spherical(grid_xyz, ia_xyz)
+
+            del grid_xyz
+            # Merge the three components of spherical coordinates into one array.
+            at_spherical = np.array([at_r, at_theta, at_cos_phi]).T
+
+            # Filter out points where the distance to the atom is less than its max R.
+            at_nonzero = at_spherical[:, 0] < R
+            idx = idx[at_nonzero]
+            at_spherical = at_spherical[at_nonzero]
+
+            if len(idx) == 0:
+                continue
+
+            # Ravel multi index to save space. That is, convert the 3D grid index
+            # into a single integer. One can always unravel them if needed.
+            idx = (
+                idx[:, 0] * grid.shape[1] * grid.shape[2]
+                + idx[:, 1] * grid.shape[2]
+                + idx[:, 2]
+            )
+
+            # Loop over the orbitals
+            for io, orb in enumerate(atom.orbitals):
+                # Get the index of this orbital
+                uc_io = first_orbs[ia] + io
+
+                orb_spherical = at_spherical
+                orb_indices = idx
+
+                # The orbital's R might not be the maximum R of the atom. In that case,
+                # we don't need to calculate the values for all the grid points that are within
+                # the atom's range.
+                if R - orb.R > 1e-6:
+                    # Check which coordinates are not within this orbital's range (the radius is bigger than orbital radius)
+                    orb_nonzero = orb_spherical[:, 0] < orb.R
+
+                    orb_spherical = orb_spherical[orb_nonzero]
+                    orb_indices = orb_indices[orb_nonzero]
+
+                # Number of grid values that we are going to compute for this orbital
+                orb_nvals = orb_spherical.shape[0]
+
+                # If there are no values to add, go to the next orbital
+                if orb_nvals == 0:
+                    continue
+
+                # Compute the psi values for the grid points we are interested in
+                psi = orb.psi_spher(*orb_spherical.T, cos_phi=True)
+
+                # Update the data structure
+                values_i = slice(i_value, i_value + orb_nvals)
+                grid_values[values_i] = psi
+                grid_indices[values_i] = orb_indices
+                orbital_indices[values_i] = uc_io + io_offset
+
+                # Update the index where new values should be stored
+                i_value += orb_nvals
+
+        # Reset the error code for division
+        np.seterr(**old_err)
+
+        # Cut the arrays to return only the parts that have been filled
+        grid_values = grid_values[:i_value]
+        grid_indices = grid_indices[:i_value]
+        orbital_indices = orbital_indices[:i_value]
+
+        psi_values = csr_array(
+            (grid_values, (grid_indices, orbital_indices)),
+            shape=(np.prod(grid.shape), sp_grid_geom.no_s),
+        )
+
+        return SparseGridOrbitalBZ(grid.shape, psi_values, geometry=sp_grid_geom)
+
     # Create pickling routines
     def __getstate__(self):
         """Returns the state of this object"""
diff --git a/src/sisl/_sparse_grid.py b/src/sisl/_sparse_grid.py
new file mode 100644
index 0000000000..bf349f569b
--- /dev/null
+++ b/src/sisl/_sparse_grid.py
@@ -0,0 +1,455 @@
+from typing import Optional, Tuple, Union
+
+import numpy as np
+from scipy.sparse import issparse, spmatrix
+
+from sisl import Grid, Lattice, SparseCSR
+from sisl.physics import Overlap
+
+from ._sparse_grid_ops import (
+    reduce_grid_matvec_multiply,
+    reduce_grid_matvecs_multiply,
+    reduce_grid_sum,
+    reduce_sc_products,
+    reduce_sc_products_multicoeffs,
+    reduce_sc_products_multicoeffs_sparse_denseindex,
+)
+from .physics._phase import phase_rsc
+
+
+class Sparse4DGrid:
+    """Stores information on a 3D grid with an extra sparse dimension.
+
+    The information is stored as a matrix where the rows are the raveled indices of the
+    grid and the columns represent the extra dimension. The sparsity of the matrix is
+    handled by the usual methods.
+    """
+
+    def __init__(self, grid_shape, *args, geometry=None, lattice=None, **kwargs):
+        self.grid_shape = tuple(grid_shape)
+
+        self.geometry = geometry
+        if geometry is not None:
+            self.lattice = geometry.lattice
+        else:
+            self.lattice = lattice
+
+        self._csr = SparseCSR(*args, **kwargs)
+
+    def _transposed_grid_indices(self, *axes):
+        """Function that returns the transposed raveled indices.
+
+        Might be useful to reorder rows in the CSR matrix that stores the grid.
+
+        Not used for now.
+        """
+        csr = self._csr
+
+        # Transform the indices
+        old_indices = np.arange(csr.shape[0], dtype=np.int32)
+        old_indices_g = old_indices.reshape(self.grid_shape).ravel()
+
+        new_indices_g = old_indices_g.transpose(*axes)
+        new_indices = new_indices_g.ravel()
+        new_shape = new_indices_g.shape
+
+        return new_indices, new_shape
+
+    def reduce_dimension(
+        self, weights: Optional[np.ndarray] = None, reduce_grid: Tuple[int, ...] = ()
+    ) -> Union[Grid, np.ndarray]:
+        """Reduces the extra dimension on the grid, possibly applying weights.
+
+        It can also reduce grid axes on the same go. See the `reduce_grid` parameter.
+
+        Parameters
+        ----------
+        weights:
+            Array of weights for the columns. The first dimension of this array must have length
+            equal to the extra dimension of this grid object.
+
+            The array might optionally have a second dimension,
+            which will also be reflected as an extra dimension in the output grid.
+        reduce_grid:
+            Axes that you wish to reduce on the grid. Along these axes, the grid values will be
+            accumulated. This is very useful to reduce memory footprint (and probably also
+            computation time) if your final goal is to reduce over the grid axes, because with
+            this approach the grid axes are reduced on the fly (instead of first constructing the
+            full grid and then reducing).
+
+            This might be the difference between being able to perform the calculation or not.
+
+        Returns
+        -------
+        Union[Grid, np.ndarray]
+            The reduced grid. If the weights array has more than one dimension, it will be a
+            numpy array. Otherwise it will be a sisl.Grid object.
+        """
+        # Assess if we are dealing with multiple weights
+        multi_weights = weights is not None and (
+            weights.ndim == 2 and weights.shape[1] != 1
+        )
+        if not multi_weights and weights is not None:
+            weights = weights.ravel()
+
+        # If we don't need to reduce the grid axes, we use scipy functionality
+        if len(reduce_grid) == 0:
+            csr = self._csr.tocsr()
+            if weights is None:
+                reduced = csr.sum(axis=1)
+            else:
+                # Perform matrix-vector multiplication to reduce the extra dimension.
+                reduced = csr @ weights
+
+            if multi_weights:
+                grid = reduced.reshape(*self.grid_shape, -1)
+            else:
+                # Create a new grid and set the result of the operation as the values
+                grid = Grid(
+                    self.grid_shape, geometry=self.geometry, dtype=reduced.dtype
+                )
+                del grid.grid
+                grid.grid = reduced.reshape(self.grid_shape)
+        # If we wish to reduce the axes, we use our own functionality that reduces the grid
+        # on the fly, because it is faster and uses much less memory than reducing a posteriori.
+        else:
+            csr = self._csr
+
+            # Find out the reduced shape, and the reduce factor. The reduced factor is the number
+            # by which we have to divide the index to get the reduced index.
+            reduced_shape = list(self.grid_shape)
+            reduce_factor = 1
+
+            if len(reduce_grid) > 3:
+                raise ValueError(
+                    f"There are only 3 grid axes, it is not possible to reduce axes {reduce_grid}"
+                )
+
+            # Check if we are reducing the last dimensions. Otherwise we will need to transpose the grid to
+            # put the dimension to reduce on the last axis. Note that we will not do that explicilty, transposing
+            # is done inside the cython routines (per index) to reduce memory footprint.
+            need_transpose = False
+            sorted_reduce_axes = np.sort(reduce_grid)
+            if len(reduce_grid) > 0:
+                need_transpose = sorted_reduce_axes[-1] != 2
+            if len(reduce_grid) > 1:
+                need_transpose = need_transpose or sorted_reduce_axes[-2] != 1
+            if len(reduce_grid) > 2:
+                need_transpose = need_transpose or sorted_reduce_axes[0] != 0
+
+            # Calculate the reducing factor and the new reduced shape.
+            for reduce_axis in sorted_reduce_axes:
+                reduce_factor *= self.grid_shape[reduce_axis]
+                reduced_shape[reduce_axis] = 1
+
+            # Prepare the quantities needed to transpose.
+            grid_shape = np.array(self.grid_shape, dtype=np.int32)
+            if need_transpose:
+                not_reduced_axes = np.sort(tuple(set([0, 1, 2]) - set(reduce_grid)))
+                new_axes_order = np.array(
+                    [*not_reduced_axes, *reduce_grid], dtype=np.int32
+                )
+            else:
+                # A dummy array to pass to cython functions so that they don't complain
+                new_axes_order = np.zeros(0, dtype=np.int32)
+
+            # Determine the datatype of the output
+            if weights is not None:
+                dtype = np.result_type(weights, csr.data)
+            else:
+                dtype = csr.dtype
+
+            # If we do not have multiple weights, we store it in a sisl Grid, otherwise
+            # we store it in a raw numpy array, since sisl.Grid doesn't support extra
+            # dimensions.
+            if not multi_weights:
+                grid = Grid(reduced_shape, geometry=self.geometry, dtype=dtype)
+                out = grid.grid.ravel()
+            else:
+                grid = np.zeros((*reduced_shape, weights.shape[1]), dtype=dtype)
+                out = grid.reshape(-1, weights.shape[1])
+
+            # Apply the corresponding function to do the operation that we need to do.
+            if weights is None:
+                reduce_grid_sum(
+                    csr.data[:, 0].astype(dtype),
+                    csr.ptr,
+                    csr.col,
+                    reduce_factor=int(reduce_factor),
+                    grid_shape=grid_shape,
+                    new_axes=new_axes_order,
+                    out=out,
+                )
+            elif multi_weights:
+                reduce_grid_matvecs_multiply(
+                    csr.data[:, 0].astype(dtype),
+                    csr.ptr,
+                    csr.col,
+                    weights.astype(dtype),
+                    reduce_factor=int(reduce_factor),
+                    grid_shape=grid_shape,
+                    new_axes=new_axes_order,
+                    out=out,
+                )
+            else:
+                reduce_grid_matvec_multiply(
+                    csr.data[:, 0].astype(dtype),
+                    csr.ptr,
+                    csr.col,
+                    weights.astype(dtype),
+                    reduce_factor=int(reduce_factor),
+                    grid_shape=grid_shape,
+                    new_axes=new_axes_order,
+                    out=out,
+                )
+
+        return grid
+
+
+class SparseGridOrbitalBZ(Sparse4DGrid):
+    """Stores information on a 3D grid with an extra orbital dimension, which is sparse."""
+
+    def get_overlap_matrix(self) -> Overlap:
+        """Computes the overlap matrix.
+
+        It does so by integrating the product between each pair of orbitals on the whole grid.
+        """
+
+        # Get the volume of each grid cell, because we will need to multiply
+        # the sum of the products by it.
+        dvolume = self.lattice.volume / np.prod(self.grid_shape)
+
+        # Get the grid values as a scipy matrix
+        csr = self._csr.tocsr()
+
+        # Compute the overlap matrix. It is a simple matrix multiplication.
+        all_overlap = csr.T @ csr
+
+        # We would be done here, if it wasn't because the matrix multiplication
+        # returns a square matrix. We need to acumulate all the extra rows into
+        # the first set of rows (unit cell). For example, the overlap between
+        # supercell [0, 0, 1] and supercell [0, 0, 1] is an overlap within the same
+        # cell. We should add it to the unit cell overlap.
+        # In practice, what we do is to take the rows of the matrix that correspond
+        # to a supercell, shift the columns, and add them to the unit cell rows.
+
+        # Number of orbitals in the unit cell
+        no = self.geometry.no
+
+        # Part of the overlap matrix where we will fold everything. Shape (no, no * n_supercells)
+        overlap = all_overlap[:no]
+        # Compute number of supercells
+        n_s = overlap.shape[-1] // no
+
+        # Then loop over supercell rows, shifting them and folding into the main matrix,
+        # as explained above.
+        for isc_row in range(1, n_s):
+            next_rows = all_overlap[isc_row * no : (isc_row + 1) * no]
+            for isc_col in range(n_s):
+                start_col = (isc_col - isc_row) * no
+                if start_col < 0:
+                    start_col = overlap.shape[-1] + start_col
+
+                overlap[:, start_col : start_col + no] += next_rows[
+                    :, isc_col * no : (isc_col + 1) * no
+                ]
+
+        # Multiply by the volume of each grid cell and return an Overlap object.
+        return Overlap.fromsp(self.geometry, overlap * dvolume)
+
+    def reduce_orbitals(
+        self, weights: np.ndarray, k: Tuple[float, float, float] = (0, 0, 0), **kwargs
+    ) -> Union[Grid, np.ndarray]:
+        """Reduces the orbitals dimension, applying phases if needed.
+
+        Parameters
+        ----------
+        weights:
+            Array of weights for the orbitals. The first dimension of this array must have length
+            equal to the number of orbitals in the unit cell.
+
+            The array might optionally have a second dimension,
+            which will also be reflected as an extra dimension in the output grid.
+        k:
+            The k point for which the values are to be projected. This will be used to compute a
+            phase for the contributions of orbitals outside the unit cell.
+        **kwargs:
+            passed directly to reduce_dimension
+
+        Returns
+        -------
+        Union[Grid, np.ndarray]
+            The reduced grid. If the weights array has more than one dimension, it will be a
+            numpy array. Otherwise it will be a sisl.Grid object.
+        """
+        # Assess if we are dealing with multiple weights
+        multi_weights = weights.ndim == 2 and weights.shape[1] != 1
+        if not multi_weights:
+            weights = weights.reshape(-1, 1)
+
+        if weights.shape[0] == self.geometry.no:
+            # We should expand the coefficients to the whole supercell, applying phases.
+            k = np.array(k, dtype=np.float64)
+
+            if np.any(k > 1e-5):
+                # Compute phases
+                phases = phase_rsc(self.geometry.lattice, k, dtype=np.complex128)
+                # Expand the coefficients array.
+                weights = (
+                    phases.reshape(-1, 1)
+                    .dot(weights.reshape(1, -1))
+                    .reshape(-1, weights.shape[1])
+                )
+            else:
+                # At gamma we just need to tile the coefficients array
+                n_s = self.geometry.n_s
+                weights = (
+                    np.tile(weights.T, n_s)
+                    .reshape(weights.shape[1], weights.shape[0] * n_s)
+                    .T
+                )
+
+        elif weights.shape[0] != self.geometry.no_s:
+            raise ValueError(
+                f"The coefficients array must be of length no ({self.geometry.no}) or no_s ({self.geometry.no_s}). It is of length {weights.shape[0]}"
+            )
+
+        return self.reduce_dimension(weights, **kwargs)
+
+    def reduce_orbital_products(
+        self,
+        weights: Union[np.ndarray, SparseCSR, spmatrix],
+        weights_lattice: Lattice,
+        reduce_grid: Tuple[int, ...] = (),
+        out: Optional[np.ndarray] = None,
+    ) -> np.ndarray:
+        """Reduces the orbital dimension by taking products of orbitals with weights.
+
+        Parameters
+        ----------
+        weights:
+            The coefficient to apply to each orbital product. The first two dimensions of this
+            array must be ``(no, no_s)``. Therefore each entry ``weights[i, j]`` is the coefficient
+            to apply to the product of orbitals ``i`` and ``j``.
+
+            The array might optionally have a third dimension, which will also be reflected as an
+            extra dimension in the output grid. This is useful to compute multiple quantities at
+            the same time, because orbital products are only computed once.
+
+            The ``weights`` array be either a dense or sparse array.
+        weights_lattice:
+            The lattice associated to the weights array. It might be different from the one
+            associated to the ``SparseGridOrbitalBZ`` object.
+        reduce_grid:
+            Axes that you wish to reduce on the grid. Along these axes, the grid values will be
+            accumulated. This is very useful to reduce memory footprint (and probably also
+            computation time) if your final goal is to reduce over the grid axes, because with
+            this approach the grid axes are reduced on the fly (instead of first constructing the
+            full grid and then reducing).
+
+            This might be the difference between being able to perform the calculation or not.
+        out:
+            If you wish to store the result in a preallocated array, you can pass it here.
+            It is your responsibility to ensure that the array has the correct shape and dtype.
+
+        Returns
+        -------
+        np.ndarray
+            The output grid. If ``out`` is not ``None``, it will be the same as ``out``. Otherwise
+            it will be a newly created array.
+        """
+        csr = self._csr
+
+        # Find out the reduced shape, and the reduce factor. The reduced factor is the number
+        # by which we have to divide the index to get the reduced index.
+        reduced_shape = list(self.grid_shape)
+        reduce_factor = 1
+
+        if len(reduce_grid) > 3:
+            raise ValueError(
+                f"There are only 3 grid axes, it is not possible to reduce axes {reduce_grid}"
+            )
+
+        # Check if we are reducing the last dimensions. Otherwise we will need to transpose the grid to
+        # put the dimension to reduce on the last axis. Note that we will not do that explicilty, transposing
+        # is done inside the cython routines (per index) to reduce memory footprint.
+        need_transpose = False
+        sorted_reduce_axes = np.sort(reduce_grid)
+        if len(reduce_grid) > 0:
+            need_transpose = sorted_reduce_axes[-1] != 2
+        if len(reduce_grid) > 1:
+            need_transpose = need_transpose or sorted_reduce_axes[-2] != 1
+        if len(reduce_grid) > 2:
+            need_transpose = need_transpose or sorted_reduce_axes[0] != 0
+
+        # Calculate the reducing factor and the new reduced shape.
+        for reduce_axis in sorted_reduce_axes:
+            reduce_factor *= self.grid_shape[reduce_axis]
+            reduced_shape[reduce_axis] = 1
+
+        # Prepare the quantities needed to transpose.
+        grid_shape = np.array(self.grid_shape, dtype=np.int32)
+        if need_transpose:
+            not_reduced_axes = np.sort(tuple(set([0, 1, 2]) - set(reduce_grid)))
+            new_axes_order = np.array([*not_reduced_axes, *reduce_grid], dtype=np.int32)
+        else:
+            # A dummy array to pass to cython functions so that they don't complain
+            new_axes_order = np.zeros(0, dtype=np.int32)
+
+        # If it is a scipy sparse matrix, convert it to a sisl sparse matrix
+        if issparse(weights):
+            weights = SparseCSR(weights)
+
+        sparse_coeffs = isinstance(weights, SparseCSR)
+        multi_coeffs = len(weights.shape) == 3 and weights.shape[2] > 1
+
+        if out is None:
+            dtype = np.result_type(weights, csr)
+        else:
+            dtype = out.dtype
+
+        # Decide which function to use.
+        if sparse_coeffs:
+            if multi_coeffs:
+                reduce_func = reduce_sc_products_multicoeffs_sparse_denseindex
+                weights.astype = weights.copy
+            else:
+                reduce_func = reduce_sc_products
+                weights = weights.tocsr().toarray(order="C")
+        else:
+            if multi_coeffs:
+                reduce_func = reduce_sc_products_multicoeffs
+            else:
+                reduce_func = reduce_sc_products
+
+        if out is not None:
+            grid = out
+
+        if multi_coeffs:
+            if out is None:
+                grid = np.zeros((*reduced_shape, weights.shape[2]), dtype=dtype)
+                out = grid
+
+            out = grid.reshape(-1, weights.shape[2])
+        else:
+            if out is None:
+                grid = Grid(reduced_shape, geometry=self.geometry, dtype=dtype)
+                out = grid.grid
+
+            out = out.ravel()
+
+        reduce_func(
+            csr.data[:, 0].astype(dtype),
+            csr.ptr,
+            csr.col,
+            weights.astype(dtype=dtype),
+            self.geometry.no,
+            self.lattice.sc_off,
+            weights_lattice.isc_off,
+            reduce_factor=int(reduce_factor),
+            grid_shape=grid_shape,
+            new_axes=new_axes_order,
+            out=out,
+        )
+
+        return grid
diff --git a/src/sisl/_sparse_grid_ops.pyx b/src/sisl/_sparse_grid_ops.pyx
new file mode 100644
index 0000000000..8608ea8cb3
--- /dev/null
+++ b/src/sisl/_sparse_grid_ops.pyx
@@ -0,0 +1,904 @@
+import cython
+import numpy as np
+
+cimport numpy as np
+
+import math
+
+from sisl import SparseCSR
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cpdef cython.int transpose_raveled_index(index: cython.int, grid_shape: np.int32_t[:], new_order: np.int32_t[:]):
+    """Transposes a raveled index, given the shape that raveled it and the new axis order.
+
+    Currently it only works for 3D indices.
+
+    Parameters
+    -----------
+    index:
+        The index to transpose.
+    grid_shape:
+        The old grid shape.
+    new_order:
+        The new axes order. E.g. [1, 2, 0] will move the first axis to the last dimension.
+    """
+
+    unraveled: cython.int[3]
+    new_grid_shape: cython.int[3]
+    new_unraveled: cython.int[3]
+
+    remaining: cython.int
+    iaxis: cython.int
+
+    new_raveled: cython.int
+
+    remaining = index
+    for iaxis in range(2, -1, -1):
+        unraveled[iaxis] = remaining % grid_shape[iaxis]
+        remaining = remaining // grid_shape[iaxis]
+
+    for iaxis in range(3):
+        new_grid_shape[iaxis] = grid_shape[new_order[iaxis]]
+        new_unraveled[iaxis] = unraveled[new_order[iaxis]]
+
+    new_raveled = new_unraveled[2] + new_unraveled[1] * new_grid_shape[2] + new_unraveled[0] * new_grid_shape[1] * new_grid_shape[2]
+
+    return new_raveled
+
+# This function should be in sisl._sparse, but I don't know how to import it from there
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def dense_index(shape, ptr: np.int32_t[:], col: np.int32_t[:]):
+    """Returns a dense array containing the value index for each nonzero (row, col) element.
+
+    Zero elements are asigned an index of -1, so routines using this dense index should
+    take this into account.
+
+    Parameters
+    -----------
+    shape: tuple of shape (2, )
+        The shape of the sparse matrix
+    ptr, col:
+        ptr and col descriptors of the matrix in CSR format
+
+    Returns
+    --------
+    np.ndarray:
+        Numpy array of shape = matrix shape and data type np.int32.
+    """
+    nrows: cython.int = ptr.shape[0] - 1
+    row: cython.int
+    row_start: cython.int
+    row_end: cython.int
+    ival: cython.int
+    val_col: cython.int
+
+    # Initialize the dense index
+    dense_index: np.int32_t[:, :] = np.full(shape, -1, dtype=np.int32)
+
+    for row in range(nrows):
+        row_start = ptr[row]
+        row_end = ptr[row + 1]
+
+        for ival in range(row_start, row_end):
+            val_col = col[ival]
+            dense_index[row, val_col] = ival
+
+    return np.asarray(dense_index)
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_grid_sum(
+    cython.numeric[:] data, ptr: np.int32_t[:], col: np.int32_t[:],
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.numeric[:] out
+):
+    """Performs sum over the extra dimension while reducing other dimensions of the grid.
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+    nrows: cython.int = ptr.shape[0] - 1
+
+    i: cython.int
+    reduced_i: cython.int
+    j: cython.int
+    jcol: cython.int
+
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    cdef cython.numeric row_value
+
+    for i in range(nrows):
+
+        if need_transpose:
+            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+        else:
+            reduced_i = i // reduce_factor
+
+        row_value = 0
+        for j in range(ptr[i], ptr[i + 1]):
+            jcol = col[j]
+            row_value += data[j]
+
+        out[reduced_i] = out[reduced_i] + row_value
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_grid_matvec_multiply(
+    cython.numeric[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.numeric[:] V,
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.numeric[:] out
+):
+    """Performs a matrix-vector multiplication while reducing other dimensions of the grid.
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    V:
+        The dense vector by which the sparse matrix is to be multiplied.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+
+    nrows: cython.int = ptr.shape[0] - 1
+
+    i: cython.int
+    reduced_i: cython.int
+    j: cython.int
+    jcol: cython.int
+
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    cdef cython.numeric row_value
+
+    for i in range(nrows):
+        reduced_i = i // reduce_factor
+
+        if need_transpose:
+            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+        else:
+            reduced_i = i // reduce_factor
+
+        row_value = 0
+        for j in range(ptr[i], ptr[i + 1]):
+            jcol = col[j]
+            row_value += data[j] * V[jcol]
+
+        out[reduced_i] = out[reduced_i] + row_value
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_grid_matvecs_multiply(
+    cython.numeric[:] data, np.int32_t[:] ptr, np.int32_t[:] col, cython.numeric[:, :] V,
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.numeric[:, :] out
+):
+    """Performs a matrix-matrix multiplication while reducing other dimensions of the grid.
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    V:
+        The dense matrix by which the sparse matrix is to be multiplied.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+
+    nrows: cython.int = ptr.shape[0] - 1
+    nvecs: cython.int = V.shape[1]
+
+    i: cython.int
+    reduced_i: cython.int
+    j: cython.int
+    jcol: cython.int
+    ivec: cython.int
+
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    cdef cython.numeric[:] row_value = np.zeros(nvecs, np.asarray(data).dtype)
+
+    for i in range(nrows):
+        if need_transpose:
+            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+        else:
+            reduced_i = i // reduce_factor
+
+        row_value[:] = 0
+        for j in range(ptr[i], ptr[i + 1]):
+            jcol = col[j]
+
+            for ivec in range(nvecs):
+                row_value[ivec] = row_value[ivec] + data[j] * V[jcol, ivec]
+
+        for ivec in range(nvecs):
+            out[reduced_i, ivec] = out[reduced_i, ivec] + row_value[ivec]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_sc_products(
+    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.floating[:, :] coeffs,
+    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.floating[:] out
+):
+    """For each row, sums all possible products between column pairs.
+
+    A coefficients array is accepted to weight each product.
+
+    This function is to be used when columns are some local property of the cell (orbital, atom...) and
+    there are interactions between neighbouring cells.
+
+    For each pair of columns ij, the offset between columns is calculated. Then, the function looks
+    in the coeffs array (which potentially contains information for interactions between cells) for the ij pair
+    with that offset.
+
+    This
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    coeffs:
+        The dense matrix containing weights for all possible products.
+    uc_ncol:
+        Number of columns for each unit cell.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+    nrows: cython.int = ptr.shape[0] - 1
+
+    # Indices to handle rows
+    row: cython.int
+    reduced_row: cython.int
+    row_start: cython.int
+    row_end: cython.int
+
+    # Indices to handle pairs of columns (ij)
+    i: cython.int
+    icol: cython.int
+    uc_icol: cython.int
+    sc_icol: cython.int
+    icol_sc: cython.int
+    ipair_sc: cython.int
+
+    j: cython.int
+    jcol: cython.int
+    uc_jcol: cython.int
+    sc_jcol: cython.int
+    jcol_sc: cython.int
+    jpair_sc: cython.int
+
+    # Temporal storage to build values
+    cdef cython.floating row_value
+    cdef cython.floating i_row_value
+
+    # Index to loop over axes of the grid.
+    iaxis: cython.int
+
+    # Variables that will help managing orbital pairs that are not within the same cell.
+    coeffs_nsc: cython.int[3]
+    sc_diff: cython.int[3]
+    inv_sc_diff: cython.int[3]
+    force_same_cell: cython.bint
+    same_cell: cython.bint
+
+    # Boolean to store whether we should reduce row indices
+    grid_reduce: cython.bint = reduce_factor > 1
+    # Do we need to transpose while reducing?
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    # Calculate the number of cells in each direction that the supercell is built of.
+    # This will be useful just to convert negative supercell indices to positive ones.
+    # If the number of supercells is 1, we assume that even intercell overlaps
+    # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
+    # with nsc <= 3.
+    force_same_cell = True
+    for iaxis in range(3):
+        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
+        if coeffs_nsc[iaxis] != 1:
+            force_same_cell = False
+
+    # Loop over rows.
+    for row in range(nrows):
+        # Get the potentially reduced index of the output row where we should store the
+        # results for this row.
+        reduced_row = row
+        if grid_reduce:
+
+            if need_transpose:
+                reduced_row = transpose_raveled_index(row, grid_shape, new_axes)
+
+            reduced_row = reduced_row // reduce_factor
+
+        # Get the limits of this row
+        row_start = ptr[row]
+        row_end = ptr[row + 1]
+
+        # Initialize the row value.
+        row_value = 0
+
+        # For each row, loop over pairs of columns (ij).
+        # We add both ij and ji contributions, therefore we only need to loop over j greater than i.
+        # We do this because it is very easy if orbitals i and j are in the same cell. We also save
+        # some computation if they are not.
+        for i in range(row_start, row_end):
+            icol = col[i]
+            # Initialize the value for all pairs that we found for i
+            i_row_value = 0
+
+            # Precompute the supercell index of icol (will compare it to that of jcol)
+            icol_sc = icol // uc_ncol
+            # And also its unit cell index
+            uc_icol = icol % uc_ncol
+
+            # Get also the
+            for j in range(i, row_end):
+                jcol = col[j]
+
+                jcol_sc = jcol // uc_ncol
+                # Get the unit cell index of jcol
+                uc_jcol = jcol % uc_ncol
+
+                same_cell = force_same_cell
+                # If same cell interactions are not forced, we need to discover if this pair
+                # of columns is within the same cell.
+                if not force_same_cell:
+                    same_cell = icol_sc == jcol_sc
+
+                # If the columns are not in the same cell, we need to
+                # (1) Calculate the supercell offset between icol and jcol
+                # (2) And then calculate the new index for jcol, moving icol to the unit cell
+                # (3) Do the same in the reverse direction (jcol -> icol)
+                if not same_cell:
+                    # Calculate the sc offset between both orbitals.
+                    for iaxis in range(3):
+                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        # Calculate also the offset in the reverse direction
+                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+
+                        # If the sc_difference is negative, convert it to positive so that we can
+                        # use it to index the isc_off array (we switched off the handling of negative
+                        # indices in cython with wraparound(False))
+                        if sc_diff[iaxis] < 0:
+                            sc_diff[iaxis] = coeffs_nsc[iaxis] + sc_diff[iaxis]
+                        elif inv_sc_diff[iaxis] < 0:
+                            inv_sc_diff[iaxis] = coeffs_nsc[iaxis] + inv_sc_diff[iaxis]
+
+                    # Get the supercell offset index of jcol with respect to icol
+                    jpair_sc = coeffs_isc_off[sc_diff[0], sc_diff[1], sc_diff[2]]
+                    # And use it to calculate the supercell index of the j orbital in this ij pair
+                    sc_jcol = jpair_sc * uc_ncol + uc_jcol
+
+                    # Do the same for the ji pair
+                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    sc_icol = ipair_sc * uc_ncol + uc_icol
+
+                # Add the contribution of this column pair to the row total value. Note that we only
+                # multiply the coefficients by data[j] here. This is because this loop is over all j
+                # that pair with a given i. data[i] is a common factor and therefore we can multiply
+                # after the loop to save operations.
+                if same_cell:
+                    if icol == jcol:
+                        i_row_value += coeffs[uc_icol, uc_jcol] * data[j]
+                    else:
+                        i_row_value += (coeffs[uc_icol, uc_jcol] + coeffs[uc_jcol, uc_icol]) * data[j]
+                else:
+                    i_row_value += (coeffs[uc_icol, sc_jcol] + coeffs[uc_jcol, sc_icol]) * data[j]
+
+            # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.
+            i_row_value *= data[i]
+
+            # Add the contribution of all ij pairs for this i to the row value.
+            row_value += i_row_value
+
+        # Store the row value in the output
+        out[reduced_row] = out[reduced_row] + row_value
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_sc_products_multicoeffs(
+    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.floating[:, :, :] coeffs,
+    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.floating[:, :] out,
+):
+    """For each row, sums all possible products between column pairs.
+
+    A coefficients array is accepted to weight each product. The with reduce_sc_products is that
+    this function accepts multiple
+
+    This function is to be used when columns are some local property of the cell (orbital, atom...) and
+    there are interactions between neighbouring cells.
+
+    For each pair of columns ij, the offset between columns is calculated. Then, the function looks
+    in the coeffs array (which potentially contains information for interactions between cells) for the ij pair
+    with that offset.
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    coeffs:
+        The dense matrix containing weights for all possible products.
+    uc_ncol:
+        Number of columns for each unit cell.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+    nrows: cython.int = ptr.shape[0] - 1
+
+    # Indices to handle rows
+    row: cython.int
+    reduced_row: cython.int
+    row_start: cython.int
+    row_end: cython.int
+
+    # Indices to handle pairs of columns (ij)
+    i: cython.int
+    icol: cython.int
+    uc_icol: cython.int
+    sc_icol: cython.int
+    icol_sc: cython.int
+    ipair_sc: cython.int
+
+    j: cython.int
+    jcol: cython.int
+    uc_jcol: cython.int
+    sc_jcol: cython.int
+    jcol_sc: cython.int
+    jpair_sc: cython.int
+
+    # Variables to handle multiple productcoefficients
+    ncoeffs: cython.int = coeffs.shape[2]
+    icoeff: cython.int
+
+    # Temporal storage to build values
+    cdef cython.floating[:] row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    cdef cython.floating[:] i_row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+
+    # Index to loop over axes of the grid.
+    iaxis: cython.int
+
+    # Variables that will help managing orbital pairs that are not within the same cell.
+    coeffs_nsc: cython.int[3]
+    sc_diff: cython.int[3]
+    inv_sc_diff: cython.int[3]
+    force_same_cell: cython.bint
+    same_cell: cython.bint
+
+    # Boolean to store whether we should reduce row indices
+    grid_reduce: cython.bint = reduce_factor > 1
+    # Do we need to transpose while reducing?
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    # Calculate the number of cells in each direction that the supercell is built of.
+    # This will be useful just to convert negative supercell indices to positive ones.
+    # If the number of supercells is 1, we assume that even intercell overlaps
+    # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
+    # with nsc <= 3.
+    force_same_cell = True
+    for iaxis in range(3):
+        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
+        if coeffs_nsc[iaxis] != 1:
+            force_same_cell = False
+
+    # Loop over rows.
+    for row in range(nrows):
+        # Get the potentially reduced index of the output row where we should store the
+        # results for this row.
+        reduced_row = row
+        if grid_reduce:
+
+            if need_transpose:
+                reduced_row = transpose_raveled_index(row, grid_shape, new_axes)
+
+            reduced_row = reduced_row // reduce_factor
+
+        # Get the limits of this row
+        row_start = ptr[row]
+        row_end = ptr[row + 1]
+
+        # Initialize the row value.
+        row_value[:] = 0
+
+        # For each row, loop over pairs of columns (ij).
+        # We add both ij and ji contributions, therefore we only need to loop over j greater than i.
+        # We do this because it is very easy if orbitals i and j are in the same cell. We also save
+        # some computation if they are not.
+        for i in range(row_start, row_end):
+            icol = col[i]
+            # Initialize the value for all pairs that we found for i
+            i_row_value[:] = 0
+
+            # Precompute the supercell index of icol (will compare it to that of jcol)
+            icol_sc = icol // uc_ncol
+            # And also its unit cell index
+            uc_icol = icol % uc_ncol
+
+            # Get also the
+            for j in range(i, row_end):
+                jcol = col[j]
+
+                jcol_sc = jcol // uc_ncol
+                # Get the unit cell index of jcol
+                uc_jcol = jcol % uc_ncol
+
+                same_cell = force_same_cell
+                # If same cell interactions are not forced, we need to discover if this pair
+                # of columns is within the same cell.
+                if not force_same_cell:
+                    same_cell = icol_sc == jcol_sc
+
+                # If the columns are not in the same cell, we need to
+                # (1) Calculate the supercell offset between icol and jcol
+                # (2) And then calculate the new index for jcol, moving icol to the unit cell
+                # (3) Do the same in the reverse direction (jcol -> icol)
+                if not same_cell:
+                    # Calculate the sc offset between both orbitals.
+                    for iaxis in range(3):
+                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        # Calculate also the offset in the reverse direction
+                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+
+                        # If the sc_difference is negative, convert it to positive so that we can
+                        # use it to index the isc_off array (we switched off the handling of negative
+                        # indices in cython with wraparound(False))
+                        if sc_diff[iaxis] < 0:
+                            sc_diff[iaxis] = coeffs_nsc[iaxis] + sc_diff[iaxis]
+                        elif inv_sc_diff[iaxis] < 0:
+                            inv_sc_diff[iaxis] = coeffs_nsc[iaxis] + inv_sc_diff[iaxis]
+
+                    # Get the supercell offset index of jcol with respect to icol
+                    jpair_sc = coeffs_isc_off[sc_diff[0], sc_diff[1], sc_diff[2]]
+                    # And use it to calculate the supercell index of the j orbital in this ij pair
+                    sc_jcol = jpair_sc * uc_ncol + uc_jcol
+
+                    # Do the same for the ji pair
+                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    sc_icol = ipair_sc * uc_ncol + uc_icol
+
+                # Add the contribution of this column pair to the row total value. Note that we only
+                # multiply the coefficients by data[j] here. This is because this loop is over all j
+                # that pair with a given i. data[i] is a common factor and therefore we can multiply
+                # after the loop to save operations.
+
+                # Do it for all coefficients.
+                for icoeff in range(ncoeffs):
+                    if same_cell:
+                        if icol == jcol:
+                            i_row_value[icoeff] += coeffs[uc_icol, uc_jcol, icoeff] * data[j]
+                        else:
+                            i_row_value[icoeff] += (coeffs[uc_icol, uc_jcol, icoeff] + coeffs[uc_jcol, uc_icol, icoeff]) * data[j]
+                    else:
+                        i_row_value[icoeff] += (coeffs[uc_icol, sc_jcol, icoeff] + coeffs[uc_jcol, sc_icol, icoeff]) * data[j]
+
+            for icoeff in range(ncoeffs):
+                # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.
+                i_row_value[icoeff] *= data[i]
+
+                # Add the contribution of all ij pairs for this i to the row value.
+                row_value[icoeff] += i_row_value[icoeff]
+
+        for icoeff in range(ncoeffs):
+            # Store the row value in the output
+            out[reduced_row, icoeff] = out[reduced_row, icoeff] + row_value[icoeff]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+def reduce_sc_products_multicoeffs_sparse_denseindex(
+    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], coeffs_csr: SparseCSR,
+    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
+    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
+    cython.floating[:, :] out,
+):
+    """For each row, sums all possible products between column pairs.
+
+    A coefficients array is accepted to weight each product. The with reduce_sc_products is that
+    this function accepts multiple
+
+    This function is to be used when columns are some local property of the cell (orbital, atom...) and
+    there are interactions between neighbouring cells.
+
+    For each pair of columns ij, the offset between columns is calculated. Then, the function looks
+    in the coeffs array (which potentially contains information for interactions between cells) for the ij pair
+    with that offset.
+
+    Parameters
+    ----------
+    data:
+        The data values of the sparse matrix
+    ptr:
+        The array with pointers to the start of each row for the sparse matrix.
+    col:
+        The array containing the column index for each value in the sparse matrix.
+    coeffs:
+        The dense matrix containing weights for all possible products.
+    uc_ncol:
+        Number of columns for each unit cell.
+    reduce_factor:
+        Each row index is integer divided (//) by this factor to get the row index where
+        the result should be stored.
+    grid_shape:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the (old) grid shape. If not, pass here an array of shape 0.
+    new_axes:
+        If you want to transpose the grid indices before reducing it, you need to specify
+        the nex axes. As an example, [1, 2, 0] will move the first axis to the last dimension.
+
+        If you don't want to transpose, pass an array of shape 0.
+    out:
+        The array where the output should be stored.
+    """
+    nrows: cython.int = ptr.shape[0] - 1
+
+    # Indices to handle rows
+    row: cython.int
+    reduced_row: cython.int
+    row_start: cython.int
+    row_end: cython.int
+
+    # Indices to handle pairs of columns (ij)
+    i: cython.int
+    icol: cython.int
+    uc_icol: cython.int
+    sc_icol: cython.int
+    icol_sc: cython.int
+    ipair_sc: cython.int
+
+    j: cython.int
+    jcol: cython.int
+    uc_jcol: cython.int
+    sc_jcol: cython.int
+    jcol_sc: cython.int
+    jpair_sc: cython.int
+
+    # Extra variables to handle coefficients
+    cdef cython.floating[:, :] coeffs = coeffs_csr.data
+    # Get the array containing all the dense indices.
+    dense_idx: np.int32_t[:, :] = dense_index(coeffs_csr.shape[:2], coeffs_csr.ptr, coeffs_csr.col)
+    coeff_index: cython.int
+    coeff_index2: cython.int
+
+    # Variables to handle multiple productcoefficients
+    ncoeffs: cython.int = coeffs.shape[1]
+    icoeff: cython.int
+
+    # Temporal storage to build values
+    cdef cython.floating[:] row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    cdef cython.floating[:] i_row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+
+    # Index to loop over axes of the grid.
+    iaxis: cython.int
+
+    # Variables that will help managing orbital pairs that are not within the same cell.
+    coeffs_nsc: cython.int[3]
+    sc_diff: cython.int[3]
+    inv_sc_diff: cython.int[3]
+    force_same_cell: cython.bint
+    same_cell: cython.bint
+
+    # Boolean to store whether we should reduce row indices
+    grid_reduce: cython.bint = reduce_factor > 1
+    # Do we need to transpose while reducing?
+    need_transpose: cython.bint = new_axes.shape[0] > 1
+
+    # Calculate the number of cells in each direction that the supercell is built of.
+    # This will be useful just to convert negative supercell indices to positive ones.
+    # If the number of supercells is 1, we assume that even intercell overlaps
+    # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
+    # with nsc <= 3.
+    force_same_cell = True
+    for iaxis in range(3):
+        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
+        if coeffs_nsc[iaxis] != 1:
+            force_same_cell = False
+
+    # Loop over rows.
+    for row in range(nrows):
+        # Get the potentially reduced index of the output row where we should store the
+        # results for this row.
+        reduced_row = row
+        if grid_reduce:
+
+            if need_transpose:
+                reduced_row = transpose_raveled_index(row, grid_shape, new_axes)
+
+            reduced_row = reduced_row // reduce_factor
+
+        # Get the limits of this row
+        row_start = ptr[row]
+        row_end = ptr[row + 1]
+
+        # Initialize the row value.
+        row_value[:] = 0
+
+        # For each row, loop over pairs of columns (ij).
+        # We add both ij and ji contributions, therefore we only need to loop over j greater than i.
+        # We do this because it is very easy if orbitals i and j are in the same cell. We also save
+        # some computation if they are not.
+        for i in range(row_start, row_end):
+            icol = col[i]
+            # Initialize the value for all pairs that we found for i
+            i_row_value[:] = 0
+
+            # Precompute the supercell index of icol (will compare it to that of jcol)
+            icol_sc = icol // uc_ncol
+            # And also its unit cell index
+            uc_icol = icol % uc_ncol
+
+            # Get also the
+            for j in range(i, row_end):
+                jcol = col[j]
+
+                jcol_sc = jcol // uc_ncol
+                # Get the unit cell index of jcol
+                uc_jcol = jcol % uc_ncol
+
+                same_cell = force_same_cell
+                # If same cell interactions are not forced, we need to discover if this pair
+                # of columns is within the same cell.
+                if not force_same_cell:
+                    same_cell = icol_sc == jcol_sc
+
+                # If the columns are not in the same cell, we need to
+                # (1) Calculate the supercell offset between icol and jcol
+                # (2) And then calculate the new index for jcol, moving icol to the unit cell
+                # (3) Do the same in the reverse direction (jcol -> icol)
+                if not same_cell:
+                    # Calculate the sc offset between both orbitals.
+                    for iaxis in range(3):
+                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        # Calculate also the offset in the reverse direction
+                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+
+                        # If the sc_difference is negative, convert it to positive so that we can
+                        # use it to index the isc_off array (we switched off the handling of negative
+                        # indices in cython with wraparound(False))
+                        if sc_diff[iaxis] < 0:
+                            sc_diff[iaxis] = coeffs_nsc[iaxis] + sc_diff[iaxis]
+                        elif inv_sc_diff[iaxis] < 0:
+                            inv_sc_diff[iaxis] = coeffs_nsc[iaxis] + inv_sc_diff[iaxis]
+
+                    # Get the supercell offset index of jcol with respect to icol
+                    jpair_sc = coeffs_isc_off[sc_diff[0], sc_diff[1], sc_diff[2]]
+                    # And use it to calculate the supercell index of the j orbital in this ij pair
+                    sc_jcol = jpair_sc * uc_ncol + uc_jcol
+
+                    # Do the same for the ji pair
+                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    sc_icol = ipair_sc * uc_ncol + uc_icol
+
+                # Get the index needed to find the coefficients that we want from the coeffs array.
+                if same_cell:
+                    if icol == jcol:
+                        coeff_index = dense_idx[uc_icol, uc_jcol]
+                        coeff_index2 = 0
+                    else:
+                        coeff_index = dense_idx[uc_icol, uc_jcol]
+                        coeff_index2 = dense_idx[uc_jcol, uc_icol]
+                else:
+                    coeff_index = dense_idx[uc_icol, sc_jcol]
+                    coeff_index2 = dense_idx[uc_jcol, sc_icol]
+
+                # If the index for the needed (row, col) element is -1, it means that the element is 0.
+                # Just go to next iteration if all elements that we need are 0. Note that we assume here
+                # that if (row, col) is zero (col, row) is also zero.
+                if coeff_index < 0:
+                    continue
+
+                # Add the contribution of this column pair to the row total value. Note that we only
+                # multiply the coefficients by data[j] here. This is because this loop is over all j
+                # that pair with a given i. data[i] is a common factor and therefore we can multiply
+                # after the loop to save operations.
+
+                # Do it for all coefficients.
+                for icoeff in range(ncoeffs):
+                    if same_cell:
+                        if icol == jcol:
+                            i_row_value[icoeff] += coeffs[coeff_index, icoeff] * data[j]
+                        else:
+                            i_row_value[icoeff] += (coeffs[coeff_index, icoeff] + coeffs[coeff_index2, icoeff]) * data[j]
+                    else:
+                        i_row_value[icoeff] += (coeffs[coeff_index, icoeff] + coeffs[coeff_index2, icoeff]) * data[j]
+
+            for icoeff in range(ncoeffs):
+                # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.
+                i_row_value[icoeff] *= data[i]
+
+                # Add the contribution of all ij pairs for this i to the row value.
+                row_value[icoeff] += i_row_value[icoeff]
+
+        for icoeff in range(ncoeffs):
+            # Store the row value in the output
+            out[reduced_row, icoeff] = out[reduced_row, icoeff] + row_value[icoeff]
diff --git a/src/sisl/physics/densitymatrix.py b/src/sisl/physics/densitymatrix.py
index 5112653604..770479cbb9 100644
--- a/src/sisl/physics/densitymatrix.py
+++ b/src/sisl/physics/densitymatrix.py
@@ -5,7 +5,7 @@
 
 import math as m
 from numbers import Integral
-from typing import Optional
+from typing import Literal, Optional
 
 import numpy as np
 from numpy import add, dot, logical_and, repeat, subtract, unique
@@ -15,7 +15,7 @@
 
 import sisl._array as _a
 from sisl import BoundaryCondition as BC
-from sisl import Geometry, Lattice
+from sisl import Geometry, Grid, Lattice
 from sisl._core.sparse import SparseCSR, _ncol_to_indptr, _to_coo
 from sisl._core.sparse_geometry import SparseAtom, SparseOrbital
 from sisl._indices import indices_fabs_le, indices_le
@@ -688,7 +688,15 @@ def get_BO(geom, D, S, rows, cols):
         "0.15",
         "0.16",
     )
-    def density(self, grid, spinor=None, atol: float = 1e-7, eta=None):
+    def density(
+        self,
+        grid: Grid,
+        spinor=None,
+        atol: float = 1e-7,
+        eta: Optional[bool] = False,
+        method: Literal["pre-compute", "direct"] = "pre-compute",
+        **kwargs,
+    ):
         r"""Expand the density matrix to the charge density on a grid
 
         This routine calculates the real-space density components on a specified grid.
@@ -727,29 +735,23 @@ def density(self, grid, spinor=None, atol: float = 1e-7, eta=None):
            the tolerance, they will be treated as strictly zeros.
         eta : bool, optional
            show a progressbar on stdout
-        """
-        geometry = self.geometry
-        # Check that the atomic coordinates, really are all within the intrinsic supercell.
-        # If not, it may mean that the DM does not conform to the primary unit-cell paradigm
-        # of matrix elements. It complicates things.
-        fxyz = geometry.fxyz
-        f_min = fxyz.min()
-        f_max = fxyz.max()
-        del fxyz, f_min, f_max
+        method:
+           It determines if the orbital values are computed on the fly (direct) or they are all pre-computed
+           on the grid at the beginning(pre-compute).
+           Pre computing orbitals results in a faster computation, but it requires more memory.
 
-        # Extract sub variables used throughout the loop
-        shape = _a.asarrayi(grid.shape)
-        dcell = grid.dcell
-
-        # Sparse matrix data
-        csr = self._csr
+        Notes
+        -----
 
-        # In the following we don't care about division
-        # So 1) save error state, 2) turn off divide by 0, 3) calculate, 4) turn on old error state
-        old_err = np.seterr(divide="ignore", invalid="ignore")
+        The `method` argument may change at will since this is an experimental feature.
+        """
+        # Translate the density matrix to have all the unit cell atoms actually inside
+        # the unit cell, since this will facilitate things greatly and it gives the
+        # same result.
+        uc_dm = self.translate2uc()
 
-        # Placeholder for the resulting coefficients
-        DM = None
+        # Get the DM components with which we want to compute the density
+        csr = uc_dm._csr
         if self.spin.kind > Spin.POLARIZED:
             if spinor is None:
                 # Default to the total density
@@ -779,6 +781,15 @@ def density(self, grid, spinor=None, atol: float = 1e-7, eta=None):
             # Perform dot-product with spinor, and take out the diagonal real part
             DM = dot(DM, spinor.T)[:, [0, 1], [0, 1]].sum(1).real
 
+            # Create the DM csr matrix.
+            csrDM = csr_matrix(
+                (DM, csr.col[idx], _ncol_to_indptr(csr.ncol)),
+                shape=(self.shape[:2]),
+                dtype=DM.dtype,
+            )
+
+            del idx, DM
+
         elif self.spin.kind == Spin.POLARIZED:
             if spinor is None:
                 spinor = _a.onesd(2)
@@ -797,22 +808,50 @@ def density(self, grid, spinor=None, atol: float = 1e-7, eta=None):
                     "argument as an integer, or a vector of length 2"
                 )
 
-            idx = _a.array_arange(csr.ptr[:-1], n=csr.ncol)
-            DM = csr._D[idx, 0] * spinor[0] + csr._D[idx, 1] * spinor[1]
+            csrDM = csr.tocsr(dim=0) * spinor[0] + csr.tocsr(dim=1) * spinor[1]
 
         else:
-            idx = _a.array_arange(csr.ptr[:-1], n=csr.ncol)
-            DM = csr._D[idx, 0]
+            csrDM = csr.tocsr(dim=0)
 
-        # Create the DM csr matrix.
-        csrDM = csr_matrix(
-            (DM, csr.col[idx], _ncol_to_indptr(csr.ncol)),
-            shape=(self.shape[:2]),
-            dtype=DM.dtype,
-        )
+        if method == "pre-compute":
+            # Compute orbital values on the grid
+            psi_values = uc_dm.geometry._orbital_values(grid.shape)
+
+            psi_values.reduce_orbital_products(
+                csrDM, uc_dm.lattice, out=grid.grid, **kwargs
+            )
+        elif method == "direct":
+            self._density_direct(grid, csrDM, atol=atol, eta=eta)
+
+    def _density_direct(
+        self, grid: Grid, csrDM, atol: float = 1e-7, eta: Optional[bool] = None
+    ):
+        r"""Compute the density by calculating the orbital values on the fly.
 
-        # Clean-up
-        del idx, DM
+        Parameters
+        ----------
+        grid : Grid
+           the grid on which to add the density (the density is in ``e/Ang^3``)
+        spinor : (2,) or (2, 2), optional
+           the spinor matrix to obtain the diagonal components of the density. For un-polarized density matrices
+           this keyword has no influence. For spin-polarized it *has* to be either 1 integer or a vector of
+           length 2 (defaults to total density).
+           For non-collinear/spin-orbit density matrices it has to be a 2x2 matrix (defaults to total density).
+        atol : float, optional
+           DM tolerance for accepted values. For all density matrix elements with absolute values below
+           the tolerance, they will be treated as strictly zeros.
+        eta : bool, optional
+           show a progressbar on stdout
+        """
+        geometry = self.geometry
+
+        # Extract sub variables used throughout the loop
+        shape = _a.asarrayi(grid.shape)
+        dcell = grid.dcell
+
+        # In the following we don't care about division
+        # So 1) save error state, 2) turn off divide by 0, 3) calculate, 4) turn on old error state
+        old_err = np.seterr(divide="ignore", invalid="ignore")
 
         # To heavily speed up the construction of the density we can recreate
         # the sparse csrDM matrix by summing the lower and upper triangular part.
diff --git a/src/sisl/physics/tests/test_density_matrix.py b/src/sisl/physics/tests/test_density_matrix.py
index dc263a2936..a7fe7e160a 100644
--- a/src/sisl/physics/tests/test_density_matrix.py
+++ b/src/sisl/physics/tests/test_density_matrix.py
@@ -88,6 +88,11 @@ def func(D, ia, atoms, atoms_xyz):
     return t()
 
 
+@pytest.fixture(scope="module", params=["direct", "pre-compute"])
+def density_method(request):
+    return request.param
+
+
 @pytest.mark.physics
 @pytest.mark.densitymatrix
 class TestDensityMatrix:
@@ -183,14 +188,14 @@ def test_bond_order(self, setup, method, option, projection):
             assert isinstance(BO, SparseOrbital)
             assert BO.shape[:2] == (D.geometry.no, D.geometry.no_s)
 
-    def test_rho1(self, setup):
+    def test_rho1(self, setup, density_method):
         D = setup.D.copy()
         D.construct(setup.func)
         grid = Grid(0.2, geometry=setup.D.geometry)
-        D.density(grid)
+        D.density(grid, method=density_method)
 
     @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
-    def test_rho2(self):
+    def test_rho2(self, density_method):
         bond = 1.42
         sq3h = 3.0**0.5 * 0.5
         lattice = Lattice(
@@ -214,23 +219,23 @@ def test_rho2(self):
         D = DensityMatrix(g)
         D.construct([[0.1, bond + 0.01], [1.0, 0.1]])
         grid = Grid(0.2, geometry=D.geometry)
-        D.density(grid)
+        D.density(grid, method=density_method)
 
         D = DensityMatrix(g, spin=Spin("P"))
         D.construct([[0.1, bond + 0.01], [(1.0, 0.5), (0.1, 0.1)]])
         grid = Grid(0.2, geometry=D.geometry)
-        D.density(grid)
-        D.density(grid, [1.0, -1])
-        D.density(grid, 0)
-        D.density(grid, 1)
+        D.density(grid, method=density_method)
+        D.density(grid, [1.0, -1], method=density_method)
+        D.density(grid, 0, method=density_method)
+        D.density(grid, 1, method=density_method)
 
         D = DensityMatrix(g, spin=Spin("NC"))
         D.construct(
             [[0.1, bond + 0.01], [(1.0, 0.5, 0.01, 0.01), (0.1, 0.1, 0.1, 0.1)]]
         )
         grid = Grid(0.2, geometry=D.geometry)
-        D.density(grid)
-        D.density(grid, [[1.0, 0.0], [0.0, -1]])
+        D.density(grid, method=density_method)
+        D.density(grid, [[1.0, 0.0], [0.0, -1]], method=density_method)
 
         D = DensityMatrix(g, spin=Spin("SO"))
         D.construct(
@@ -243,11 +248,11 @@ def test_rho2(self):
             ]
         )
         grid = Grid(0.2, geometry=D.geometry)
-        D.density(grid)
-        D.density(grid, [[1.0, 0.0], [0.0, -1]])
-        D.density(grid, Spin.X)
-        D.density(grid, Spin.Y)
-        D.density(grid, Spin.Z)
+        D.density(grid, method=density_method)
+        D.density(grid, [[1.0, 0.0], [0.0, -1]], method=density_method)
+        D.density(grid, Spin.X, method=density_method)
+        D.density(grid, Spin.Y, method=density_method)
+        D.density(grid, Spin.Z, method=density_method)
 
     @pytest.mark.filterwarnings("ignore", message="*is NOT Hermitian for on-site")
     def test_orbital_momentum(self):
@@ -474,20 +479,20 @@ def test_spin_rotate_so(self):
         assert not np.allclose(D_mull, d_mull)
         assert np.allclose(D_mull[0], d_mull[0])
 
-    def test_rho_eta(self, setup):
+    def test_rho_eta(self, setup, density_method):
         D = setup.D.copy()
         D.construct(setup.func)
         grid = Grid(0.2, geometry=setup.D.geometry)
-        D.density(grid, eta=True)
+        D.density(grid, eta=True, method=density_method)
 
-    def test_rho_smaller_grid1(self, setup):
+    def test_rho_smaller_grid1(self, setup, density_method):
         D = setup.D.copy()
         D.construct(setup.func)
         lattice = setup.D.geometry.cell.copy() / 2
         grid = Grid(0.2, geometry=setup.D.geometry.copy(), lattice=lattice)
-        D.density(grid)
+        D.density(grid, method=density_method)
 
-    def test_rho_fail_p(self):
+    def test_rho_fail_p(self, density_method):
         bond = 1.42
         sq3h = 3.0**0.5 * 0.5
         lattice = Lattice(
@@ -513,9 +518,9 @@ def test_rho_fail_p(self):
         D.construct([[0.1, bond + 0.01], [(1.0, 0.5), (0.1, 0.1)]])
         grid = Grid(0.2, geometry=D.geometry)
         with pytest.raises(ValueError):
-            D.density(grid, [1.0, -1, 0.0])
+            D.density(grid, [1.0, -1, 0.0], method=density_method)
 
-    def test_rho_fail_nc(self):
+    def test_rho_fail_nc(self, density_method):
         bond = 1.42
         sq3h = 3.0**0.5 * 0.5
         lattice = Lattice(
@@ -543,7 +548,7 @@ def test_rho_fail_nc(self):
         )
         grid = Grid(0.2, geometry=D.geometry)
         with pytest.raises(ValueError):
-            D.density(grid, [1.0, 0.0])
+            D.density(grid, [1.0, 0.0], method=density_method)
 
     def test_pickle(self, setup):
         import pickle as p
diff --git a/src/sisl/tests/test_sparse_grid.py b/src/sisl/tests/test_sparse_grid.py
new file mode 100644
index 0000000000..24cfff6c30
--- /dev/null
+++ b/src/sisl/tests/test_sparse_grid.py
@@ -0,0 +1,312 @@
+import numpy as np
+import pytest
+
+import sisl
+from sisl import Grid
+from sisl._sparse_grid_ops import transpose_raveled_index
+
+
+@pytest.fixture
+def geometry():
+
+    r = np.linspace(0, 3.5, 50)
+    f = np.exp(-r)
+
+    orb = sisl.AtomicOrbital("2pzZ", (r, f))
+    geom = sisl.geom.graphene(orthogonal=True, atoms=sisl.Atom(6, orb))
+
+    return geom
+
+
+@pytest.fixture
+def grid_shape():
+    return (8, 10, 12)
+
+
+@pytest.fixture
+def psi_values(geometry, grid_shape):
+    return geometry._orbital_values(grid_shape)
+
+
+@pytest.fixture
+def H(geometry):
+    H = sisl.Hamiltonian(geometry)
+    H.construct(
+        [(0.1, 1.44), (0, -2.7)],
+    )
+    return H
+
+
+def test_transpose_raveled():
+    """Tests the cython implemented function that transposes
+    raveled indices."""
+
+    # Define the reference function, which calls numpy
+    def transpose_raveled_numpy(index, grid_shape, new_order):
+        grid_shape = np.array(grid_shape)
+
+        unraveled = np.unravel_index(index, grid_shape)
+        unraveled = np.array(unraveled)
+        new_grid_shape = grid_shape[new_order]
+        new_unraveled = unraveled[new_order]
+
+        return np.ravel_multi_index(new_unraveled, new_grid_shape)
+
+    grid_shape = np.array([3, 4, 5], dtype=np.int32)
+    grid_size = np.prod(grid_shape)
+    # Move the first axis to the last dimension.
+    new_order = np.array([1, 2, 0], dtype=np.int32)
+
+    for i in range(grid_size):
+        transposed = transpose_raveled_index(i, grid_shape, new_order)
+        transposed_np = transpose_raveled_numpy(i, grid_shape, new_order)
+
+        assert transposed == transposed_np
+
+    # Move the last axis to the first dimension.
+    new_order = np.array([2, 0, 1], dtype=np.int32)
+
+    for i in range(grid_size):
+        transposed = transpose_raveled_index(i, grid_shape, new_order)
+        transposed_np = transpose_raveled_numpy(i, grid_shape, new_order)
+
+        assert transposed == transposed_np
+
+
+@pytest.mark.parametrize("k", [(0, 0, 0), (0.5, 0, 0)])
+def test_wavefunction_correct(H, grid_shape, psi_values, k):
+    """Checks that the wavefunction computed with the precalculated
+    psi values is the same as the wavefunction computed directly."""
+
+    eig = H.eigenstate(k=k)[0]
+
+    wf_grid = sisl.Grid(grid_shape, geometry=H.geometry, dtype=np.complex128)
+
+    eig.wavefunction(wf_grid)
+
+    from_psi_grid = psi_values.reduce_orbitals(eig.state.T, k=k)
+
+    assert np.allclose(from_psi_grid.grid, wf_grid.grid)
+
+
+@pytest.mark.parametrize(
+    ["k", "ncoeffs"],
+    [[(0, 0, 0), 1], [(0, 0, 0), 2], [(0.25, 0, 0), 1], [(0.25, 0, 0), 2]],
+)
+def test_onthefly_reduction(geometry, psi_values, k, ncoeffs):
+    """Checks that the on the fly reduction produces the same
+    results as computing the whole grid and then reducing."""
+
+    coeffs = np.random.random((geometry.no, ncoeffs))
+
+    not_reduced = psi_values.reduce_orbitals(coeffs, k=k)
+
+    # Reducing the last axis
+    reduced = psi_values.reduce_orbitals(coeffs, k=k, reduce_grid=(2,))
+
+    post_reduced = not_reduced.sum(2)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Reducing the two last axes
+    reduced = psi_values.reduce_orbitals(coeffs, k=k, reduce_grid=(2, 1))
+
+    post_reduced = not_reduced.sum(2).sum(1)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Now we are going to reduce the leading axes. This is more involved, so
+    # it is more likely to fail.
+
+    # Reducing the first axis
+    reduced = psi_values.reduce_orbitals(coeffs, k=k, reduce_grid=(0,))
+
+    post_reduced = not_reduced.sum(0)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Reducing the first and second axis
+    reduced = psi_values.reduce_orbitals(coeffs, k=k, reduce_grid=(0, 1))
+
+    post_reduced = not_reduced.sum(1).sum(0)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+
+def test_orbital_products(geometry):
+    """Very simple tests to see if the orbital products are computed correctly"""
+    geometry = geometry.copy()
+    geometry.set_nsc([1, 1, 1])
+    geometry.lattice.set_boundary_condition("open")
+
+    # Don't get periodic contributions for orbital values, then predicting
+    # grid values will be much easier.
+    psi_values = geometry._orbital_values((10, 10, 10))
+
+    orb_csr = psi_values._csr.tocsr()
+    orb_0 = orb_csr[:, 0].toarray().ravel()
+    orb_1 = orb_csr[:, 1].toarray().ravel()
+
+    # Compute the orbital products with one coefficient
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.sc)
+    assert isinstance(dens, Grid)
+    assert dens.shape == psi_values.grid_shape
+    assert np.any(dens.grid != 0)
+
+    predicted = orb_0**2
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+    # Compute the orbital products with two diagonal coefficients
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+    DM[1, 1] = 2.0
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    predicted = (orb_0**2) * 1.0 + (orb_1**2) * 2.0
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+    # Compute the orbital products with an off-diagonal coefficient
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+    DM[0, 1] = 2.0
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    predicted = (orb_0**2) * 1.0 + (orb_0 * orb_1) * 2.0
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+    # Compute the orbital products with both opposite off-diagonal coefficients
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+    DM[0, 1] = 1.0
+    DM[1, 0] = 0.5
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    predicted = (orb_0**2) * 1.0 + (orb_0 * orb_1) * 1.5
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+
+def test_orbital_products_periodic(geometry):
+    """Very simple tests to see if the orbital products are computed correctly
+    when there are periodic conditions.
+    """
+    geometry = geometry.copy()
+    geometry.set_nsc([3, 1, 1])
+    geometry.lattice.set_boundary_condition(a="periodic", b="open", c="open")
+
+    # Don't get periodic contributions for orbital values, then predicting
+    # grid values will be much easier.
+    psi_values = geometry._orbital_values((10, 10, 10))
+
+    orb_csr = psi_values._csr.tocsr()
+    orb_0 = orb_csr[:, [0, geometry.no, 2 * geometry.no]].toarray()
+    orb_1 = orb_csr[:, [1, geometry.no + 1, 2 * geometry.no + 1]].toarray()
+
+    # Compute the orbital products with one coefficient
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.sc)
+
+    predicted = (orb_0**2).sum(axis=1)
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+    # Compute the orbital products with two diagonal coefficients
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+    DM[1, 1] = 2.0
+
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    predicted = (orb_0**2).sum(axis=1) + (orb_1**2).sum(axis=1) * 2.0
+
+    assert np.allclose(dens.grid.ravel(), predicted)
+
+    # Predicting the result with periodic non-diagonal coefficients is a bit more involved.
+    # We don't check for now.
+
+
+@pytest.mark.parametrize("ncoeffs", [1, 2])
+def test_orbital_products_onthefly_reduction(geometry, psi_values, ncoeffs):
+    """Checks that the on the fly reduction produces the same
+    results as computing the whole grid and then reducing."""
+
+    # Build a test DM
+    DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
+    DM[0, 0] = 1.0
+    DM[1, 1] = 2.0
+    DM[2, 2] = 0.75
+    DM[3, 3] = 0.92
+    DM[0, 1] = 0.5
+    DM[1, 0] = 0.5
+
+    csr = DM.tocsr()
+    # If this run is to be done with multiple coefficients, build the csr
+    # matrix with dim > 1.
+    if ncoeffs > 1:
+        csr = sisl.SparseCSR(csr, dim=ncoeffs)
+        csr.data[:, :] = csr.data[:, 0].reshape(-1, 1)
+
+    # Compute the values on the full grid
+    not_reduced = psi_values.reduce_orbital_products(csr, DM.sc)
+
+    # Reducing the last axis
+    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(2,))
+
+    post_reduced = not_reduced.sum(2)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Reducing the two last axes
+    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(2, 1))
+
+    post_reduced = not_reduced.sum(2).sum(1)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Now we are going to reduce the leading axes. This is more involved, so
+    # it is more likely to fail.
+
+    # Reducing the first axis
+    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(0,))
+
+    post_reduced = not_reduced.sum(0)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
+
+    # Reducing the first and second axis
+    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(0, 1))
+
+    post_reduced = not_reduced.sum(1).sum(0)
+    assert reduced.size == post_reduced.size
+    if ncoeffs == 1:
+        assert np.allclose(reduced.grid, post_reduced.grid)
+    else:
+        assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)

From 289b685f0b20a8a384a1e4d8694a8427b48fc178 Mon Sep 17 00:00:00 2001
From: Pol Febrer <pfebrer96@gmail.com>
Date: Fri, 15 Mar 2024 22:22:19 +0100
Subject: [PATCH 2/5] fix tests

---
 src/sisl/_sparse_grid.py           |  8 ++++++--
 src/sisl/tests/test_sparse_grid.py | 22 +++++++++++-----------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/sisl/_sparse_grid.py b/src/sisl/_sparse_grid.py
index bf349f569b..56a531c848 100644
--- a/src/sisl/_sparse_grid.py
+++ b/src/sisl/_sparse_grid.py
@@ -412,7 +412,6 @@ def reduce_orbital_products(
         if sparse_coeffs:
             if multi_coeffs:
                 reduce_func = reduce_sc_products_multicoeffs_sparse_denseindex
-                weights.astype = weights.copy
             else:
                 reduce_func = reduce_sc_products
                 weights = weights.tocsr().toarray(order="C")
@@ -438,11 +437,16 @@ def reduce_orbital_products(
 
             out = out.ravel()
 
+        if isinstance(weights, SparseCSR):
+            weights = weights.copy(dtype=dtype)
+        else:
+            weights = weights.astype(dtype)
+
         reduce_func(
             csr.data[:, 0].astype(dtype),
             csr.ptr,
             csr.col,
-            weights.astype(dtype=dtype),
+            weights,
             self.geometry.no,
             self.lattice.sc_off,
             weights_lattice.isc_off,
diff --git a/src/sisl/tests/test_sparse_grid.py b/src/sisl/tests/test_sparse_grid.py
index 24cfff6c30..17800fbc97 100644
--- a/src/sisl/tests/test_sparse_grid.py
+++ b/src/sisl/tests/test_sparse_grid.py
@@ -163,7 +163,7 @@ def test_orbital_products(geometry):
     DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
     DM[0, 0] = 1.0
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.lattice)
     assert isinstance(dens, Grid)
     assert dens.shape == psi_values.grid_shape
     assert np.any(dens.grid != 0)
@@ -177,7 +177,7 @@ def test_orbital_products(geometry):
     DM[0, 0] = 1.0
     DM[1, 1] = 2.0
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.lattice)
     predicted = (orb_0**2) * 1.0 + (orb_1**2) * 2.0
 
     assert np.allclose(dens.grid.ravel(), predicted)
@@ -187,7 +187,7 @@ def test_orbital_products(geometry):
     DM[0, 0] = 1.0
     DM[0, 1] = 2.0
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.lattice)
     predicted = (orb_0**2) * 1.0 + (orb_0 * orb_1) * 2.0
 
     assert np.allclose(dens.grid.ravel(), predicted)
@@ -198,7 +198,7 @@ def test_orbital_products(geometry):
     DM[0, 1] = 1.0
     DM[1, 0] = 0.5
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.lattice)
     predicted = (orb_0**2) * 1.0 + (orb_0 * orb_1) * 1.5
 
     assert np.allclose(dens.grid.ravel(), predicted)
@@ -224,7 +224,7 @@ def test_orbital_products_periodic(geometry):
     DM = sisl.DensityMatrix(geometry, dim=1, dtype=np.float64)
     DM[0, 0] = 1.0
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), geometry.lattice)
 
     predicted = (orb_0**2).sum(axis=1)
 
@@ -235,7 +235,7 @@ def test_orbital_products_periodic(geometry):
     DM[0, 0] = 1.0
     DM[1, 1] = 2.0
 
-    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.sc)
+    dens = psi_values.reduce_orbital_products(DM.tocsr(), DM.lattice)
     predicted = (orb_0**2).sum(axis=1) + (orb_1**2).sum(axis=1) * 2.0
 
     assert np.allclose(dens.grid.ravel(), predicted)
@@ -266,10 +266,10 @@ def test_orbital_products_onthefly_reduction(geometry, psi_values, ncoeffs):
         csr.data[:, :] = csr.data[:, 0].reshape(-1, 1)
 
     # Compute the values on the full grid
-    not_reduced = psi_values.reduce_orbital_products(csr, DM.sc)
+    not_reduced = psi_values.reduce_orbital_products(csr, DM.lattice)
 
     # Reducing the last axis
-    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(2,))
+    reduced = psi_values.reduce_orbital_products(csr, DM.lattice, reduce_grid=(2,))
 
     post_reduced = not_reduced.sum(2)
     assert reduced.size == post_reduced.size
@@ -279,7 +279,7 @@ def test_orbital_products_onthefly_reduction(geometry, psi_values, ncoeffs):
         assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
 
     # Reducing the two last axes
-    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(2, 1))
+    reduced = psi_values.reduce_orbital_products(csr, DM.lattice, reduce_grid=(2, 1))
 
     post_reduced = not_reduced.sum(2).sum(1)
     assert reduced.size == post_reduced.size
@@ -292,7 +292,7 @@ def test_orbital_products_onthefly_reduction(geometry, psi_values, ncoeffs):
     # it is more likely to fail.
 
     # Reducing the first axis
-    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(0,))
+    reduced = psi_values.reduce_orbital_products(csr, DM.lattice, reduce_grid=(0,))
 
     post_reduced = not_reduced.sum(0)
     assert reduced.size == post_reduced.size
@@ -302,7 +302,7 @@ def test_orbital_products_onthefly_reduction(geometry, psi_values, ncoeffs):
         assert np.allclose(reduced.reshape(post_reduced.shape), post_reduced)
 
     # Reducing the first and second axis
-    reduced = psi_values.reduce_orbital_products(csr, DM.sc, reduce_grid=(0, 1))
+    reduced = psi_values.reduce_orbital_products(csr, DM.lattice, reduce_grid=(0, 1))
 
     post_reduced = not_reduced.sum(1).sum(0)
     assert reduced.size == post_reduced.size

From 220b2ec172291abe0c028ae8595619379a5db313 Mon Sep 17 00:00:00 2001
From: Pol Febrer <pfebrer96@gmail.com>
Date: Wed, 20 Mar 2024 13:33:00 +0100
Subject: [PATCH 3/5] changed to pure python syntax

---
 src/sisl/CMakeLists.txt                       |  13 +-
 ...parse_grid_ops.pyx => _sparse_grid_ops.py} | 239 +++++++++++-------
 2 files changed, 165 insertions(+), 87 deletions(-)
 rename src/sisl/{_sparse_grid_ops.pyx => _sparse_grid_ops.py} (83%)

diff --git a/src/sisl/CMakeLists.txt b/src/sisl/CMakeLists.txt
index 55f75ca404..4fff724b44 100644
--- a/src/sisl/CMakeLists.txt
+++ b/src/sisl/CMakeLists.txt
@@ -1,4 +1,4 @@
-foreach(source _indices _math_small _sparse_grid_ops)
+foreach(source _indices _math_small)
   add_cython_library(
     SOURCE ${source}.pyx
     LIBRARY ${source}
@@ -8,6 +8,17 @@ foreach(source _indices _math_small _sparse_grid_ops)
     DESTINATION ${SKBUILD_PROJECT_NAME})
 endforeach()
 
+# Python files that can be compiled with cython (Pure Python syntax)
+foreach(source _sparse_grid_ops)
+  add_cython_library(
+    SOURCE ${source}.py
+    LIBRARY ${source}
+    OUTPUT ${source}_C
+    )
+  install(TARGETS ${source} LIBRARY
+    DESTINATION ${SKBUILD_PROJECT_NAME})
+endforeach()
+
 # Add other sub-directories
 add_subdirectory("_core")
 add_subdirectory("io")
diff --git a/src/sisl/_sparse_grid_ops.pyx b/src/sisl/_sparse_grid_ops.py
similarity index 83%
rename from src/sisl/_sparse_grid_ops.pyx
rename to src/sisl/_sparse_grid_ops.py
index 8608ea8cb3..ec37686589 100644
--- a/src/sisl/_sparse_grid_ops.pyx
+++ b/src/sisl/_sparse_grid_ops.py
@@ -1,17 +1,18 @@
 import cython
+import cython.cimports.numpy as cnp
 import numpy as np
 
-cimport numpy as np
-
-import math
-
 from sisl import SparseCSR
 
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-cpdef cython.int transpose_raveled_index(index: cython.int, grid_shape: np.int32_t[:], new_order: np.int32_t[:]):
+@cython.ccall
+@cython.exceptval(check=False)
+def transpose_raveled_index(
+    index: cython.int, grid_shape: cnp.int32_t[:], new_order: cnp.int32_t[:]
+) -> cython.int:
     """Transposes a raveled index, given the shape that raveled it and the new axis order.
 
     Currently it only works for 3D indices.
@@ -44,14 +45,19 @@
         new_grid_shape[iaxis] = grid_shape[new_order[iaxis]]
         new_unraveled[iaxis] = unraveled[new_order[iaxis]]
 
-    new_raveled = new_unraveled[2] + new_unraveled[1] * new_grid_shape[2] + new_unraveled[0] * new_grid_shape[1] * new_grid_shape[2]
+    new_raveled = (
+        new_unraveled[2]
+        + new_unraveled[1] * new_grid_shape[2]
+        + new_unraveled[0] * new_grid_shape[1] * new_grid_shape[2]
+    )
 
     return new_raveled
 
+
 # This function should be in sisl._sparse, but I don't know how to import it from there
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def dense_index(shape, ptr: np.int32_t[:], col: np.int32_t[:]):
+def dense_index(shape, ptr: cnp.int32_t[:], col: cnp.int32_t[:]) -> cnp.int32_t[:, :]:
     """Returns a dense array containing the value index for each nonzero (row, col) element.
 
     Zero elements are asigned an index of -1, so routines using this dense index should
@@ -69,15 +75,12 @@ def dense_index(shape, ptr: np.int32_t[:], col: np.int32_t[:]):
     np.ndarray:
         Numpy array of shape = matrix shape and data type np.int32.
     """
-    nrows: cython.int = ptr.shape[0] - 1
+    nrows = ptr.shape[0] - 1
     row: cython.int
-    row_start: cython.int
-    row_end: cython.int
     ival: cython.int
-    val_col: cython.int
 
     # Initialize the dense index
-    dense_index: np.int32_t[:, :] = np.full(shape, -1, dtype=np.int32)
+    dense_index: cnp.int32_t[:, :] = np.full(shape, -1, dtype=np.int32)
 
     for row in range(nrows):
         row_start = ptr[row]
@@ -89,13 +92,18 @@ def dense_index(shape, ptr: np.int32_t[:], col: np.int32_t[:]):
 
     return np.asarray(dense_index)
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_grid_sum(
-    cython.numeric[:] data, ptr: np.int32_t[:], col: np.int32_t[:],
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.numeric[:] out
+    data: cython.numeric[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.numeric[:],
 ):
     """Performs sum over the extra dimension while reducing other dimensions of the grid.
 
@@ -121,38 +129,43 @@ def reduce_grid_sum(
     out:
         The array where the output should be stored.
     """
-    nrows: cython.int = ptr.shape[0] - 1
+    nrows = ptr.shape[0] - 1
 
     i: cython.int
-    reduced_i: cython.int
     j: cython.int
-    jcol: cython.int
 
     need_transpose: cython.bint = new_axes.shape[0] > 1
 
-    cdef cython.numeric row_value
+    row_value: cython.numeric
 
     for i in range(nrows):
 
         if need_transpose:
-            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            reduced_i = (
+                transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            )
         else:
             reduced_i = i // reduce_factor
 
         row_value = 0
         for j in range(ptr[i], ptr[i + 1]):
-            jcol = col[j]
             row_value += data[j]
 
         out[reduced_i] = out[reduced_i] + row_value
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_grid_matvec_multiply(
-    cython.numeric[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.numeric[:] V,
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.numeric[:] out
+    data: cython.numeric[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    V: cython.numeric[:],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.numeric[:],
 ):
     """Performs a matrix-vector multiplication while reducing other dimensions of the grid.
 
@@ -190,13 +203,15 @@ def reduce_grid_matvec_multiply(
 
     need_transpose: cython.bint = new_axes.shape[0] > 1
 
-    cdef cython.numeric row_value
+    row_value: cython.numeric
 
     for i in range(nrows):
         reduced_i = i // reduce_factor
 
         if need_transpose:
-            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            reduced_i = (
+                transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            )
         else:
             reduced_i = i // reduce_factor
 
@@ -207,13 +222,19 @@ def reduce_grid_matvec_multiply(
 
         out[reduced_i] = out[reduced_i] + row_value
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_grid_matvecs_multiply(
-    cython.numeric[:] data, np.int32_t[:] ptr, np.int32_t[:] col, cython.numeric[:, :] V,
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.numeric[:, :] out
+    data: cython.numeric[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    V: cython.numeric[:, :],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.numeric[:, :],
 ):
     """Performs a matrix-matrix multiplication while reducing other dimensions of the grid.
 
@@ -242,22 +263,22 @@ def reduce_grid_matvecs_multiply(
         The array where the output should be stored.
     """
 
-    nrows: cython.int = ptr.shape[0] - 1
-    nvecs: cython.int = V.shape[1]
+    nrows = ptr.shape[0] - 1
+    nvecs = V.shape[1]
 
     i: cython.int
-    reduced_i: cython.int
     j: cython.int
-    jcol: cython.int
     ivec: cython.int
 
     need_transpose: cython.bint = new_axes.shape[0] > 1
 
-    cdef cython.numeric[:] row_value = np.zeros(nvecs, np.asarray(data).dtype)
+    row_value: cython.numeric[:] = np.zeros(nvecs, np.asarray(data).dtype)
 
     for i in range(nrows):
         if need_transpose:
-            reduced_i = transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            reduced_i = (
+                transpose_raveled_index(i, grid_shape, new_axes) // reduce_factor
+            )
         else:
             reduced_i = i // reduce_factor
 
@@ -271,14 +292,22 @@ def reduce_grid_matvecs_multiply(
         for ivec in range(nvecs):
             out[reduced_i, ivec] = out[reduced_i, ivec] + row_value[ivec]
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_sc_products(
-    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.floating[:, :] coeffs,
-    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.floating[:] out
+    data: cython.floating[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    coeffs: cython.floating[:, :],
+    uc_ncol: cython.int,
+    data_sc_off: cnp.int32_t[:, :],
+    coeffs_isc_off: cnp.int32_t[:, :, :],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.floating[:],
 ):
     """For each row, sums all possible products between column pairs.
 
@@ -319,13 +348,11 @@ def reduce_sc_products(
     out:
         The array where the output should be stored.
     """
-    nrows: cython.int = ptr.shape[0] - 1
+    nrows = ptr.shape[0] - 1
 
     # Indices to handle rows
     row: cython.int
     reduced_row: cython.int
-    row_start: cython.int
-    row_end: cython.int
 
     # Indices to handle pairs of columns (ij)
     i: cython.int
@@ -343,14 +370,13 @@ def reduce_sc_products(
     jpair_sc: cython.int
 
     # Temporal storage to build values
-    cdef cython.floating row_value
-    cdef cython.floating i_row_value
+    row_value: cython.floating
+    i_row_value: cython.floating
 
     # Index to loop over axes of the grid.
     iaxis: cython.int
 
     # Variables that will help managing orbital pairs that are not within the same cell.
-    coeffs_nsc: cython.int[3]
     sc_diff: cython.int[3]
     inv_sc_diff: cython.int[3]
     force_same_cell: cython.bint
@@ -367,8 +393,8 @@ def reduce_sc_products(
     # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
     # with nsc <= 3.
     force_same_cell = True
+    coeffs_nsc = coeffs_isc_off.shape
     for iaxis in range(3):
-        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
         if coeffs_nsc[iaxis] != 1:
             force_same_cell = False
 
@@ -426,9 +452,11 @@ def reduce_sc_products(
                 if not same_cell:
                     # Calculate the sc offset between both orbitals.
                     for iaxis in range(3):
-                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        sc_diff[iaxis] = (
+                            data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        )
                         # Calculate also the offset in the reverse direction
-                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+                        inv_sc_diff[iaxis] = -sc_diff[iaxis]
 
                         # If the sc_difference is negative, convert it to positive so that we can
                         # use it to index the isc_off array (we switched off the handling of negative
@@ -444,7 +472,9 @@ def reduce_sc_products(
                     sc_jcol = jpair_sc * uc_ncol + uc_jcol
 
                     # Do the same for the ji pair
-                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    ipair_sc = coeffs_isc_off[
+                        inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]
+                    ]
                     sc_icol = ipair_sc * uc_ncol + uc_icol
 
                 # Add the contribution of this column pair to the row total value. Note that we only
@@ -455,9 +485,13 @@ def reduce_sc_products(
                     if icol == jcol:
                         i_row_value += coeffs[uc_icol, uc_jcol] * data[j]
                     else:
-                        i_row_value += (coeffs[uc_icol, uc_jcol] + coeffs[uc_jcol, uc_icol]) * data[j]
+                        i_row_value += (
+                            coeffs[uc_icol, uc_jcol] + coeffs[uc_jcol, uc_icol]
+                        ) * data[j]
                 else:
-                    i_row_value += (coeffs[uc_icol, sc_jcol] + coeffs[uc_jcol, sc_icol]) * data[j]
+                    i_row_value += (
+                        coeffs[uc_icol, sc_jcol] + coeffs[uc_jcol, sc_icol]
+                    ) * data[j]
 
             # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.
             i_row_value *= data[i]
@@ -468,14 +502,22 @@ def reduce_sc_products(
         # Store the row value in the output
         out[reduced_row] = out[reduced_row] + row_value
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_sc_products_multicoeffs(
-    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], cython.floating[:, :, :] coeffs,
-    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.floating[:, :] out,
+    data: cython.floating[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    coeffs: cython.floating[:, :, :],
+    uc_ncol: cython.int,
+    data_sc_off: cnp.int32_t[:, :],
+    coeffs_isc_off: cnp.int32_t[:, :, :],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.floating[:, :],
 ):
     """For each row, sums all possible products between column pairs.
 
@@ -515,13 +557,11 @@ def reduce_sc_products_multicoeffs(
     out:
         The array where the output should be stored.
     """
-    nrows: cython.int = ptr.shape[0] - 1
+    nrows = ptr.shape[0] - 1
 
     # Indices to handle rows
     row: cython.int
     reduced_row: cython.int
-    row_start: cython.int
-    row_end: cython.int
 
     # Indices to handle pairs of columns (ij)
     i: cython.int
@@ -543,14 +583,13 @@ def reduce_sc_products_multicoeffs(
     icoeff: cython.int
 
     # Temporal storage to build values
-    cdef cython.floating[:] row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
-    cdef cython.floating[:] i_row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    row_value: cython.floating[:] = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    i_row_value: cython.floating[:] = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
 
     # Index to loop over axes of the grid.
     iaxis: cython.int
 
     # Variables that will help managing orbital pairs that are not within the same cell.
-    coeffs_nsc: cython.int[3]
     sc_diff: cython.int[3]
     inv_sc_diff: cython.int[3]
     force_same_cell: cython.bint
@@ -567,8 +606,8 @@ def reduce_sc_products_multicoeffs(
     # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
     # with nsc <= 3.
     force_same_cell = True
+    coeffs_nsc = coeffs_isc_off.shape
     for iaxis in range(3):
-        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
         if coeffs_nsc[iaxis] != 1:
             force_same_cell = False
 
@@ -626,9 +665,11 @@ def reduce_sc_products_multicoeffs(
                 if not same_cell:
                     # Calculate the sc offset between both orbitals.
                     for iaxis in range(3):
-                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        sc_diff[iaxis] = (
+                            data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        )
                         # Calculate also the offset in the reverse direction
-                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+                        inv_sc_diff[iaxis] = -sc_diff[iaxis]
 
                         # If the sc_difference is negative, convert it to positive so that we can
                         # use it to index the isc_off array (we switched off the handling of negative
@@ -644,7 +685,9 @@ def reduce_sc_products_multicoeffs(
                     sc_jcol = jpair_sc * uc_ncol + uc_jcol
 
                     # Do the same for the ji pair
-                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    ipair_sc = coeffs_isc_off[
+                        inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]
+                    ]
                     sc_icol = ipair_sc * uc_ncol + uc_icol
 
                 # Add the contribution of this column pair to the row total value. Note that we only
@@ -656,11 +699,19 @@ def reduce_sc_products_multicoeffs(
                 for icoeff in range(ncoeffs):
                     if same_cell:
                         if icol == jcol:
-                            i_row_value[icoeff] += coeffs[uc_icol, uc_jcol, icoeff] * data[j]
+                            i_row_value[icoeff] += (
+                                coeffs[uc_icol, uc_jcol, icoeff] * data[j]
+                            )
                         else:
-                            i_row_value[icoeff] += (coeffs[uc_icol, uc_jcol, icoeff] + coeffs[uc_jcol, uc_icol, icoeff]) * data[j]
+                            i_row_value[icoeff] += (
+                                coeffs[uc_icol, uc_jcol, icoeff]
+                                + coeffs[uc_jcol, uc_icol, icoeff]
+                            ) * data[j]
                     else:
-                        i_row_value[icoeff] += (coeffs[uc_icol, sc_jcol, icoeff] + coeffs[uc_jcol, sc_icol, icoeff]) * data[j]
+                        i_row_value[icoeff] += (
+                            coeffs[uc_icol, sc_jcol, icoeff]
+                            + coeffs[uc_jcol, sc_icol, icoeff]
+                        ) * data[j]
 
             for icoeff in range(ncoeffs):
                 # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.
@@ -673,14 +724,22 @@ def reduce_sc_products_multicoeffs(
             # Store the row value in the output
             out[reduced_row, icoeff] = out[reduced_row, icoeff] + row_value[icoeff]
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
 def reduce_sc_products_multicoeffs_sparse_denseindex(
-    cython.floating[:] data, ptr: np.int32_t[:], col: np.int32_t[:], coeffs_csr: SparseCSR,
-    uc_ncol: cython.int, data_sc_off: np.int32_t[:, :], coeffs_isc_off: np.int32_t[:, :, :],
-    reduce_factor: cython.int, grid_shape: np.int32_t[:], new_axes: np.int32_t[:],
-    cython.floating[:, :] out,
+    data: cython.floating[:],
+    ptr: cnp.int32_t[:],
+    col: cnp.int32_t[:],
+    coeffs_csr: SparseCSR,
+    uc_ncol: cython.int,
+    data_sc_off: cnp.int32_t[:, :],
+    coeffs_isc_off: cnp.int32_t[:, :, :],
+    reduce_factor: cython.int,
+    grid_shape: cnp.int32_t[:],
+    new_axes: cnp.int32_t[:],
+    out: cython.floating[:, :],
 ):
     """For each row, sums all possible products between column pairs.
 
@@ -720,13 +779,11 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
     out:
         The array where the output should be stored.
     """
-    nrows: cython.int = ptr.shape[0] - 1
+    nrows = ptr.shape[0] - 1
 
     # Indices to handle rows
     row: cython.int
     reduced_row: cython.int
-    row_start: cython.int
-    row_end: cython.int
 
     # Indices to handle pairs of columns (ij)
     i: cython.int
@@ -744,9 +801,11 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
     jpair_sc: cython.int
 
     # Extra variables to handle coefficients
-    cdef cython.floating[:, :] coeffs = coeffs_csr.data
+    coeffs: cython.floating[:, :] = coeffs_csr.data
     # Get the array containing all the dense indices.
-    dense_idx: np.int32_t[:, :] = dense_index(coeffs_csr.shape[:2], coeffs_csr.ptr, coeffs_csr.col)
+    dense_idx: cnp.int32_t[:, :] = dense_index(
+        coeffs_csr.shape[:2], coeffs_csr.ptr, coeffs_csr.col
+    )
     coeff_index: cython.int
     coeff_index2: cython.int
 
@@ -755,14 +814,13 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
     icoeff: cython.int
 
     # Temporal storage to build values
-    cdef cython.floating[:] row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
-    cdef cython.floating[:] i_row_value = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    row_value: cython.floating[:] = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
+    i_row_value: cython.floating[:] = np.zeros(ncoeffs, dtype=np.asarray(data).dtype)
 
     # Index to loop over axes of the grid.
     iaxis: cython.int
 
     # Variables that will help managing orbital pairs that are not within the same cell.
-    coeffs_nsc: cython.int[3]
     sc_diff: cython.int[3]
     inv_sc_diff: cython.int[3]
     force_same_cell: cython.bint
@@ -779,8 +837,8 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
     # (if any) have been stored in the unit cell. This is what SIESTA does for gamma point calculations
     # with nsc <= 3.
     force_same_cell = True
+    coeffs_nsc = coeffs_isc_off.shape
     for iaxis in range(3):
-        coeffs_nsc[iaxis] = coeffs_isc_off.shape[iaxis]
         if coeffs_nsc[iaxis] != 1:
             force_same_cell = False
 
@@ -838,9 +896,11 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
                 if not same_cell:
                     # Calculate the sc offset between both orbitals.
                     for iaxis in range(3):
-                        sc_diff[iaxis] = data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        sc_diff[iaxis] = (
+                            data_sc_off[jcol_sc, iaxis] - data_sc_off[icol_sc, iaxis]
+                        )
                         # Calculate also the offset in the reverse direction
-                        inv_sc_diff[iaxis] = - sc_diff[iaxis]
+                        inv_sc_diff[iaxis] = -sc_diff[iaxis]
 
                         # If the sc_difference is negative, convert it to positive so that we can
                         # use it to index the isc_off array (we switched off the handling of negative
@@ -856,7 +916,9 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
                     sc_jcol = jpair_sc * uc_ncol + uc_jcol
 
                     # Do the same for the ji pair
-                    ipair_sc = coeffs_isc_off[inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]]
+                    ipair_sc = coeffs_isc_off[
+                        inv_sc_diff[0], inv_sc_diff[1], inv_sc_diff[2]
+                    ]
                     sc_icol = ipair_sc * uc_ncol + uc_icol
 
                 # Get the index needed to find the coefficients that we want from the coeffs array.
@@ -888,9 +950,14 @@ def reduce_sc_products_multicoeffs_sparse_denseindex(
                         if icol == jcol:
                             i_row_value[icoeff] += coeffs[coeff_index, icoeff] * data[j]
                         else:
-                            i_row_value[icoeff] += (coeffs[coeff_index, icoeff] + coeffs[coeff_index2, icoeff]) * data[j]
+                            i_row_value[icoeff] += (
+                                coeffs[coeff_index, icoeff]
+                                + coeffs[coeff_index2, icoeff]
+                            ) * data[j]
                     else:
-                        i_row_value[icoeff] += (coeffs[coeff_index, icoeff] + coeffs[coeff_index2, icoeff]) * data[j]
+                        i_row_value[icoeff] += (
+                            coeffs[coeff_index, icoeff] + coeffs[coeff_index2, icoeff]
+                        ) * data[j]
 
             for icoeff in range(ncoeffs):
                 # Multiply all the contributions of ij pairs with this i by data[i], as explained inside the j loop.

From c0986ad1be941d45eac06586b74910d8f15b875c Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Wed, 19 Jun 2024 12:58:10 +0200
Subject: [PATCH 4/5] further improved speed

Signed-off-by: Nick Papior <nickpapior@gmail.com>
---
 src/sisl/_core/geometry.py | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/sisl/_core/geometry.py b/src/sisl/_core/geometry.py
index 9d76d3c831..df526c5074 100644
--- a/src/sisl/_core/geometry.py
+++ b/src/sisl/_core/geometry.py
@@ -3662,21 +3662,14 @@ def sphere_grid_index(grid, center, R):
 
             corners_i = grid.index(corners)
 
-            ix = np.arange(
-                max(corners_i[:, 0].min(), 0),
-                min(corners_i[:, 0].max() + 1, grid.shape[0]),
-            )
-            iy = np.arange(
-                max(corners_i[:, 1].min(), 0),
-                min(corners_i[:, 1].max() + 1, grid.shape[1]),
-            )
-            iz = np.arange(
-                max(corners_i[:, 2].min(), 0),
-                min(corners_i[:, 2].max() + 1, grid.shape[2]),
-            )
+            cmin = np.maximum(corners_i.min(axis=0), 0)
+            cmax = np.maximum(corners_i.max(axis=0) + 1, 0)
+
+            rx = slice(cmin[0], min(cmax[0], grid.shape[0]))
+            ry = slice(cmin[1], min(cmax[1], grid.shape[1]))
+            rz = slice(cmin[2], min(cmax[2], grid.shape[2]))
 
-            indices = np.meshgrid(ix, iy, iz)
-            indices = np.array(indices).T.reshape(-1, 3)
+            indices = np.mgrid[rx, ry, rz].reshape(3, -1).T
 
             return indices
 

From 718b7ab1827458fe5481416f3b221c4b83b9596e Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Wed, 19 Jun 2024 15:12:02 +0200
Subject: [PATCH 5/5] added doc note to _orbital_values

Signed-off-by: Nick Papior <nickpapior@gmail.com>
---
 src/sisl/_core/geometry.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sisl/_core/geometry.py b/src/sisl/_core/geometry.py
index df526c5074..0f3b3d3faa 100644
--- a/src/sisl/_core/geometry.py
+++ b/src/sisl/_core/geometry.py
@@ -3618,6 +3618,10 @@ def _orbital_values(self, grid_shape: Tuple[int, int, int]):
         ----------
         grid_shape:
            the grid shape (i.e. resolution) in which to calculate the orbital values.
+
+        Notes
+        -----
+        This method does not belong on this geometry. It will be removed eventually.
         """
         # We need to import these here to avoid circular imports.
         from sisl import Grid