From eba5f5ba2aa898a6e09f583545b10d0eb136194f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:26:07 +0200 Subject: [PATCH 1/9] Remove duplicate "the" from docstrings --- include/dlaf/eigensolver/tridiag_solver.h | 13 +++++-------- include/dlaf/eigensolver/tridiag_solver/merge.h | 4 ++-- include/dlaf/eigensolver/tridiag_solver/rot.h | 4 ++-- include/dlaf/sender/transform_mpi.h | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/include/dlaf/eigensolver/tridiag_solver.h b/include/dlaf/eigensolver/tridiag_solver.h index f5715756c2..71f458f86f 100644 --- a/include/dlaf/eigensolver/tridiag_solver.h +++ b/include/dlaf/eigensolver/tridiag_solver.h @@ -26,10 +26,9 @@ namespace eigensolver { /// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric /// tridiagonal matrix in the first column and second columns respectively. The last entry /// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the the symmetric tridiagonal -/// matrix -/// @param evecs [out] (n x n) local matrix holding the eigenvectors of the the symmetric tridiagonal -/// matrix on exit. +/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the symmetric tridiagonal matrix +/// @param evecs [out] (n x n) local matrix holding the eigenvectors of the symmetric tridiagonal matrix +/// on exit. /// /// @pre tridiag and @p evals and @p evecs are local matrices /// @pre tridiag has 2 columns and column block size of 2 @@ -75,10 +74,8 @@ void tridiagSolver(Matrix, Device::CPU>& tridiag, Matrix /// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric /// tridiagonal matrix in the first column and second columns respectively. The last entry /// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the the symmetric tridiagonal -/// matrix -/// @param evecs [out] (n x n) distributed matrix holding the eigenvectors of the the symmetric -/// tridiagonal +/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the symmetric tridiagonal matrix +/// @param evecs [out] (n x n) distributed matrix holding the eigenvectors of the symmetric tridiagonal /// matrix on exit. 
/// /// @pre tridiag and @p evals are local matrices and are the same on all ranks diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index f60df1bd83..823ee2610d 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -388,8 +388,8 @@ std::vector> applyDeflationToArrays(T rho, T tol, const SizeTy d2 = tmp; rots.push_back(GivensRotation{i1s, i2s, c, s}); - // Set the the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e if - // one comes from Q1 and the other from Q2 or vice-versa) + // Set the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e if one + // comes from Q1 and the other from Q2 or vice-versa) if ((c1 == ColType::UpperHalf && c2 == ColType::LowerHalf) || (c1 == ColType::LowerHalf && c2 == ColType::UpperHalf)) { c1 = ColType::Dense; diff --git a/include/dlaf/eigensolver/tridiag_solver/rot.h b/include/dlaf/eigensolver/tridiag_solver/rot.h index 6d4860f410..daa56b9186 100644 --- a/include/dlaf/eigensolver/tridiag_solver/rot.h +++ b/include/dlaf/eigensolver/tridiag_solver/rot.h @@ -165,12 +165,12 @@ void applyGivensRotationsToMatrixColumns(const SizeType i_begin, const SizeType common::internal::SingleThreadedBlasScope single; for (const GivensRotation& rot : rots) { - // Get the index of the tile that has column `rot.i` and the the index of the column within the tile. + // Get the index of the tile that has column `rot.i` and the index of the column within the tile. const SizeType i_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, rot.i)); const SizeType i_el = distr.tileElementFromGlobalElement(rot.i); T* x = tiles[to_sizet(i_tile)].ptr(TileElementIndex(0, i_el)); - // Get the index of the tile that has column `rot.j` and the the index of the column within the tile. + // Get the index of the tile that has column `rot.j` and the index of the column within the tile. const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, rot.j)); const SizeType j_el = distr.tileElementFromGlobalElement(rot.j); T* y = tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_el)); diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 50f23cb0b7..c953b20c2f 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -34,7 +34,7 @@ void consumeCommunicatorWrapper(T&) {} /// /// Wrapper type around calls to MPI functions. Provides a call operator that /// creates an MPI request and passes it as the last argument to the provided -/// callable. The wrapper then waits for the the request to complete with +/// callable. The wrapper then waits for the request to complete with /// yield_while. /// /// This could in theory be a lambda inside transformMPI. 
However, clang at From 22656c8ba85a98cd2a63daadb54d67fc48a0abda Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:29:46 +0200 Subject: [PATCH 2/9] Make local backtransformation band to tridiagonal docstring actual doxygen docstring --- include/dlaf/eigensolver/bt_band_to_tridiag.h | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag.h b/include/dlaf/eigensolver/bt_band_to_tridiag.h index ea8d69d8be..05715bcee8 100644 --- a/include/dlaf/eigensolver/bt_band_to_tridiag.h +++ b/include/dlaf/eigensolver/bt_band_to_tridiag.h @@ -19,40 +19,40 @@ namespace dlaf::eigensolver { -// Eigenvalue back-transformation implementation on local memory, which applies the inverse of the -// transformation used to get a tridiagonal matrix from a band one. -// -// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described -// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about -// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. -// -// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh -// where they are stored using following compact representation -// -// compact extended -// AT BT CT DT 1 0 0 0 -// A1 B1 C1 D1 A1 1 0 0 -// A2 B2 C2 D2 A2 B1 1 0 -// A3 B3 C3 D3 A3 B2 C1 1 -// 0 B3 C2 D1 -// 0 0 C3 D2 -// 0 0 0 D3 -// -// where A, B, C and D refers to distinct reflectors, with their components numbered and their taus -// identified by the letter T. -// -// @param mat_hh matrix containing reflectors together with taus (compact form see representation above) -// @param mat_e matrix to which the inverse transformation is applied to -// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) -// @pre mat_hh has a square size -// @pre mat_hh has a square block size -// @pre mat_e and mat_hh share the same number of rows -// @pre mat_e block size and mat_hh block size share the same number of rows -// @pre band_size is a divisor of mat_hh.blockSize().cols() -// @pre mat_e is not distributed -// @pre mat_hh is not distributed -// @pre mat_e has equal tile and block sizes -// @pre mat_hh has equal tile and block sizes +/// Eigenvalue back-transformation implementation on local memory, which applies the inverse of the +/// transformation used to get a tridiagonal matrix from a band one. +/// +/// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described +/// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about +/// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. +/// +/// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh +/// where they are stored using following compact representation +/// +/// compact extended +/// AT BT CT DT 1 0 0 0 +/// A1 B1 C1 D1 A1 1 0 0 +/// A2 B2 C2 D2 A2 B1 1 0 +/// A3 B3 C3 D3 A3 B2 C1 1 +/// 0 B3 C2 D1 +/// 0 0 C3 D2 +/// 0 0 0 D3 +/// +/// where A, B, C and D refers to distinct reflectors, with their components numbered and their taus +/// identified by the letter T. 
+/// +/// @param mat_hh matrix containing reflectors together with taus (compact form see representation above) +/// @param mat_e matrix to which the inverse transformation is applied to +/// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) +/// @pre mat_hh has a square size +/// @pre mat_hh has a square block size +/// @pre mat_e and mat_hh share the same number of rows +/// @pre mat_e block size and mat_hh block size share the same number of rows +/// @pre band_size is a divisor of mat_hh.blockSize().cols() +/// @pre mat_e is not distributed +/// @pre mat_hh is not distributed +/// @pre mat_e has equal tile and block sizes +/// @pre mat_hh has equal tile and block sizes template void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix& mat_e, matrix::Matrix& mat_hh) { From 6745155c331c124415367f8c6be39f14c4dbc75f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:30:20 +0200 Subject: [PATCH 3/9] Add docstring for distributed backtransformation band to tridiagonal --- include/dlaf/eigensolver/bt_band_to_tridiag.h | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag.h b/include/dlaf/eigensolver/bt_band_to_tridiag.h index 05715bcee8..985fc00b80 100644 --- a/include/dlaf/eigensolver/bt_band_to_tridiag.h +++ b/include/dlaf/eigensolver/bt_band_to_tridiag.h @@ -74,6 +74,40 @@ void backTransformationBandToTridiag(const SizeType band_size, matrix::Matrix::call(band_size, mat_e, mat_hh); } +/// Eigenvalue back-transformation implementation, which applies the inverse of the transformation used +/// to get a tridiagonal matrix from a band one. +/// +/// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described +/// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about +/// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. +/// +/// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh +/// where they are stored using following compact representation +/// +/// compact extended +/// AT BT CT DT 1 0 0 0 +/// A1 B1 C1 D1 A1 1 0 0 +/// A2 B2 C2 D2 A2 B1 1 0 +/// A3 B3 C3 D3 A3 B2 C1 1 +/// 0 B3 C2 D1 +/// 0 0 C3 D2 +/// 0 0 0 D3 +/// +/// where A, B, C and D refers to distinct reflectors, with their components numbered and their taus +/// identified by the letter T. 
+/// +/// @param mat_hh matrix containing reflectors together with taus (compact form see representation above) +/// @param mat_e matrix to which the inverse transformation is applied to +/// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) +/// @pre mat_hh has a square size +/// @pre mat_hh has a square block size +/// @pre mat_e and mat_hh share the same number of rows +/// @pre mat_e block size and mat_hh block size share the same number of rows +/// @pre band_size is a divisor of mat_hh.blockSize().cols() +/// @pre mat_e is distributed according to grid +/// @pre mat_hh is distributed according to grid +/// @pre mat_e has equal tile and block sizes +/// @pre mat_hh has equal tile and block sizes template void backTransformationBandToTridiag(comm::CommunicatorGrid grid, const SizeType band_size, matrix::Matrix& mat_e, From bd7585adfa518728c56a812b22547b9f3d1517be Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:38:29 +0200 Subject: [PATCH 4/9] Add preconditions for eigensolver to docstring --- include/dlaf/eigensolver/eigensolver.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h index d21bdf6b78..b17138848f 100644 --- a/include/dlaf/eigensolver/eigensolver.h +++ b/include/dlaf/eigensolver/eigensolver.h @@ -35,6 +35,15 @@ namespace dlaf::eigensolver { /// @param mat contains the Hermitian matrix A /// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues /// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// @pre mat is not distributed +/// @pre mat has a square size +/// @pre mat has a square blocksize +/// @pre mat has equal tile and block sizes +/// @pre eigenvalues is not distributed +/// @pre eigenvalues has equal tile and block sizes +/// @pre eigenvectors is not distributed +/// @pre eigenvectors has a square blocksize +/// @pre eigenvectors has equal tile and block sizes template void eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -69,6 +78,10 @@ void eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eig /// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced /// @param mat contains the Hermitian matrix A +/// @pre mat is not distributed +/// @pre mat has a square size +/// @pre mat has a square blocksize +/// @pre mat has equal tile and block sizes template EigensolverResult eigensolver(blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); @@ -95,6 +108,15 @@ EigensolverResult eigensolver(blas::Uplo uplo, Matrix& mat) { /// @param mat contains the Hermitian matrix A /// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues /// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// @pre mat is distributed according to grid +/// @pre mat has a square size +/// @pre mat has a square blocksize +/// @pre mat has equal tile and block sizes +/// @pre eigenvalues is distributed according to grid ?? 
TODO +/// @pre eigenvalues has equal tile and block sizes +/// @pre eigenvectors is distributed according to grid +/// @pre eigenvectors has a square blocksize +/// @pre eigenvectors has equal tile and block sizes template void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -130,6 +152,10 @@ void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat /// @param grid is the communicator grid on which the matrix @p mat has been distributed, /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced /// @param mat contains the Hermitian matrix A +/// @pre mat is distributed according to grid +/// @pre mat has a square size +/// @pre mat has a square blocksize +/// @pre mat has equal tile and block sizes template EigensolverResult eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); From 4b7f9b4ebabed90d026c7e66892a5f2a73b8fe4a Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:48:41 +0200 Subject: [PATCH 5/9] Add preconditions for generalized eigensolver to docstring --- include/dlaf/eigensolver/gen_eigensolver.h | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/include/dlaf/eigensolver/gen_eigensolver.h b/include/dlaf/eigensolver/gen_eigensolver.h index ee973b10f5..6a09ab6155 100644 --- a/include/dlaf/eigensolver/gen_eigensolver.h +++ b/include/dlaf/eigensolver/gen_eigensolver.h @@ -38,6 +38,19 @@ namespace dlaf::eigensolver { /// @param mat_b contains the Hermitian positive definite matrix B /// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues /// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// @pre mat_a is not distributed +/// @pre mat_a has a square size +/// @pre mat_a has a square blocksize +/// @pre mat_a has equal tile and block sizes +/// @pre mat_b is not distributed +/// @pre mat_b has a square size +/// @pre mat_b has a square blocksize +/// @pre mat_b has equal tile and block sizes +/// @pre eigenvalues is not distributed +/// @pre eigenvalues has equal tile and block sizes +/// @pre eigenvectors is not distributed +/// @pre eigenvectors has a square blocksize +/// @pre eigenvectors has equal tile and block sizes template void genEigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -81,6 +94,14 @@ void genEigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced /// @param mat_a contains the Hermitian matrix A /// @param mat_b contains the Hermitian positive definite matrix B +/// @pre mat_a is not distributed +/// @pre mat_a has a square size +/// @pre mat_a has a square blocksize +/// @pre mat_a has equal tile and block sizes +/// @pre mat_b is not distributed +/// @pre mat_b has a square size +/// @pre mat_b has a square blocksize +/// @pre mat_b has equal tile and block sizes template EigensolverResult genEigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { DLAF_ASSERT(matrix::local_matrix(mat_a), mat_a); @@ -122,6 +143,19 @@ EigensolverResult genEigensolver(blas::Uplo uplo, Matrix& mat_a, Mat /// @param mat_b contains the Hermitian positive definite matrix B /// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues /// @param eigenvectors is a N x N matrix which on output contains the 
eigenvectors
+/// @pre mat_a is distributed according to grid
+/// @pre mat_a has a square size
+/// @pre mat_a has a square blocksize
+/// @pre mat_a has equal tile and block sizes
+/// @pre mat_b is distributed according to grid
+/// @pre mat_b has a square size
+/// @pre mat_b has a square blocksize
+/// @pre mat_b has equal tile and block sizes
+/// @pre eigenvalues is distributed according to grid ?? TODO
+/// @pre eigenvalues has equal tile and block sizes
+/// @pre eigenvectors is distributed according to grid
+/// @pre eigenvectors has a square blocksize
+/// @pre eigenvectors has equal tile and block sizes
template
void genEigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues,
@@ -167,6 +201,14 @@ void genEigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix&
/// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced
/// @param mat_a contains the Hermitian matrix A
/// @param mat_b contains the Hermitian positive definite matrix B
+/// @pre mat_a is distributed according to grid
+/// @pre mat_a has a square size
+/// @pre mat_a has a square blocksize
+/// @pre mat_a has equal tile and block sizes
+/// @pre mat_b is distributed according to grid
+/// @pre mat_b has a square size
+/// @pre mat_b has a square blocksize
+/// @pre mat_b has equal tile and block sizes
template
EigensolverResult genEigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) {

From 448870a1d7ad6a92fe6eaf9b791ff7fca6ca8606 Mon Sep 17 00:00:00 2001
From: Mikael Simberg
Date: Thu, 20 Jul 2023 13:48:57 +0200
Subject: [PATCH 6/9] Add preconditions for permutations to docstring

---
 include/dlaf/permutations/general.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/dlaf/permutations/general.h b/include/dlaf/permutations/general.h
index b898a9e03e..cfa6cb549e 100644
--- a/include/dlaf/permutations/general.h
+++ b/include/dlaf/permutations/general.h
@@ -34,7 +34,16 @@ namespace dlaf::permutations {
/// the range [i_begin,i_end) are accessed in read-only mode.
/// @param mat_out is the output matrix. Only tiles whose both row and col tile coords are in
/// the range [i_begin,i_end) are accessed in write-only mode.
-///
+/// @pre perms is not distributed
+/// @pre perms has equal tile and block sizes
+/// @pre mat_in is not distributed
+/// @pre mat_in has equal tile and block sizes
+/// @pre mat_in has a square size
+/// @pre mat_in has a square blocksize
+/// @pre mat_out is not distributed
+/// @pre mat_out has equal tile and block sizes
+/// @pre mat_out has a square size
+/// @pre mat_out has a square blocksize
template
void permute(SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out) {
@@ -83,6 +92,16 @@ void permute(SizeType i_begin, SizeType i_end, Matrix& perms,
/// the range [i_begin,i_end) are accessed in readwrite-mode.
/// @param mat_out is the distributed output matrix. Only tiles whose both global row and col tile coords are in
/// the range [i_begin,i_end) are accessed in readwrite-mode.
+/// @pre perms is not distributed +/// @pre perms has equal tile and block sizes +/// @pre mat_in is distributed according to grid +/// @pre mat_in has equal tile and block sizes +/// @pre mat_in has a square size +/// @pre mat_in has a square blocksize +/// @pre mat_out is distributed according to grid +/// @pre mat_out has equal tile and block sizes +/// @pre mat_out has a square size +/// @pre mat_out has a square blocksize /// /// Note: The Pipeline<> API allows to use permute() within other algorithms without having to clone communicators /// internally. @@ -122,7 +141,6 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline& /// /// This overload clones the row communicator (if Coord::Col) or column communicator (if Coord::Row) of /// @p grid internally. -/// template void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out) { From 5549f94d5a1228aa077f392e14f3d79c9299934d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 20 Jul 2023 13:53:04 +0200 Subject: [PATCH 7/9] Fix eigenvalues docstring precondition --- include/dlaf/eigensolver/eigensolver.h | 2 +- include/dlaf/eigensolver/gen_eigensolver.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h index b17138848f..cd0e3a65db 100644 --- a/include/dlaf/eigensolver/eigensolver.h +++ b/include/dlaf/eigensolver/eigensolver.h @@ -112,7 +112,7 @@ EigensolverResult eigensolver(blas::Uplo uplo, Matrix& mat) { /// @pre mat has a square size /// @pre mat has a square blocksize /// @pre mat has equal tile and block sizes -/// @pre eigenvalues is distributed according to grid ?? TODO +/// @pre eigenvalues is not distributed /// @pre eigenvalues has equal tile and block sizes /// @pre eigenvectors is distributed according to grid /// @pre eigenvectors has a square blocksize diff --git a/include/dlaf/eigensolver/gen_eigensolver.h b/include/dlaf/eigensolver/gen_eigensolver.h index 6a09ab6155..f3dd726044 100644 --- a/include/dlaf/eigensolver/gen_eigensolver.h +++ b/include/dlaf/eigensolver/gen_eigensolver.h @@ -151,7 +151,7 @@ EigensolverResult genEigensolver(blas::Uplo uplo, Matrix& mat_a, Mat /// @pre mat_b has a square size /// @pre mat_b has a square blocksize /// @pre mat_b has equal tile and block sizes -/// @pre eigenvalues is distributed according to grid ?? TODO +/// @pre eigenvalues is not distributed /// @pre eigenvalues has equal tile and block sizes /// @pre eigenvectors is distributed according to grid /// @pre eigenvectors has a square blocksize From 1a6cedeacb1b61bace2ddddeb8e8d160b3cc1b7d Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Wed, 30 Aug 2023 11:02:03 +0200 Subject: [PATCH 8/9] Update eigenvalues documentation to new style --- include/dlaf/eigensolver/eigensolver.h | 87 ++++++++++++++++---------- 1 file changed, 53 insertions(+), 34 deletions(-) diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h index cd0e3a65db..28d1c76245 100644 --- a/include/dlaf/eigensolver/eigensolver.h +++ b/include/dlaf/eigensolver/eigensolver.h @@ -32,18 +32,24 @@ namespace dlaf::eigensolver { /// Implementation on local memory. 
/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// +/// @param[in,out] mat contains the Hermitian matrix A /// @pre mat is not distributed -/// @pre mat has a square size -/// @pre mat has a square blocksize -/// @pre mat has equal tile and block sizes +/// @pre mat has size (N x N) +/// @pre mat has blocksize (NB x NB) +/// @pre mat has tilesize (NB x NB) +/// +/// @param[out] eigenvalues contains the eigenvalues /// @pre eigenvalues is not distributed -/// @pre eigenvalues has equal tile and block sizes +/// @pre eigenvalues has size (N x 1) +/// @pre eigenvalues has blocksize (NB x 1) +/// @pre eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors /// @pre eigenvectors is not distributed -/// @pre eigenvectors has a square blocksize -/// @pre eigenvectors has equal tile and block sizes +/// @pre eigenvectors has size (N x N) +/// @pre eigenvectors has blocksize (NB x NB) +/// @pre eigenvectors has tilesize (NB x NB) template void eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -75,13 +81,15 @@ void eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eig /// /// Implementation on local memory. /// -/// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix +/// @return ReturnEigensolverType with eigenvalues and eigenvectors as a Matrix +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A +/// +/// @param[in,out] mat contains the Hermitian matrix A /// @pre mat is not distributed -/// @pre mat has a square size -/// @pre mat has a square blocksize -/// @pre mat has equal tile and block sizes +/// @pre mat has size (N x N) +/// @pre mat has blocksize (NB x NB) +/// @pre mat has tilesize (NB x NB) template EigensolverResult eigensolver(blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); @@ -103,20 +111,28 @@ EigensolverResult eigensolver(blas::Uplo uplo, Matrix& mat) { /// /// Implementation on distributed memory. 
/// -/// @param grid is the communicator grid on which the matrix @p mat has been distributed, +/// @param grid is the communicator grid on which the matrix @p mat has been distributed +/// @pre grid is an (NG x MG) grid +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors -/// @pre mat is distributed according to grid -/// @pre mat has a square size -/// @pre mat has a square blocksize -/// @pre mat has equal tile and block sizes -/// @pre eigenvalues is not distributed -/// @pre eigenvalues has equal tile and block sizes -/// @pre eigenvectors is distributed according to grid -/// @pre eigenvectors has a square blocksize -/// @pre eigenvectors has equal tile and block sizes +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre mat is distributed according to @p grid +/// @pre mat has size (N x N) +/// @pre mat has blocksize (NB x NB) +/// @pre mat has tilesize (NB x NB) +/// +/// @param[out] eigenvalues contains the eigenvalues +/// @pre eigenvalues is stored on all ranks +/// @pre eigenvalues has size (N x 1) +/// @pre eigenvalues has blocksize (NB x 1) +/// @pre eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre eigenvectors is distributed according to @p grid +/// @pre eigenvectors has size (N x N) +/// @pre eigenvectors has blocksize (NB x NB) +/// @pre eigenvectors has tilesize (NB x NB) template void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -148,14 +164,17 @@ void eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat /// /// Implementation on distributed memory. 
/// -/// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix -/// @param grid is the communicator grid on which the matrix @p mat has been distributed, +/// @return struct ReturnEigensolverType with eigenvalues and eigenvectors as a Matrix +/// +/// @param grid is the communicator grid on which the matrix @p mat has been distributed +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A -/// @pre mat is distributed according to grid -/// @pre mat has a square size -/// @pre mat has a square blocksize -/// @pre mat has equal tile and block sizes +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre mat is distributed according to @p grid +/// @pre mat has size (N x N) +/// @pre mat has blocksize (NB x NB) +/// @pre mat has tilesize (NB x NB) template EigensolverResult eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); From aaf70597393084a3151fff9165a5dd203e7161e7 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Thu, 7 Sep 2023 12:17:04 +0200 Subject: [PATCH 9/9] Update docstrings --- include/dlaf/auxiliary/norm.h | 6 +- include/dlaf/eigensolver/band_to_tridiag.h | 22 ++-- include/dlaf/eigensolver/bt_band_to_tridiag.h | 40 ++++--- .../dlaf/eigensolver/bt_reduction_to_band.h | 26 +++-- include/dlaf/eigensolver/eigensolver.h | 65 ++++++----- include/dlaf/eigensolver/gen_eigensolver.h | 108 ++++++++++-------- include/dlaf/eigensolver/gen_to_std.h | 32 ++++-- include/dlaf/eigensolver/reduction_to_band.h | 26 +++-- include/dlaf/eigensolver/tridiag_solver.h | 64 ++++++----- include/dlaf/factorization/cholesky.h | 17 +-- include/dlaf/multiplication/general.h | 43 +++++-- include/dlaf/multiplication/hermitian.h | 40 +++++-- include/dlaf/multiplication/triangular.h | 30 +++-- include/dlaf/permutations/general.h | 46 ++++---- include/dlaf/solver/triangular.h | 30 +++-- 15 files changed, 360 insertions(+), 235 deletions(-) diff --git a/include/dlaf/auxiliary/norm.h b/include/dlaf/auxiliary/norm.h index c206e180c0..31adeee37b 100644 --- a/include/dlaf/auxiliary/norm.h +++ b/include/dlaf/auxiliary/norm.h @@ -27,9 +27,9 @@ namespace dlaf::auxiliary { /// /// @note @p uplo == blas::uplo::Upper not yet implemented /// -/// @pre `A.blockSize().rows() == A.blockSize().cols()`, -/// @pre @p A is distributed according to @p grid, -/// @pre @p A has equal tile and block sizes, +/// @pre @p A is distributed according to @p grid +/// @pre @p A has blocksize (NB x NB) +/// @pre @p A has tilesize (NB x NB) /// @return the max norm of the Matrix @p A or 0 if `A.size().isEmpty()` template dlaf::BaseType max_norm(comm::CommunicatorGrid grid, comm::Index2D rank, blas::Uplo uplo, diff --git a/include/dlaf/eigensolver/band_to_tridiag.h b/include/dlaf/eigensolver/band_to_tridiag.h index 252be9d133..f5cb09acfb 100644 --- a/include/dlaf/eigensolver/band_to_tridiag.h +++ b/include/dlaf/eigensolver/band_to_tridiag.h @@ -65,11 +65,12 @@ namespace dlaf::eigensolver::internal { /// Implementation on local memory. /// /// @param mat_a contains the Hermitian band matrix A (if A is real, the matrix is symmetric). -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre band_size is a divisor of mat_a.blockSize().cols(), and band_size >= 2 -/// @pre mat_a is not distributed, -/// @pre mat_a has equal tile and block sizes. 
+/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// +/// @pre @p band_size is a divisor of `mat_a.blockSize().cols()`, and @p band_size >= 2 template TridiagResult band_to_tridiagonal(blas::Uplo uplo, SizeType band_size, Matrix& mat_a) { @@ -138,11 +139,12 @@ TridiagResult band_to_tridiagonal(blas::Uplo uplo, SizeType band /// Implementation on distributed memory. /// /// @param mat_a contains the Hermitian band matrix A (if A is real, the matrix is symmetric). -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre band_size is a divisor of mat_a.blockSize().cols() and band_size >= 2, -/// @pre mat_a is distributed according to grid, -/// @pre mat_a has equal tile and block sizes. +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// +/// @pre @p band_size is a divisor of `mat_a.blockSize().cols()`, and @p band_size >= 2 template TridiagResult band_to_tridiagonal(comm::CommunicatorGrid grid, blas::Uplo uplo, SizeType band_size, Matrix& mat_a) { diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag.h b/include/dlaf/eigensolver/bt_band_to_tridiag.h index ebb3b903b9..d0ae27f69e 100644 --- a/include/dlaf/eigensolver/bt_band_to_tridiag.h +++ b/include/dlaf/eigensolver/bt_band_to_tridiag.h @@ -42,17 +42,19 @@ namespace dlaf::eigensolver::internal { /// identified by the letter T. /// /// @param mat_hh matrix containing reflectors together with taus (compact form see representation above) +/// @pre @p mat_hh is not distributed +/// @pre @p mat_hh has size (N x N) +/// @pre @p mat_hh has blocksize (NB x NB) +/// @pre @p mat_hh has tilesize (NB x NB) +/// /// @param mat_e matrix to which the inverse transformation is applied to +/// @pre @p mat_e is not distributed +/// @pre @p mat_e has size (N x M) +/// @pre @p mat_e has blocksize (NB x MB) +/// @pre @p mat_e has tilesize (NB x MB) +/// /// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) -/// @pre mat_hh has a square size -/// @pre mat_hh has a square block size -/// @pre mat_e and mat_hh share the same number of rows -/// @pre mat_e block size and mat_hh block size share the same number of rows -/// @pre band_size is a divisor of mat_hh.blockSize().cols() -/// @pre mat_e is not distributed -/// @pre mat_hh is not distributed -/// @pre mat_e has equal tile and block sizes -/// @pre mat_hh has equal tile and block sizes +/// @pre @p band_size is a divisor of `mat_hh.blockSize().cols()` template void bt_band_to_tridiagonal(const SizeType band_size, matrix::Matrix& mat_e, matrix::Matrix& mat_hh) { @@ -97,17 +99,19 @@ void bt_band_to_tridiagonal(const SizeType band_size, matrix::Matrix& mat_ /// identified by the letter T. 
///
/// @param mat_hh matrix containing reflectors together with taus (compact form see representation above)
+/// @pre @p mat_hh is distributed according to @p grid
+/// @pre @p mat_hh has size (N x N)
+/// @pre @p mat_hh has blocksize (NB x NB)
+/// @pre @p mat_hh has tilesize (NB x NB)
+///
/// @param mat_e matrix to which the inverse transformation is applied to
+/// @pre @p mat_e is distributed according to @p grid
+/// @pre @p mat_e has size (N x M)
+/// @pre @p mat_e has blocksize (NB x MB)
+/// @pre @p mat_e has tilesize (NB x MB)
+///
/// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit)
-/// @pre mat_hh has a square size
-/// @pre mat_hh has a square block size
-/// @pre mat_e and mat_hh share the same number of rows
-/// @pre mat_e block size and mat_hh block size share the same number of rows
-/// @pre band_size is a divisor of mat_hh.blockSize().cols()
-/// @pre mat_e is distributed according to grid
-/// @pre mat_hh is distributed according to grid
-/// @pre mat_e has equal tile and block sizes
-/// @pre mat_hh has equal tile and block sizes
+/// @pre @p band_size is a divisor of `mat_hh.blockSize().cols()`
template
void bt_band_to_tridiagonal(comm::CommunicatorGrid grid, const SizeType band_size, matrix::Matrix& mat_e, matrix::Matrix& mat_hh) {

diff --git a/include/dlaf/eigensolver/bt_reduction_to_band.h b/include/dlaf/eigensolver/bt_reduction_to_band.h
index a8cd0c01bb..fd386cf7af 100644
--- a/include/dlaf/eigensolver/bt_reduction_to_band.h
+++ b/include/dlaf/eigensolver/bt_reduction_to_band.h
@@ -28,14 +28,18 @@ namespace dlaf::eigensolver::internal {
/// defined by the j-th element of tau and the HH reflector stored in the j-th column of the matrix V.
///
/// @param mat_c contains the (m x n) matrix C (blocksize (mb x nb)), while on exit it contains Q C.
+/// @pre @p mat_c is not distributed
+/// @pre @p mat_c has blocksize (NB x NB)
+/// @pre @p mat_c has tilesize (NB x NB)
+///
/// @param mat_v is (m x m) matrix with blocksize (mb x mb), which contains the Householder reflectors.
/// The j-th HH reflector is v_j = (1, V(mb + j : n, j)).
+/// @pre @p mat_v is not distributed
+/// @pre @p mat_v has blocksize (NB x NB)
+/// @pre @p mat_v has tilesize (NB x NB)
+///
/// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling
/// factor for the j-th HH transformation.
-/// @pre mat_c is not distributed,
-/// @pre mat_v is not distributed,
-/// @pre mat_c has equal tile and block sizes,
-/// @pre mat_v has equal tile and block sizes.
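As an illustration of how the back-transformation preconditions above compose in practice, a minimal
local-memory sketch (the sizes, the fill-in, and the explicit Backend template argument are
illustrative assumptions, not taken from this patch; the signature follows the hunks above):

    #include <dlaf/eigensolver/bt_band_to_tridiag.h>
    #include <dlaf/matrix/matrix.h>

    using dlaf::Device;
    using dlaf::SizeType;
    using MatrixD = dlaf::matrix::Matrix<double, Device::CPU>;

    const SizeType n = 1024;  // N: problem size
    const SizeType nb = 256;  // NB: square blocksize equal to the tilesize, as the @pre lines require

    MatrixD mat_e({n, n}, {nb, nb});   // E: the matrix the inverse transformation is applied to
    MatrixD mat_hh({n, n}, {nb, nb});  // compact reflectors and taus, laid out as in the diagram above
    // ... fill mat_hh (e.g. from a band-to-tridiagonal reduction) and mat_e ...

    // band_size must divide mat_hh.blockSize().cols(); here it equals the blocksize.
    dlaf::eigensolver::internal::bt_band_to_tridiagonal<dlaf::Backend::MC>(nb, mat_e, mat_hh);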
template void bt_reduction_to_band(const SizeType b, Matrix& mat_c, Matrix& mat_v, Matrix& mat_taus) { @@ -64,15 +68,19 @@ void bt_reduction_to_band(const SizeType b, Matrix& mat_c, Matrix void bt_reduction_to_band(comm::CommunicatorGrid grid, const SizeType b, Matrix& mat_c, Matrix& mat_v, Matrix& mat_taus) { diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h index 7c4dfecb42..ee667e79cf 100644 --- a/include/dlaf/eigensolver/eigensolver.h +++ b/include/dlaf/eigensolver/eigensolver.h @@ -34,22 +34,22 @@ namespace dlaf { /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced /// /// @param[in,out] mat contains the Hermitian matrix A -/// @pre mat is not distributed -/// @pre mat has size (N x N) -/// @pre mat has blocksize (NB x NB) -/// @pre mat has tilesize (NB x NB) +/// @pre @p mat is not distributed +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) /// /// @param[out] eigenvalues contains the eigenvalues -/// @pre eigenvalues is not distributed -/// @pre eigenvalues has size (N x 1) -/// @pre eigenvalues has blocksize (NB x 1) -/// @pre eigenvalues has tilesize (NB x 1) +/// @pre @p eigenvalues is not distributed +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) /// /// @param[out] eigenvectors contains the eigenvectors -/// @pre eigenvectors is not distributed -/// @pre eigenvectors has size (N x N) -/// @pre eigenvectors has blocksize (NB x NB) -/// @pre eigenvectors has tilesize (NB x NB) +/// @pre @p eigenvectors is not distributed +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -86,10 +86,10 @@ void hermitian_eigensolver(blas::Uplo uplo, Matrix& mat, Matrix EigensolverResult hermitian_eigensolver(blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); @@ -112,27 +112,26 @@ EigensolverResult hermitian_eigensolver(blas::Uplo uplo, Matrix& mat /// Implementation on distributed memory. 
/// /// @param grid is the communicator grid on which the matrix @p mat has been distributed -/// @pre grid is an (NG x MG) grid /// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced /// /// @param[in,out] mat contains the Hermitian matrix A -/// @pre mat is distributed according to @p grid -/// @pre mat has size (N x N) -/// @pre mat has blocksize (NB x NB) -/// @pre mat has tilesize (NB x NB) +/// @pre @p mat is distributed according to @p grid +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) /// /// @param[out] eigenvalues contains the eigenvalues -/// @pre eigenvalues is stored on all ranks -/// @pre eigenvalues has size (N x 1) -/// @pre eigenvalues has blocksize (NB x 1) -/// @pre eigenvalues has tilesize (NB x 1) +/// @pre @p eigenvalues is stored on all ranks +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) /// /// @param[out] eigenvectors contains the eigenvectors -/// @pre eigenvectors is distributed according to @p grid -/// @pre eigenvectors has size (N x N) -/// @pre eigenvectors has blocksize (NB x NB) -/// @pre eigenvectors has tilesize (NB x NB) +/// @pre @p eigenvectors is distributed according to @p grid +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -171,10 +170,10 @@ void hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix< /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced /// /// @param[in,out] mat contains the Hermitian matrix A -/// @pre mat is distributed according to @p grid -/// @pre mat has size (N x N) -/// @pre mat has blocksize (NB x NB) -/// @pre mat has tilesize (NB x NB) +/// @pre @p mat is distributed according to @p grid +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) template EigensolverResult hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat) { diff --git a/include/dlaf/eigensolver/gen_eigensolver.h b/include/dlaf/eigensolver/gen_eigensolver.h index 1831ad9c51..1c473863cf 100644 --- a/include/dlaf/eigensolver/gen_eigensolver.h +++ b/include/dlaf/eigensolver/gen_eigensolver.h @@ -34,23 +34,30 @@ namespace dlaf { /// Implementation on local memory. 
///
/// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced
+///
/// @param mat_a contains the Hermitian matrix A
+/// @pre @p mat_a is not distributed
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b contains the Hermitian positive definite matrix B
-/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues
-/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors
-/// @pre mat_a is not distributed
-/// @pre mat_a has a square size
-/// @pre mat_a has a square blocksize
-/// @pre mat_a has equal tile and block sizes
-/// @pre mat_b is not distributed
-/// @pre mat_b has a square size
-/// @pre mat_b has a square blocksize
-/// @pre mat_b has equal tile and block sizes
-/// @pre eigenvalues is not distributed
-/// @pre eigenvalues has equal tile and block sizes
-/// @pre eigenvectors is not distributed
-/// @pre eigenvectors has a square blocksize
-/// @pre eigenvectors has equal tile and block sizes
+/// @pre @p mat_b is not distributed
+/// @pre @p mat_b has size (N x N)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
+///
+/// @param[out] eigenvalues contains the eigenvalues
+/// @pre @p eigenvalues is not distributed
+/// @pre @p eigenvalues has size (N x 1)
+/// @pre @p eigenvalues has blocksize (NB x 1)
+/// @pre @p eigenvalues has tilesize (NB x 1)
+///
+/// @param[out] eigenvectors contains the eigenvectors
+/// @pre @p eigenvectors is not distributed
+/// @pre @p eigenvectors has size (N x N)
+/// @pre @p eigenvectors has blocksize (NB x NB)
+/// @pre @p eigenvectors has tilesize (NB x NB)
template
void hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues, Matrix& eigenvectors) {
@@ -92,16 +99,18 @@ void hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Mat
///
/// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix
/// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced
+///
/// @param mat_a contains the Hermitian matrix A
+/// @pre @p mat_a is not distributed
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b contains the Hermitian positive definite matrix B
-/// @pre mat_a is not distributed
-/// @pre mat_a has a square size
-/// @pre mat_a has a square blocksize
-/// @pre mat_a has equal tile and block sizes
-/// @pre mat_b is not distributed
-/// @pre mat_b has a square size
-/// @pre mat_b has a square blocksize
-/// @pre mat_b has equal tile and block sizes
+/// @pre @p mat_b is not distributed
+/// @pre @p mat_b has size (N x N)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
template
EigensolverResult hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) {
@@ -140,23 +149,30 @@ EigensolverResult hermitian_generalized_eigensolver(blas::Uplo uplo, Matri
///
/// @param grid is the communicator grid on which the matrices @p mat_a and @p mat_b have been distributed,
/// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced
+///
/// @param mat_a contains the Hermitian matrix A
+/// @pre @p mat_a is distributed according to @p grid
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has
blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// /// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors -/// @pre mat_a is distributed according to grid -/// @pre mat_a has a square size -/// @pre mat_a has a square blocksize -/// @pre mat_a has equal tile and block sizes -/// @pre mat_b is distributed according to grid -/// @pre mat_b has a square size -/// @pre mat_b has a square blocksize -/// @pre mat_b has equal tile and block sizes -/// @pre eigenvalues is not distributed -/// @pre eigenvalues has equal tile and block sizes -/// @pre eigenvectors is distributed according to grid -/// @pre eigenvectors has a square blocksize -/// @pre eigenvectors has equal tile and block sizes +/// @pre @p eigenvalues is not distributed +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre @p eigenvectors is distributed according to @p grid +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues, @@ -201,16 +217,18 @@ void hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo u /// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix /// @param grid is the communicator grid on which the matrices @p mat_a and @p mat_b have been distributed, /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced +/// /// @param mat_a contains the Hermitian matrix A +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B -/// @pre mat_a is distributed according to grid -/// @pre mat_a has a square size -/// @pre mat_a has a square blocksize -/// @pre mat_a has equal tile and block sizes -/// @pre mat_b is distributed according to grid -/// @pre mat_b has a square size -/// @pre mat_b has a square blocksize -/// @pre mat_b has equal tile and block sizes +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) template EigensolverResult hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { diff --git a/include/dlaf/eigensolver/gen_to_std.h b/include/dlaf/eigensolver/gen_to_std.h index 48d83e52ec..a41f9d5183 100644 --- a/include/dlaf/eigensolver/gen_to_std.h +++ b/include/dlaf/eigensolver/gen_to_std.h @@ -29,16 +29,22 @@ namespace dlaf::eigensolver::internal { /// /// @param uplo specifies if the elements of the Hermitian matrix A and the triangular matrix B /// to be referenced are the elements in the lower or upper triangular part, +/// /// @param mat_a on entry it contains the Hermitian matrix A 
(if A is real, the matrix is symmetric), /// on exit the matrix elements are overwritten with the elements of the matrix B. /// Only the tiles of the matrix which contain the lower triangular or the upper triangular part are accessed. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the triangular matrix. It can be lower (L) or upper (U). Only the tiles of /// the matrix which contain the lower triangular or the upper triangular part are accessed. -/// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation. -/// @pre mat_a and mat_b have the same square size, -/// @pre mat_a and mat_b have the same square block size, -/// @pre mat_a and mat_b have the same tile and block sizes, -/// @pre mat_a and mat_b are not distributed. +/// Note: B should be modifiable as the diagonal tiles might be temporarily modified during the calculation. +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) template void generalized_to_standard(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -74,16 +80,22 @@ void generalized_to_standard(blas::Uplo uplo, Matrix& mat_a, Matrix void generalized_to_standard(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { diff --git a/include/dlaf/eigensolver/reduction_to_band.h b/include/dlaf/eigensolver/reduction_to_band.h index fededca70b..b3e05b45ef 100644 --- a/include/dlaf/eigensolver/reduction_to_band.h +++ b/include/dlaf/eigensolver/reduction_to_band.h @@ -27,14 +27,16 @@ namespace dlaf::eigensolver::internal { /// @param mat_a on entry it contains an Hermitian matrix, on exit it is overwritten with the /// band-diagonal result together with the elementary reflectors. Just the tiles of the lower /// triangular part will be used. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param band_size size of the band of the resulting matrix (main diagonal + band_size sub-diagonals) +/// @pre @p `mat_a.blockSize().rows() % band_size == 0` +/// /// @return the tau vector as needed by backtransformationReductionToBand /// -/// @pre mat_a has a square size -/// @pre mat_a has a square block size -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is a local matrix -/// @pre mat_a.blockSize().rows() % band_size == 0 template Matrix reduction_to_band(Matrix& mat_a, const SizeType band_size) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -91,17 +93,19 @@ v v v v * * @endverbatim */ /// @param grid is the CommunicatorGrid on which @p mat_a is distributed +/// /// @param mat_a on entry it contains an Hermitian matrix, on exit it is overwritten with the /// band-diagonal result together with the elementary reflectors as described above. Just the tiles of /// the lower triangular part will be used. 
+/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param band_size size of the band of the resulting matrix (main diagonal + band_size sub-diagonals) -/// @return the tau vector as needed by backtransformationReductionToBand +/// @pre `mat_a.blockSize().rows() % band_size == 0` /// -/// @pre mat_a has a square size -/// @pre mat_a has a square block size -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is distributed according to @p grid -/// @pre mat_a.blockSize().rows() % band_size == 0 +/// @return the tau vector as needed by backtransformationReductionToBand template Matrix reduction_to_band(comm::CommunicatorGrid grid, Matrix& mat_a, const SizeType band_size) { diff --git a/include/dlaf/eigensolver/tridiag_solver.h b/include/dlaf/eigensolver/tridiag_solver.h index 2cbd768e7c..99df261c9b 100644 --- a/include/dlaf/eigensolver/tridiag_solver.h +++ b/include/dlaf/eigensolver/tridiag_solver.h @@ -22,20 +22,25 @@ namespace dlaf::eigensolver::internal { /// Finds the eigenvalues and eigenvectors of the local symmetric tridiagonal matrix @p tridiag. /// -/// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric -/// tridiagonal matrix in the first column and second columns respectively. The last entry -/// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the symmetric tridiagonal matrix -/// @param evecs [out] (n x n) local matrix holding the eigenvectors of the symmetric tridiagonal matrix -/// on exit. +/// @param tridiag local matrix with the diagonal and off-diagonal of the symmetric tridiagonal +/// matrix in the first column and second columns respectively. The last entry of the +/// second column is not used. +/// @pre @p tridiag is not distributed +/// @pre @p tridiag has size (N x 2) +/// @pre @p tridiag has blocksize (NB x 2) +/// @pre @p tridiag has tilesize (NB x 2) /// -/// @pre tridiag and @p evals and @p evecs are local matrices -/// @pre tridiag has 2 columns and column block size of 2 -/// @pre tridiag has equal tile and block sizes -/// @pre evecs is a square matrix with number of rows equal to the number of rows of @p tridiag and @p evals -/// @pre evecs has a square block size with number of block rows equal to the block rows of @p tridiag and @p evals -/// @pre evals has equal tile and block sizes -/// @pre evecs has equal tile and block sizes +/// @param[out] evals contains the eigenvalues of the symmetric tridiagonal matrix +/// @pre @p evals is not distributed +/// @pre @p evals has size (N x 1) +/// @pre @p evals has blocksize (NB x 1) +/// @pre @p evals has tilesize (NB x 1) +/// +/// @param[out] evecs contains the eigenvectors of the symmetric tridiagonal matrix +/// @pre @p evecs is not distributed +/// @pre @p evecs has size (N x N) +/// @pre @p evecs has blocksize (NB x NB) +/// @pre @p evecs has tilesize (NB x NB) template void tridiagonal_eigensolver(Matrix, Device::CPU>& tridiag, Matrix, device>& evals, Matrix& evecs) { @@ -70,22 +75,25 @@ void tridiagonal_eigensolver(Matrix, Device::CPU>& tridiag, /// on each rank. The resulting eigenvalues @p evals are stored locally on each rank while the resulting /// eigenvectors @p evecs are distributed across ranks in 2D block-cyclic manner. 
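For the local overload above, a minimal calling sketch (the shapes follow its @pre lines; the sizes,
the fill-in, and the explicit Backend template argument are illustrative assumptions, not taken from
this patch):

    #include <dlaf/eigensolver/tridiag_solver.h>
    #include <dlaf/matrix/matrix.h>

    using dlaf::Device;
    using dlaf::SizeType;

    const SizeType n = 512;  // N
    const SizeType nb = 64;  // NB

    dlaf::matrix::Matrix<double, Device::CPU> tridiag({n, 2}, {nb, 2});  // diagonal and off-diagonal
    dlaf::matrix::Matrix<double, Device::CPU> evals({n, 1}, {nb, 1});    // eigenvalues on exit
    dlaf::matrix::Matrix<double, Device::CPU> evecs({n, n}, {nb, nb});   // eigenvectors on exit
    // ... fill the two columns of tridiag ...

    dlaf::eigensolver::internal::tridiagonal_eigensolver<dlaf::Backend::MC>(tridiag, evals, evecs);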
/// -/// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric -/// tridiagonal matrix in the first column and second columns respectively. The last entry -/// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the symmetric tridiagonal matrix -/// @param evecs [out] (n x n) distributed matrix holding the eigenvectors of the symmetric tridiagonal -/// matrix on exit. +/// @param tridiag matrix with the diagonal and off-diagonal of the symmetric tridiagonal matrix in the +/// first column and second columns respectively. The last entry of the second column is +/// not used. +/// @pre @p tridiag is not distributed +/// @pre @p tridiag has size (N x 2) +/// @pre @p tridiag has blocksize (NB x 2) +/// @pre @p tridiag has tilesize (NB x 2) +/// +/// @param[out] evals holds the eigenvalues of the symmetric tridiagonal matrix +/// @pre @p evals is not distributed +/// @pre @p evals has size (N x 1) +/// @pre @p evals has blocksize (NB x 1) +/// @pre @p evals has tilesize (NB x 1) /// -/// @pre tridiag and @p evals are local matrices and are the same on all ranks -/// @pre tridiag has 2 columns and column block size of 2 -/// @pre tridiag has equal tile and block sizes -/// @pre evecs is a square matrix with global number of rows equal to the number of rows of @p tridiag -/// and @p evals -/// @pre evecs has a square block size with number of block rows equal to the block rows of @p tridiag -/// and @p evals -/// @pre evals has equal tile and block sizes -/// @pre evecs has equal tile and block sizes +/// @param[out] evecs holds the eigenvectors of the symmetric tridiagonal matrix +/// @pre @p evecs is distributed according to @p grid +/// @pre @p evecs has size (N x N) +/// @pre @p evecs has blocksize (NB x NB) +/// @pre @p evecs has tilesize (NB x NB) template void tridiagonal_eigensolver(comm::CommunicatorGrid grid, Matrix, Device::CPU>& tridiag, Matrix, D>& evals, Matrix& evecs) { diff --git a/include/dlaf/factorization/cholesky.h b/include/dlaf/factorization/cholesky.h index 84ebb34bb9..fd9d65ac27 100644 --- a/include/dlaf/factorization/cholesky.h +++ b/include/dlaf/factorization/cholesky.h @@ -28,13 +28,14 @@ namespace dlaf { /// where L is a lower and U is an upper triangular matrix. /// @param uplo specifies if the elements of the Hermitian matrix to be referenced are the elements in /// the lower or upper triangular part, +/// /// @param mat_a on entry it contains the triangular matrix A, on exit the matrix elements /// are overwritten with the elements of the Cholesky factor. Only the tiles of the matrix /// which contain the upper or the lower triangular part (depending on the value of uplo), -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is not distributed. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) template void cholesky_factorization(blas::Uplo uplo, Matrix& mat_a) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -59,10 +60,10 @@ void cholesky_factorization(blas::Uplo uplo, Matrix& mat_a) { /// @param mat_a on entry it contains the triangular matrix A, on exit the matrix elements /// are overwritten with the elements of the Cholesky factor. 
Only the tiles of the matrix /// which contain the upper or the lower triangular part (depending on the value of uplo), -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is distributed according to grid. +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) template void cholesky_factorization(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); diff --git a/include/dlaf/multiplication/general.h b/include/dlaf/multiplication/general.h index 3cefd432d2..5754a2d21e 100644 --- a/include/dlaf/multiplication/general.h +++ b/include/dlaf/multiplication/general.h @@ -33,18 +33,30 @@ namespace dlaf::multiplication::internal { /// \a NoTrans, \a Trans, \a ConjTrans, /// @param opB specifies the form of opB(B) to be used in the matrix multiplication: /// \a NoTrans, \a Trans, \a ConjTrans, +/// /// @param mat_a contains the input matrix A. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the input matrix B. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// /// @param mat_c On entry it contains the input matrix C. On exit matrix tiles in the range will be /// overwritten with the result, while others are left untouched. /// Only tiles whose both row and col tile coords are in the closed range [a,b] are accessed. -/// @pre mat_a, mat_b and mat_c have the same square block size, -/// @pre mat_a, mat_b and mat_c have the same size, -/// @pre mat_a, mat_b and mat_c have equal tile and block sizes, -/// @pre mat_a, mat_b and mat_c are not distributed, -/// @pre a <= b <= mat_a.nrTiles().rows() +/// @pre @p mat_c is not distributed +/// @pre @p mat_c has size (N x N) +/// @pre @p mat_c has blocksize (NB x NB) +/// @pre @p mat_c has tilesize (NB x NB) +/// +/// @pre `a <= b <= mat_a.nrTiles().rows()` template void generalSubMatrix(const SizeType a, const SizeType b, const blas::Op opA, const blas::Op opB, const T alpha, Matrix& mat_a, Matrix& mat_b, const T beta, @@ -90,16 +102,27 @@ void generalSubMatrix(const SizeType a, const SizeType b, const blas::Op opA, co /// /// @param mat_a contains the input matrix A. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the input matrix B. 
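The closed tile range [a, b] is the central semantic of generalSubMatrix; a sketch of a local call follows. Note that the function sits in an internal namespace, and the const-qualified parameter types and <Backend, Device, T> template-argument order are assumptions:

    #include <dlaf/multiplication/general.h>

    // Multiply only the sub-matrix spanned by the closed tile range [a, b];
    // tiles outside the range are neither read nor written.
    template <class T>
    void gemm_tile_range(dlaf::SizeType a, dlaf::SizeType b, T alpha,
                         dlaf::matrix::Matrix<const T, dlaf::Device::CPU>& mat_a,
                         dlaf::matrix::Matrix<const T, dlaf::Device::CPU>& mat_b, T beta,
                         dlaf::matrix::Matrix<T, dlaf::Device::CPU>& mat_c) {
      DLAF_ASSERT(a <= b && b <= mat_a.nrTiles().rows(), a, b);  // documented precondition
      dlaf::multiplication::internal::generalSubMatrix<dlaf::Backend::MC, dlaf::Device::CPU, T>(
          a, b, blas::Op::NoTrans, blas::Op::NoTrans, alpha, mat_a, mat_b, beta, mat_c);
    }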
Only tiles whose both row and col tile coords are in
/// the closed range [a,b] are accessed in read-only mode (elements are not modified)
+/// @pre @p mat_b is distributed according to @p grid
+/// @pre @p mat_b has size (N x N)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
+///
/// @param mat_c On entry it contains the input matrix C. On exit matrix tiles in the range will be
/// overwritten with the result, while others are left untouched.
/// Only tiles whose both row and col tile coords are in the closed range [a,b] are accessed.
-/// @pre mat_a, mat_b and mat_c are distributed in the same way,
-/// @pre mat_a, mat_b and mat_c have the same square block size,
-/// @pre mat_a, mat_b and mat_c have the same size,
-/// @pre mat_a, mat_b and mat_c have equal tile and block sizes,
-/// @pre a <= b <= mat_a.nrTiles().rows()
+/// @pre @p mat_c is distributed according to @p grid
+/// @pre @p mat_c has size (N x N)
+/// @pre @p mat_c has blocksize (NB x NB)
+/// @pre @p mat_c has tilesize (NB x NB)
+///
+/// @pre `a <= b <= mat_a.nrTiles().rows()`
template
void generalSubMatrix([[maybe_unused]] comm::CommunicatorGrid grid,
                      common::Pipeline& row_task_chain,
diff --git a/include/dlaf/multiplication/hermitian.h b/include/dlaf/multiplication/hermitian.h
index ba5b8e5986..bfe1599e44 100644
--- a/include/dlaf/multiplication/hermitian.h
+++ b/include/dlaf/multiplication/hermitian.h
@@ -27,17 +27,27 @@ namespace dlaf {
/// @param side specifies whether A appears on the \a Left or on the \a Right of matrix B,
/// @param uplo specifies if the elements of the Hermitian matrix A to be referenced are the elements in
/// the lower or upper triangular part,
+///
/// @param mat_a contains the hermitian matrix A. Only the tiles of the matrix which contain the upper or
/// the lower triangular part which represent the Hermitian matrix (depending on the value of uplo)
/// are accessed in read-only mode (the elements are not modified),
+/// @pre @p mat_a is not distributed
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b contains the matrix B accessed in read-only mode (the elements are not modified),
+/// @pre @p mat_b is not distributed
+/// @pre @p mat_b has size (N x K)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
+///
/// @param mat_c on entry it contains the matrix C, on exit the matrix elements are overwritten with the
/// elements of the result.
-/// @pre mat_a has a square size,
-/// @pre mat_a has a square block size,
-/// @pre mat_a mat_b and mat_c have equal tile and block size,
-/// @pre mat_a mat_b and mat_c are not distributed,
-/// @pre mat_a mat_b are multipliable and the result can be summed to mat_c.
+/// @pre @p mat_c is not distributed
+/// @pre @p mat_c has size (N x K)
+/// @pre @p mat_c has blocksize (NB x NB)
+/// @pre @p mat_c has tilesize (NB x NB)
template
void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, Matrix& mat_a,
                              Matrix& mat_b, const T beta, Matrix& mat_c) {
@@ -79,17 +89,27 @@ void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, M
/// @param side specifies whether A appears on the \a Left or on the \a Right of matrix B,
/// @param uplo specifies if the elements of the Hermitian matrix A to be referenced are the elements in
/// the lower or upper triangular part,
+///
/// @param mat_a contains the hermitian matrix A.
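For reference, a sketch of the non-distributed call for side == Left, matching the A (N x N), B (N x K), C (N x K) shapes above; the const-qualified inputs and explicit Backend argument are assumptions:

    #include <complex>

    #include <dlaf/multiplication/hermitian.h>

    using Cplx = std::complex<double>;

    // C = alpha * A * B + beta * C, reading only the lower triangle of the Hermitian A.
    void herm_mult_local(Cplx alpha, dlaf::matrix::Matrix<const Cplx, dlaf::Device::CPU>& mat_a,
                         dlaf::matrix::Matrix<const Cplx, dlaf::Device::CPU>& mat_b, Cplx beta,
                         dlaf::matrix::Matrix<Cplx, dlaf::Device::CPU>& mat_c) {
      dlaf::hermitian_multiplication<dlaf::Backend::MC>(blas::Side::Left, blas::Uplo::Lower, alpha,
                                                        mat_a, mat_b, beta, mat_c);
    }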
Only the tiles of the matrix which contain the upper or
/// the lower triangular part which represent the Hermitian matrix (depending on the value of uplo)
/// are accessed in read-only mode (the elements are not modified),
+/// @pre @p mat_a is distributed according to @p grid
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b contains the matrix B accessed in read-only mode (the elements are not modified),
+/// @pre @p mat_b is distributed according to @p grid
+/// @pre @p mat_b has size (N x K)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
+///
/// @param mat_c on entry it contains the matrix C, on exit the matrix elements are overwritten with the
/// elements of the result.
-/// @pre mat_a has a square size,
-/// @pre mat_a has a square block size,
-/// @pre mat_a, mat_b and mat_c have equal tile and block size,
-/// @pre mat_a, mat_b and mat_c are distributed according to the grid,
-/// @pre mat_a mat_b are multipliable and the result can be summed to mat_c.
+/// @pre @p mat_c is distributed according to @p grid
+/// @pre @p mat_c has size (N x K)
+/// @pre @p mat_c has blocksize (NB x NB)
+/// @pre @p mat_c has tilesize (NB x NB)
template
void hermitian_multiplication(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo,
                              const T alpha, Matrix& mat_a, Matrix& mat_b,
diff --git a/include/dlaf/multiplication/triangular.h b/include/dlaf/multiplication/triangular.h
index 8d895e8882..ee57dcbd38 100644
--- a/include/dlaf/multiplication/triangular.h
+++ b/include/dlaf/multiplication/triangular.h
@@ -29,16 +29,21 @@ namespace dlaf {
/// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans,
/// \a ConjTrans,
/// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit),
+///
/// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper or
/// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the
/// elements are not modified),
+/// @pre @p mat_a is not distributed
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the
/// elements of the result.
-/// @pre mat_a has a square size,
-/// @pre mat_a has a square block size,
-/// @pre mat_a and mat_b have equal tile and block sizes,
-/// @pre mat_a and mat_b are not distributed,
-/// @pre mat_a and mat_b are multipliable.
+/// @pre @p mat_b is not distributed
+/// @pre @p mat_b has size (N x M)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
template
void triangular_multiplication(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag,
                               T alpha, Matrix& mat_a, Matrix& mat_b) {
@@ -103,16 +108,21 @@ void triangular_multiplication(blas::Side side, blas::Uplo uplo, blas::Op op, bl
/// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans,
/// \a ConjTrans,
/// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit),
+///
/// @param mat_a contains the triangular matrix A.
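As an illustration of the in-place product B <- alpha * op(A) * B for side == Left under the preconditions above (parameter const-ness and the explicit Backend argument are assumptions):

    #include <dlaf/multiplication/triangular.h>

    // A is (N x N) unit lower triangular, B is (N x M); B is overwritten
    // with the product.
    template <class T>
    void tri_mult_local(T alpha, dlaf::matrix::Matrix<const T, dlaf::Device::CPU>& mat_a,
                        dlaf::matrix::Matrix<T, dlaf::Device::CPU>& mat_b) {
      dlaf::triangular_multiplication<dlaf::Backend::MC>(blas::Side::Left, blas::Uplo::Lower,
                                                         blas::Op::NoTrans, blas::Diag::Unit, alpha,
                                                         mat_a, mat_b);
    }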
Only the tiles of the matrix which contain the upper or
/// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the
/// elements are not modified),
+/// @pre @p mat_a is distributed according to @p grid
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the
/// elements of the result.
-/// @pre mat_a has a square size,
-/// @pre mat_a has a square block size,
-/// @pre mat_a and mat_b have equal tile and block sizes,
-/// @pre mat_a and mat_b are distributed according to the grid,
-/// @pre mat_a and mat_b are multipliable.
+/// @pre @p mat_b is distributed according to @p grid
+/// @pre @p mat_b has size (N x M)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
template
void triangular_multiplication(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo,
                               blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a,
diff --git a/include/dlaf/permutations/general.h b/include/dlaf/permutations/general.h
index cfa6cb549e..5dca25704e 100644
--- a/include/dlaf/permutations/general.h
+++ b/include/dlaf/permutations/general.h
@@ -30,20 +30,24 @@ namespace dlaf::permutations {
/// the range [0, n) where `n` is the size of the submatrix (i.e. the indices are local to the
/// submatrix, they are not global). Only tiles whose row tile coords are in the range
/// [i_begin,i_end) are accessed in read-only mode.
+/// @pre @p perms is not distributed
+/// @pre @p perms has size (N x 1)
+/// @pre @p perms has blocksize (NB x 1)
+/// @pre @p perms has tilesize (NB x 1)
+///
/// @param mat_in is the input matrix. Only tiles whose both row and col tile coords are in
/// the range [i_begin,i_end) are accessed in read-only mode.
+/// @pre @p mat_in is not distributed
+/// @pre @p mat_in has size (N x N)
+/// @pre @p mat_in has blocksize (NB x NB)
+/// @pre @p mat_in has tilesize (NB x NB)
+///
/// @param mat_out is the output matrix. Only tiles whose both row and col tile coords are in
/// the range [i_begin,i_end) are accessed in write-only mode.
-/// @pre perms is not distributed
-/// @pre perms has equal tile and block sizes
-/// @pre mat_in is not distributed
-/// @pre mat_in has equal tile and block sizes
-/// @pre mat_in has a square size
-/// @pre mat_in has a square blocksize
-/// @pre mat_out is not distributed
-/// @pre mat_out has equal tile and block sizes
-/// @pre mat_out has a square size
-/// @pre mat_out has a square blocksize
+/// @pre @p mat_out is not distributed
+/// @pre @p mat_out has size (N x N)
+/// @pre @p mat_out has blocksize (NB x NB)
+/// @pre @p mat_out has tilesize (NB x NB)
template
void permute(SizeType i_begin, SizeType i_end, Matrix& perms,
             Matrix& mat_in, Matrix& mat_out) {
@@ -84,24 +88,29 @@ void permute(SizeType i_begin, SizeType i_end, Matrix& perms,
/// @param sub_task_chain orders non-blocking collective calls used internally. If @tparam coord is Coord::Col,
/// a row communicator pipeline is expected, otherwise if @tparam is Coord::Row a column communicator
/// pipeline is expected.
+///
/// @param perms is the index map of permutations represented as a local tiled column vector. Indices are in
/// the range [0, n) where `n` is the global size of the submatrix (i.e. submatrix indices are used
/// instead of the full matrix indices). Only tiles whose row tile coords are in the range
/// [i_begin,i_end) are accessed in read-only mode.
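For reference, a sketch of the non-distributed permute() under the preconditions above; the <Backend, Device, T, Coord> template-argument order is an assumption suggested by the @tparam coord mention, not something this patch specifies:

    #include <dlaf/permutations/general.h>

    // Apply the column permutation encoded in perms to tile rows/cols
    // [i_begin, i_end) of mat_in, writing the result into mat_out.
    template <class T>
    void permute_cols_local(dlaf::SizeType i_begin, dlaf::SizeType i_end,
                            dlaf::matrix::Matrix<const dlaf::SizeType, dlaf::Device::CPU>& perms,
                            dlaf::matrix::Matrix<const T, dlaf::Device::CPU>& mat_in,
                            dlaf::matrix::Matrix<T, dlaf::Device::CPU>& mat_out) {
      dlaf::permutations::permute<dlaf::Backend::MC, dlaf::Device::CPU, T, dlaf::Coord::Col>(
          i_begin, i_end, perms, mat_in, mat_out);
    }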
+/// @pre @p perms is not distributed
+/// @pre @p perms has size (N x 1)
+/// @pre @p perms has blocksize (NB x 1)
+/// @pre @p perms has tilesize (NB x 1)
+///
/// @param mat_in is the distributed input matrix. Only tiles whose both global row and col tile coords are in
/// the range [i_begin,i_end) are accessed in readwrite-mode.
+/// @pre @p mat_in is distributed according to @p grid
+/// @pre @p mat_in has size (N x N)
+/// @pre @p mat_in has blocksize (NB x NB)
+/// @pre @p mat_in has tilesize (NB x NB)
+///
/// @param mat_out is the distributed output matrix. Only tiles whose both global row and col tile coords are in
/// the range [i_begin,i_end) are accessed in readwrite-mode.
-/// @pre perms is not distributed
-/// @pre perms has equal tile and block sizes
-/// @pre mat_in is distributed according to grid
-/// @pre mat_in has equal tile and block sizes
-/// @pre mat_in has a square size
-/// @pre mat_in has a square blocksize
-/// @pre mat_out is distributed according to grid
-/// @pre mat_out has equal tile and block sizes
-/// @pre mat_out has a square size
-/// @pre mat_out has a square blocksize
+/// @pre @p mat_out is distributed according to @p grid
+/// @pre @p mat_out has size (N x N)
+/// @pre @p mat_out has blocksize (NB x NB)
+/// @pre @p mat_out has tilesize (NB x NB)
///
/// Note: The Pipeline<> API allows to use permute() within other algorithms without having to clone communicators
/// internally.
diff --git a/include/dlaf/solver/triangular.h b/include/dlaf/solver/triangular.h
index f4e43e092d..b4087134a3 100644
--- a/include/dlaf/solver/triangular.h
+++ b/include/dlaf/solver/triangular.h
@@ -29,16 +29,21 @@ namespace dlaf {
/// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans,
/// \a ConjTrans,
/// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit),
+///
/// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper or
/// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the
/// elements are not modified),
+/// @pre @p mat_a is not distributed
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the
/// elements of the matrix X,
-/// @pre mat_a has a square size,
-/// @pre mat_a has a square block size,
-/// @pre mat_a and mat_b have equal tile and block size,
-/// @pre mat_a and mat_b are not distributed,
-/// @pre mat_a and mat_b are multipliable.
+/// @pre @p mat_b is not distributed
+/// @pre @p mat_b has size (N x K)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
template
void triangular_solver(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha,
                       Matrix& mat_a, Matrix& mat_b) {
@@ -101,16 +106,21 @@ void triangular_solver(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag
/// Trans, \a ConjTrans,
/// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a
/// NonUnit),
+///
/// @param mat_a contains the triangular matrix A.
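For reference, a sketch of the non-distributed solve op(A) X = alpha * B under the preconditions above; on exit B is overwritten with X (parameter const-ness and the explicit Backend argument are assumptions):

    #include <dlaf/solver/triangular.h>

    // A is (N x N) upper triangular, B is (N x K); B is overwritten with the
    // solution X.
    template <class T>
    void tri_solve_local(T alpha, dlaf::matrix::Matrix<const T, dlaf::Device::CPU>& mat_a,
                         dlaf::matrix::Matrix<T, dlaf::Device::CPU>& mat_b) {
      dlaf::triangular_solver<dlaf::Backend::MC>(blas::Side::Left, blas::Uplo::Upper,
                                                 blas::Op::NoTrans, blas::Diag::NonUnit, alpha,
                                                 mat_a, mat_b);
    }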
Only the tiles of the matrix which contain the upper
/// or the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the
/// elements are not modified),
+/// @pre @p mat_a is distributed according to @p grid
+/// @pre @p mat_a has size (N x N)
+/// @pre @p mat_a has blocksize (NB x NB)
+/// @pre @p mat_a has tilesize (NB x NB)
+///
/// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with
/// the elements of the matrix X,
-/// @pre matrix A has a square size,
-/// @pre matrix A has a square block size,
-/// @pre matrix A and matrix B have equal tile and block sizes,
-/// @pre matrix A and matrix B are distributed according to the grid,
-/// @pre matrix A and matrix B are multipliable.
+/// @pre @p mat_b is distributed according to @p grid
+/// @pre @p mat_b has size (N x K)
+/// @pre @p mat_b has blocksize (NB x NB)
+/// @pre @p mat_b has tilesize (NB x NB)
template
void triangular_solver(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo, blas::Op op,
                       blas::Diag diag, T alpha, Matrix& mat_a,