diff --git a/include/dlaf/auxiliary/norm.h b/include/dlaf/auxiliary/norm.h index c206e180c0..31adeee37b 100644 --- a/include/dlaf/auxiliary/norm.h +++ b/include/dlaf/auxiliary/norm.h @@ -27,9 +27,9 @@ namespace dlaf::auxiliary { /// /// @note @p uplo == blas::uplo::Upper not yet implemented /// -/// @pre `A.blockSize().rows() == A.blockSize().cols()`, -/// @pre @p A is distributed according to @p grid, -/// @pre @p A has equal tile and block sizes, +/// @pre @p A is distributed according to @p grid +/// @pre @p A has blocksize (NB x NB) +/// @pre @p A has tilesize (NB x NB) /// @return the max norm of the Matrix @p A or 0 if `A.size().isEmpty()` template dlaf::BaseType max_norm(comm::CommunicatorGrid grid, comm::Index2D rank, blas::Uplo uplo, diff --git a/include/dlaf/eigensolver/band_to_tridiag.h b/include/dlaf/eigensolver/band_to_tridiag.h index 252be9d133..f5cb09acfb 100644 --- a/include/dlaf/eigensolver/band_to_tridiag.h +++ b/include/dlaf/eigensolver/band_to_tridiag.h @@ -65,11 +65,12 @@ namespace dlaf::eigensolver::internal { /// Implementation on local memory. /// /// @param mat_a contains the Hermitian band matrix A (if A is real, the matrix is symmetric). -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre band_size is a divisor of mat_a.blockSize().cols(), and band_size >= 2 -/// @pre mat_a is not distributed, -/// @pre mat_a has equal tile and block sizes. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// +/// @pre @p band_size is a divisor of `mat_a.blockSize().cols()`, and @p band_size >= 2 template TridiagResult band_to_tridiagonal(blas::Uplo uplo, SizeType band_size, Matrix& mat_a) { @@ -138,11 +139,12 @@ TridiagResult band_to_tridiagonal(blas::Uplo uplo, SizeType band /// Implementation on distributed memory. /// /// @param mat_a contains the Hermitian band matrix A (if A is real, the matrix is symmetric). -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre band_size is a divisor of mat_a.blockSize().cols() and band_size >= 2, -/// @pre mat_a is distributed according to grid, -/// @pre mat_a has equal tile and block sizes. +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// +/// @pre @p band_size is a divisor of `mat_a.blockSize().cols()`, and @p band_size >= 2 template TridiagResult band_to_tridiagonal(comm::CommunicatorGrid grid, blas::Uplo uplo, SizeType band_size, Matrix& mat_a) { diff --git a/include/dlaf/eigensolver/bt_band_to_tridiag.h b/include/dlaf/eigensolver/bt_band_to_tridiag.h index ae54910af3..d0ae27f69e 100644 --- a/include/dlaf/eigensolver/bt_band_to_tridiag.h +++ b/include/dlaf/eigensolver/bt_band_to_tridiag.h @@ -19,40 +19,42 @@ namespace dlaf::eigensolver::internal { -// Eigenvalue back-transformation implementation on local memory, which applies the inverse of the -// transformation used to get a tridiagonal matrix from a band one. -// -// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described -// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about -// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. 
-// -// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh -// where they are stored using following compact representation -// -// compact extended -// AT BT CT DT 1 0 0 0 -// A1 B1 C1 D1 A1 1 0 0 -// A2 B2 C2 D2 A2 B1 1 0 -// A3 B3 C3 D3 A3 B2 C1 1 -// 0 B3 C2 D1 -// 0 0 C3 D2 -// 0 0 0 D3 -// -// where A, B, C and D refers to distinct reflectors, with their components numbered and their taus -// identified by the letter T. -// -// @param mat_hh matrix containing reflectors together with taus (compact form see representation above) -// @param mat_e matrix to which the inverse transformation is applied to -// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) -// @pre mat_hh has a square size -// @pre mat_hh has a square block size -// @pre mat_e and mat_hh share the same number of rows -// @pre mat_e block size and mat_hh block size share the same number of rows -// @pre band_size is a divisor of mat_hh.blockSize().cols() -// @pre mat_e is not distributed -// @pre mat_hh is not distributed -// @pre mat_e has equal tile and block sizes -// @pre mat_hh has equal tile and block sizes +/// Eigenvalue back-transformation implementation on local memory, which applies the inverse of the +/// transformation used to get a tridiagonal matrix from a band one. +/// +/// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described +/// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about +/// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. +/// +/// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh +/// where they are stored using the following compact representation +/// +/// compact extended +/// AT BT CT DT 1 0 0 0 +/// A1 B1 C1 D1 A1 1 0 0 +/// A2 B2 C2 D2 A2 B1 1 0 +/// A3 B3 C3 D3 A3 B2 C1 1 +/// 0 B3 C2 D1 +/// 0 0 C3 D2 +/// 0 0 0 D3 +/// +/// where A, B, C and D refer to distinct reflectors, with their components numbered and their taus +/// identified by the letter T. +/// +/// @param mat_hh matrix containing reflectors together with taus (compact form, see representation above) +/// @pre @p mat_hh is not distributed +/// @pre @p mat_hh has size (N x N) +/// @pre @p mat_hh has blocksize (NB x NB) +/// @pre @p mat_hh has tilesize (NB x NB) +/// +/// @param mat_e matrix to which the inverse transformation is applied +/// @pre @p mat_e is not distributed +/// @pre @p mat_e has size (N x M) +/// @pre @p mat_e has blocksize (NB x MB) +/// @pre @p mat_e has tilesize (NB x MB) +/// +/// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) +/// @pre @p band_size is a divisor of `mat_hh.blockSize().cols()` template void bt_band_to_tridiagonal(const SizeType band_size, matrix::Matrix& mat_e, matrix::Matrix& mat_hh) { @@ -74,6 +76,42 @@ void bt_band_to_tridiagonal(const SizeType band_size, matrix::Matrix& mat_ BackTransformationT2B::call(band_size, mat_e, mat_hh); } +/// Eigenvalue back-transformation implementation, which applies the inverse of the transformation used +/// to get a tridiagonal matrix from a band one.
+/// +/// It computes E -= V T V* E, applying to a general matrix E the inverse of the transformation described +/// by the reflectors in V (block-wise, so T represents the T factor which embeds the information about +/// taus), which are the ones used to transform a band matrix to a tridiagonal matrix. +/// +/// In particular, V and T are obtained using data about reflectors and taus passed via @p mat_hh +/// where they are stored using the following compact representation +/// +/// compact extended +/// AT BT CT DT 1 0 0 0 +/// A1 B1 C1 D1 A1 1 0 0 +/// A2 B2 C2 D2 A2 B1 1 0 +/// A3 B3 C3 D3 A3 B2 C1 1 +/// 0 B3 C2 D1 +/// 0 0 C3 D2 +/// 0 0 0 D3 +/// +/// where A, B, C and D refer to distinct reflectors, with their components numbered and their taus +/// identified by the letter T. +/// +/// @param mat_hh matrix containing reflectors together with taus (compact form, see representation above) +/// @pre @p mat_hh is distributed according to @p grid +/// @pre @p mat_hh has size (N x N) +/// @pre @p mat_hh has blocksize (NB x NB) +/// @pre @p mat_hh has tilesize (NB x NB) +/// +/// @param mat_e matrix to which the inverse transformation is applied +/// @pre @p mat_e is distributed according to @p grid +/// @pre @p mat_e has size (N x M) +/// @pre @p mat_e has blocksize (NB x MB) +/// @pre @p mat_e has tilesize (NB x MB) +/// +/// @param band_size size of the reflectors (normal one, not constrained by any matrix size limit) +/// @pre @p band_size is a divisor of `mat_hh.blockSize().cols()` template void bt_band_to_tridiagonal(comm::CommunicatorGrid grid, const SizeType band_size, matrix::Matrix& mat_e, matrix::Matrix& mat_hh) { diff --git a/include/dlaf/eigensolver/bt_reduction_to_band.h b/include/dlaf/eigensolver/bt_reduction_to_band.h index a8cd0c01bb..fd386cf7af 100644 --- a/include/dlaf/eigensolver/bt_reduction_to_band.h +++ b/include/dlaf/eigensolver/bt_reduction_to_band.h @@ -28,14 +28,18 @@ namespace dlaf::eigensolver::internal { /// defined by the j-th element of tau and the HH reflector stored in the j-th column of the matrix V. /// /// @param mat_c contains the (m x n) matrix C (blocksize (mb x nb)), while on exit it contains Q C. +/// @pre @p mat_c is not distributed +/// @pre @p mat_c has blocksize (MB x NB) +/// @pre @p mat_c has tilesize (MB x NB) +/// /// @param mat_v is (m x m) matrix with blocksize (mb x mb), which contains the Householder reflectors. /// The j-th HH reflector is v_j = (1, V(mb + j : n, j)). +/// @pre @p mat_v is not distributed +/// @pre @p mat_v has blocksize (MB x MB) +/// @pre @p mat_v has tilesize (MB x MB) +/// /// @param mat_taus is the tau vector as returned by reductionToBand. The j-th element is the scaling /// factor for the j-th HH tranformation. -/// @pre mat_c is not distributed, -/// @pre mat_v is not distributed, -/// @pre mat_c has equal tile and block sizes, -/// @pre mat_v has equal tile and block sizes. template void bt_reduction_to_band(const SizeType b, Matrix& mat_c, Matrix& mat_v, Matrix& mat_taus) { @@ -64,15 +68,19 @@ void bt_reduction_to_band(const SizeType b, Matrix& mat_c, Matrix void bt_reduction_to_band(comm::CommunicatorGrid grid, const SizeType b, Matrix& mat_c, Matrix& mat_v, Matrix& mat_taus) { diff --git a/include/dlaf/eigensolver/eigensolver.h b/include/dlaf/eigensolver/eigensolver.h index 16ce5b4e6d..ee667e79cf 100644 --- a/include/dlaf/eigensolver/eigensolver.h +++ b/include/dlaf/eigensolver/eigensolver.h @@ -32,9 +32,24 @@ namespace dlaf { /// Implementation on local memory.
/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre @p mat is not distributed +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) +/// +/// @param[out] eigenvalues contains the eigenvalues +/// @pre @p eigenvalues is not distributed +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre @p eigenvectors is not distributed +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -66,9 +81,15 @@ void hermitian_eigensolver(blas::Uplo uplo, Matrix& mat, Matrix, and eigenvectors as a Matrix +/// @return ReturnEigensolverType with eigenvalues and eigenvectors as a Matrix +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre @p mat is not distributed +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) template EigensolverResult hermitian_eigensolver(blas::Uplo uplo, Matrix& mat) { const SizeType size = mat.size().rows(); @@ -90,11 +111,27 @@ EigensolverResult hermitian_eigensolver(blas::Uplo uplo, Matrix& mat /// /// Implementation on distributed memory. /// -/// @param grid is the communicator grid on which the matrix @p mat has been distributed, +/// @param grid is the communicator grid on which the matrix @p mat has been distributed +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre @p mat is distributed according to @p grid +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) +/// +/// @param[out] eigenvalues contains the eigenvalues +/// @pre @p eigenvalues is stored on all ranks +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre @p eigenvectors is distributed according to @p grid +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -126,10 +163,17 @@ void hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix< /// /// Implementation on distributed memory. 
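For reviewers, a minimal usage sketch of the local `hermitian_eigensolver` documented above, with allocations matching the new preconditions. The explicit `<Backend, Device, T>` template arguments and the `Matrix(LocalElementSize, TileElementSize)` constructor are not shown in this diff, so treat them as assumptions; error handling and DLA-Future initialization are omitted.

```cpp
#include <dlaf/eigensolver/eigensolver.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/types.h>

using dlaf::Backend;
using dlaf::Device;
using dlaf::SizeType;

void eigensolve_lower(const SizeType n, const SizeType nb) {
  using MatrixT = dlaf::matrix::Matrix<double, Device::CPU>;
  MatrixT mat(dlaf::LocalElementSize(n, n), dlaf::TileElementSize(nb, nb));    // A: (N x N), blocksize (NB x NB)
  MatrixT evals(dlaf::LocalElementSize(n, 1), dlaf::TileElementSize(nb, 1));   // eigenvalues: (N x 1), blocksize (NB x 1)
  MatrixT evecs(dlaf::LocalElementSize(n, n), dlaf::TileElementSize(nb, nb));  // eigenvectors: (N x N), blocksize (NB x NB)

  // ... fill the lower triangular part of mat ...

  // Only the lower triangular part of mat is referenced; mat is overwritten on exit.
  dlaf::hermitian_eigensolver<Backend::MC, Device::CPU, double>(blas::Uplo::Lower, mat, evals, evecs);
}
```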
///  -/// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix -/// @param grid is the communicator grid on which the matrix @p mat has been distributed, +/// @return struct ReturnEigensolverType with eigenvalues and eigenvectors as a Matrix +/// +/// @param grid is the communicator grid on which the matrix @p mat has been distributed +/// /// @param uplo specifies if upper or lower triangular part of @p mat will be referenced -/// @param mat contains the Hermitian matrix A +/// +/// @param[in,out] mat contains the Hermitian matrix A +/// @pre @p mat is distributed according to @p grid +/// @pre @p mat has size (N x N) +/// @pre @p mat has blocksize (NB x NB) +/// @pre @p mat has tilesize (NB x NB) template EigensolverResult hermitian_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat) { diff --git a/include/dlaf/eigensolver/gen_eigensolver.h b/include/dlaf/eigensolver/gen_eigensolver.h index b5b2c015ac..1c473863cf 100644 --- a/include/dlaf/eigensolver/gen_eigensolver.h +++ b/include/dlaf/eigensolver/gen_eigensolver.h @@ -34,10 +34,30 @@ namespace dlaf { /// Implementation on local memory. /// /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced +/// /// @param mat_a contains the Hermitian matrix A +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// +/// @param[out] eigenvalues contains the eigenvalues +/// @pre @p eigenvalues is not distributed +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre @p eigenvectors is not distributed +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues, Matrix& eigenvectors) { @@ -79,8 +99,18 @@ void hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Mat /// /// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced +/// /// @param mat_a contains the Hermitian matrix A +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) template EigensolverResult hermitian_generalized_eigensolver(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { @@ -119,10 +149,30 @@ EigensolverResult hermitian_generalized_eigensolver(blas::Uplo uplo, Matri /// /// @param grid is the communicator grid on which the matrices @p mat_a
and @p mat_b have been distributed, /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced +/// /// @param mat_a contains the Hermitian matrix A +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// -/// @param eigenvalues is a N x 1 matrix which on output contains the eigenvalues -/// @param eigenvectors is a N x N matrix which on output contains the eigenvectors +/// @param[out] eigenvalues contains the eigenvalues +/// @pre @p eigenvalues is stored on all ranks +/// @pre @p eigenvalues has size (N x 1) +/// @pre @p eigenvalues has blocksize (NB x 1) +/// @pre @p eigenvalues has tilesize (NB x 1) +/// +/// @param[out] eigenvectors contains the eigenvectors +/// @pre @p eigenvectors is distributed according to @p grid +/// @pre @p eigenvectors has size (N x N) +/// @pre @p eigenvectors has blocksize (NB x NB) +/// @pre @p eigenvectors has tilesize (NB x NB) template void hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b, Matrix, D>& eigenvalues, @@ -167,8 +217,18 @@ void hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo u /// @return struct ReturnEigensolverType with eigenvalues, as a vector, and eigenvectors as a Matrix /// @param grid is the communicator grid on which the matrices @p mat_a and @p mat_b have been distributed, /// @param uplo specifies if upper or lower triangular part of @p mat_a and @p mat_b will be referenced +/// /// @param mat_a contains the Hermitian matrix A +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the Hermitian positive definite matrix B +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) template EigensolverResult hermitian_generalized_eigensolver(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { diff --git a/include/dlaf/eigensolver/gen_to_std.h b/include/dlaf/eigensolver/gen_to_std.h index 48d83e52ec..a41f9d5183 100644 --- a/include/dlaf/eigensolver/gen_to_std.h +++ b/include/dlaf/eigensolver/gen_to_std.h @@ -29,16 +29,22 @@ namespace dlaf::eigensolver::internal { /// /// @param uplo specifies if the elements of the Hermitian matrix A and the triangular matrix B /// to be referenced are the elements in the lower or upper triangular part, +/// /// @param mat_a on entry it contains the Hermitian matrix A (if A is real, the matrix is symmetric), /// on exit the matrix elements are overwritten with the elements of the matrix B. /// Only the tiles of the matrix which contain the lower triangular or the upper triangular part are accessed. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the triangular matrix. It can be lower (L) or upper (U). Only the tiles of /// the matrix which contain the lower triangular or the upper triangular part are accessed.
-/// Note: B should be modifiable as the diagonal tiles might be temporarly modified during the calculation. -/// @pre mat_a and mat_b have the same square size, -/// @pre mat_a and mat_b have the same square block size, -/// @pre mat_a and mat_b have the same tile and block sizes, -/// @pre mat_a and mat_b are not distributed. +/// Note: B should be modifiable as the diagonal tiles might be temporarily modified during the calculation. +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) template void generalized_to_standard(blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -74,16 +80,22 @@ void generalized_to_standard(blas::Uplo uplo, Matrix& mat_a, Matrix void generalized_to_standard(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a, Matrix& mat_b) { diff --git a/include/dlaf/eigensolver/reduction_to_band.h b/include/dlaf/eigensolver/reduction_to_band.h index fededca70b..b3e05b45ef 100644 --- a/include/dlaf/eigensolver/reduction_to_band.h +++ b/include/dlaf/eigensolver/reduction_to_band.h @@ -27,14 +27,16 @@ namespace dlaf::eigensolver::internal { /// @param mat_a on entry it contains an Hermitian matrix, on exit it is overwritten with the /// band-diagonal result together with the elementary reflectors. Just the tiles of the lower /// triangular part will be used. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param band_size size of the band of the resulting matrix (main diagonal + band_size sub-diagonals) +/// @pre `mat_a.blockSize().rows() % band_size == 0` +/// /// @return the tau vector as needed by backtransformationReductionToBand /// -/// @pre mat_a has a square size -/// @pre mat_a has a square block size -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is a local matrix -/// @pre mat_a.blockSize().rows() % band_size == 0 template Matrix reduction_to_band(Matrix& mat_a, const SizeType band_size) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -91,17 +93,19 @@ v v v v * * @endverbatim */ /// @param grid is the CommunicatorGrid on which @p mat_a is distributed +/// /// @param mat_a on entry it contains an Hermitian matrix, on exit it is overwritten with the /// band-diagonal result together with the elementary reflectors as described above. Just the tiles of /// the lower triangular part will be used.
+/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param band_size size of the band of the resulting matrix (main diagonal + band_size sub-diagonals) -/// @return the tau vector as needed by backtransformationReductionToBand +/// @pre `mat_a.blockSize().rows() % band_size == 0` /// -/// @pre mat_a has a square size -/// @pre mat_a has a square block size -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is distributed according to @p grid -/// @pre mat_a.blockSize().rows() % band_size == 0 +/// @return the tau vector as needed by backtransformationReductionToBand template Matrix reduction_to_band(comm::CommunicatorGrid grid, Matrix& mat_a, const SizeType band_size) { diff --git a/include/dlaf/eigensolver/tridiag_solver.h b/include/dlaf/eigensolver/tridiag_solver.h index 133748b640..99df261c9b 100644 --- a/include/dlaf/eigensolver/tridiag_solver.h +++ b/include/dlaf/eigensolver/tridiag_solver.h @@ -22,21 +22,25 @@ namespace dlaf::eigensolver::internal { /// Finds the eigenvalues and eigenvectors of the local symmetric tridiagonal matrix @p tridiag. /// -/// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric -/// tridiagonal matrix in the first column and second columns respectively. The last entry -/// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the the symmetric tridiagonal -/// matrix -/// @param evecs [out] (n x n) local matrix holding the eigenvectors of the the symmetric tridiagonal -/// matrix on exit. +/// @param tridiag local matrix with the diagonal and off-diagonal of the symmetric tridiagonal +/// matrix in the first and second column, respectively. The last entry of the +/// second column is not used. +/// @pre @p tridiag is not distributed +/// @pre @p tridiag has size (N x 2) +/// @pre @p tridiag has blocksize (NB x 2) +/// @pre @p tridiag has tilesize (NB x 2) /// -/// @pre tridiag and @p evals and @p evecs are local matrices -/// @pre tridiag has 2 columns and column block size of 2 -/// @pre tridiag has equal tile and block sizes -/// @pre evecs is a square matrix with number of rows equal to the number of rows of @p tridiag and @p evals -/// @pre evecs has a square block size with number of block rows equal to the block rows of @p tridiag and @p evals -/// @pre evals has equal tile and block sizes -/// @pre evecs has equal tile and block sizes +/// @param[out] evals contains the eigenvalues of the symmetric tridiagonal matrix +/// @pre @p evals is not distributed +/// @pre @p evals has size (N x 1) +/// @pre @p evals has blocksize (NB x 1) +/// @pre @p evals has tilesize (NB x 1) +/// +/// @param[out] evecs contains the eigenvectors of the symmetric tridiagonal matrix +/// @pre @p evecs is not distributed +/// @pre @p evecs has size (N x N) +/// @pre @p evecs has blocksize (NB x NB) +/// @pre @p evecs has tilesize (NB x NB) template void tridiagonal_eigensolver(Matrix, Device::CPU>& tridiag, Matrix, device>& evals, Matrix& evecs) { @@ -71,24 +75,25 @@ void tridiagonal_eigensolver(Matrix, Device::CPU>& tridiag, /// on each rank. The resulting eigenvalues @p evals are stored locally on each rank while the resulting /// eigenvectors @p evecs are distributed across ranks in 2D block-cyclic manner.
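To make the shape requirements above concrete, a sketch of a local call to `tridiagonal_eigensolver` (an internal API, per its namespace). The explicit `<Backend::MC>` template argument and the matrix constructors are assumptions, since the full template signatures are elided in this diff:

```cpp
#include <dlaf/eigensolver/tridiag_solver.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/types.h>

using dlaf::Backend;
using dlaf::Device;
using dlaf::SizeType;

void solve_tridiagonal(const SizeType n, const SizeType nb) {
  using MatrixT = dlaf::matrix::Matrix<double, Device::CPU>;
  MatrixT tridiag(dlaf::LocalElementSize(n, 2), dlaf::TileElementSize(nb, 2));  // (N x 2): diagonal | off-diagonal
  MatrixT evals(dlaf::LocalElementSize(n, 1), dlaf::TileElementSize(nb, 1));    // (N x 1), blocksize (NB x 1)
  MatrixT evecs(dlaf::LocalElementSize(n, n), dlaf::TileElementSize(nb, nb));   // (N x N), blocksize (NB x NB)

  // ... write the diagonal into column 0 and the off-diagonal into column 1 of tridiag ...

  dlaf::eigensolver::internal::tridiagonal_eigensolver<Backend::MC>(tridiag, evals, evecs);
}
```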
-/// @param tridiag [in/out] (n x 2) local matrix with the diagonal and off-diagonal of the symmetric -/// tridiagonal matrix in the first column and second columns respectively. The last entry -/// of the second column is not used. -/// @param evals [out] (n x 1) local matrix holding the eigenvalues of the the symmetric tridiagonal -/// matrix -/// @param evecs [out] (n x n) distributed matrix holding the eigenvectors of the the symmetric -/// tridiagonal -/// matrix on exit. +/// @param tridiag matrix with the diagonal and off-diagonal of the symmetric tridiagonal matrix in the +/// first and second column, respectively. The last entry of the second column is +/// not used. +/// @pre @p tridiag is stored on all ranks +/// @pre @p tridiag has size (N x 2) +/// @pre @p tridiag has blocksize (NB x 2) +/// @pre @p tridiag has tilesize (NB x 2) +/// +/// @param[out] evals holds the eigenvalues of the symmetric tridiagonal matrix +/// @pre @p evals is stored on all ranks +/// @pre @p evals has size (N x 1) +/// @pre @p evals has blocksize (NB x 1) +/// @pre @p evals has tilesize (NB x 1) /// -/// @pre tridiag and @p evals are local matrices and are the same on all ranks -/// @pre tridiag has 2 columns and column block size of 2 -/// @pre tridiag has equal tile and block sizes -/// @pre evecs is a square matrix with global number of rows equal to the number of rows of @p tridiag -/// and @p evals -/// @pre evecs has a square block size with number of block rows equal to the block rows of @p tridiag -/// and @p evals -/// @pre evals has equal tile and block sizes -/// @pre evecs has equal tile and block sizes +/// @param[out] evecs holds the eigenvectors of the symmetric tridiagonal matrix +/// @pre @p evecs is distributed according to @p grid +/// @pre @p evecs has size (N x N) +/// @pre @p evecs has blocksize (NB x NB) +/// @pre @p evecs has tilesize (NB x NB) template void tridiagonal_eigensolver(comm::CommunicatorGrid grid, Matrix, Device::CPU>& tridiag, Matrix, D>& evals, Matrix& evecs) { diff --git a/include/dlaf/eigensolver/tridiag_solver/merge.h b/include/dlaf/eigensolver/tridiag_solver/merge.h index d3aa24b6d7..e9de15002a 100644 --- a/include/dlaf/eigensolver/tridiag_solver/merge.h +++ b/include/dlaf/eigensolver/tridiag_solver/merge.h @@ -538,8 +538,8 @@ std::vector> applyDeflationToArrays(T rho, T tol, const SizeTy d2 = tmp; rots.push_back(GivensRotation{i1s, i2s, c, s}); - // Set the the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e if - // one comes from Q1 and the other from Q2 or vice-versa) + // Set the `i1` column as "Dense" if the `i2` column has opposite non-zero structure (i.e. if one + // comes from Q1 and the other from Q2 or vice-versa) if ((c1 == ColType::UpperHalf && c2 == ColType::LowerHalf) || (c1 == ColType::LowerHalf && c2 == ColType::UpperHalf)) { c1 = ColType::Dense; diff --git a/include/dlaf/eigensolver/tridiag_solver/rot.h b/include/dlaf/eigensolver/tridiag_solver/rot.h index 6d4860f410..daa56b9186 100644 --- a/include/dlaf/eigensolver/tridiag_solver/rot.h +++ b/include/dlaf/eigensolver/tridiag_solver/rot.h @@ -165,12 +165,12 @@ void applyGivensRotationsToMatrixColumns(const SizeType i_begin, const SizeType common::internal::SingleThreadedBlasScope single; for (const GivensRotation& rot : rots) { - // Get the index of the tile that has column `rot.i` and the the index of the column within the tile. + // Get the index of the tile that has column `rot.i` and the index of the column within the tile.
const SizeType i_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, rot.i)); const SizeType i_el = distr.tileElementFromGlobalElement(rot.i); T* x = tiles[to_sizet(i_tile)].ptr(TileElementIndex(0, i_el)); - // Get the index of the tile that has column `rot.j` and the the index of the column within the tile. + // Get the index of the tile that has column `rot.j` and the index of the column within the tile. const SizeType j_tile = distr.globalTileLinearIndex(GlobalElementIndex(0, rot.j)); const SizeType j_el = distr.tileElementFromGlobalElement(rot.j); T* y = tiles[to_sizet(j_tile)].ptr(TileElementIndex(0, j_el)); diff --git a/include/dlaf/factorization/cholesky.h b/include/dlaf/factorization/cholesky.h index 84ebb34bb9..fd9d65ac27 100644 --- a/include/dlaf/factorization/cholesky.h +++ b/include/dlaf/factorization/cholesky.h @@ -28,13 +28,14 @@ namespace dlaf { /// where L is a lower and U is an upper triangular matrix. /// @param uplo specifies if the elements of the Hermitian matrix to be referenced are the elements in /// the lower or upper triangular part, +/// /// @param mat_a on entry it contains the triangular matrix A, on exit the matrix elements /// are overwritten with the elements of the Cholesky factor. Only the tiles of the matrix /// which contain the upper or the lower triangular part (depending on the value of uplo), -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is not distributed. +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) template void cholesky_factorization(blas::Uplo uplo, Matrix& mat_a) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); @@ -59,10 +60,10 @@ void cholesky_factorization(blas::Uplo uplo, Matrix& mat_a) { /// @param mat_a on entry it contains the triangular matrix A, on exit the matrix elements /// are overwritten with the elements of the Cholesky factor. Only the tiles of the matrix /// which contain the upper or the lower triangular part (depending on the value of uplo), -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a has equal tile and block sizes -/// @pre mat_a is distributed according to grid. +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) template void cholesky_factorization(comm::CommunicatorGrid grid, blas::Uplo uplo, Matrix& mat_a) { DLAF_ASSERT(matrix::square_size(mat_a), mat_a); diff --git a/include/dlaf/multiplication/general.h b/include/dlaf/multiplication/general.h index 3cefd432d2..5754a2d21e 100644 --- a/include/dlaf/multiplication/general.h +++ b/include/dlaf/multiplication/general.h @@ -33,18 +33,30 @@ namespace dlaf::multiplication::internal { /// \a NoTrans, \a Trans, \a ConjTrans, /// @param opB specifies the form of opB(B) to be used in the matrix multiplication: /// \a NoTrans, \a Trans, \a ConjTrans, +/// /// @param mat_a contains the input matrix A. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the input matrix B. 
Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// /// @param mat_c On entry it contains the input matrix C. On exit matrix tiles in the range will be /// overwritten with the result, while others are left untouched. /// Only tiles whose both row and col tile coords are in the closed range [a,b] are accessed. -/// @pre mat_a, mat_b and mat_c have the same square block size, -/// @pre mat_a, mat_b and mat_c have the same size, -/// @pre mat_a, mat_b and mat_c have equal tile and block sizes, -/// @pre mat_a, mat_b and mat_c are not distributed, -/// @pre a <= b <= mat_a.nrTiles().rows() +/// @pre @p mat_c is not distributed +/// @pre @p mat_c has size (N x N) +/// @pre @p mat_c has blocksize (NB x NB) +/// @pre @p mat_c has tilesize (NB x NB) +/// +/// @pre `a <= b <= mat_a.nrTiles().rows()` template void generalSubMatrix(const SizeType a, const SizeType b, const blas::Op opA, const blas::Op opB, const T alpha, Matrix& mat_a, Matrix& mat_b, const T beta, @@ -90,16 +102,27 @@ void generalSubMatrix(const SizeType a, const SizeType b, const blas::Op opA, co /// /// @param mat_a contains the input matrix A. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the input matrix B. Only tiles whose both row and col tile coords are in /// the closed range [a,b] are accessed in read-only mode (elements are not modified) +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x N) +/// @pre @p mat_b has blocksize (NB x NB) +/// @pre @p mat_b has tilesize (NB x NB) +/// /// @param mat_c On entry it contains the input matrix C. On exit matrix tiles in the range will be /// overwritten with the result, while others are left untouched. /// Only tiles whose both row and col tile coords are in the closed range [a,b] are accessed. -/// @pre mat_a, mat_b and mat_c are distributed in the same way, -/// @pre mat_a, mat_b and mat_c have the same square block size, -/// @pre mat_a, mat_b and mat_c have the same size, -/// @pre mat_a, mat_b and mat_c have equal tile and block sizes, -/// @pre a <= b <= mat_a.nrTiles().rows() +/// @pre @p mat_c is distributed according to @p grid +/// @pre @p mat_c has size (N x N) +/// @pre @p mat_c has blocksize (NB x NB) +/// @pre @p mat_c has tilesize (NB x NB) +/// +/// @pre `a <= b <= mat_a.nrTiles().rows()` template void generalSubMatrix([[maybe_unused]] comm::CommunicatorGrid grid, common::Pipeline& row_task_chain, diff --git a/include/dlaf/multiplication/hermitian.h b/include/dlaf/multiplication/hermitian.h index ba5b8e5986..bfe1599e44 100644 --- a/include/dlaf/multiplication/hermitian.h +++ b/include/dlaf/multiplication/hermitian.h @@ -27,17 +27,27 @@ namespace dlaf { /// @param side specifies whether A appears on the \a Left or on the \a Right of matrix B, /// @param uplo specifies if the elements of the Hermitian matrix A to be referenced are the elements in /// the lower or upper triangular part, +/// /// @param mat_a contains the hermitian matrix A. 
Only the tiles of the matrix which contain the upper or /// the lower triangular part which represent the Hermitian matrix (depending on the value of uplo) /// are accessed in read-only mode (the elements are not modified), +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the matrix B accessed in read-only mode (the elements are not modified), +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) +/// /// @param mat_c on entry it contains the matrix C, on exit the matrix elements are overwritten with the /// elements of the result. -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a mat_b and mat_c have equal tile and block size, -/// @pre mat_a mat_b and mat_c are not distributed, -/// @pre mat_a mat_b are multipliable and the result can be summed to mat_c. +/// @pre @p mat_c is not distributed +/// @pre @p mat_c has size (N x M) +/// @pre @p mat_c has blocksize (NB x MB) +/// @pre @p mat_c has tilesize (NB x MB) template void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, Matrix& mat_a, Matrix& mat_b, const T beta, Matrix& mat_c) { @@ -79,17 +89,27 @@ void hermitian_multiplication(blas::Side side, blas::Uplo uplo, const T alpha, M /// @param side specifies whether A appears on the \a Left or on the \a Right of matrix B, /// @param uplo specifies if the elements of the Hermitian matrix A to be referenced are the elements in /// the lower or upper triangular part, +/// /// @param mat_a contains the hermitian matrix A. Only the tiles of the matrix which contain the upper or /// the lower triangular part which represent the Hermitian matrix (depending on the value of uplo) /// are accessed in read-only mode (the elements are not modified), +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b contains the matrix B accessed in read-only mode (the elements are not modified), +/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) +/// /// @param mat_c on entry it contains the matrix C, on exit the matrix elements are overwritten with the /// elements of the result. -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a, mat_b and mat_c have equal tile and block size, -/// @pre mat_a, mat_b and mat_c are distributed according to the grid, -/// @pre mat_a mat_b are multipliable and the result can be summed to mat_c.
+/// @pre @p mat_c is distributed according to @p grid +/// @pre @p mat_c has size (N x M) +/// @pre @p mat_c has blocksize (NB x MB) +/// @pre @p mat_c has tilesize (NB x MB) template void hermitian_multiplication(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo, const T alpha, Matrix& mat_a, Matrix& mat_b, diff --git a/include/dlaf/multiplication/triangular.h b/include/dlaf/multiplication/triangular.h index 8d895e8882..ee57dcbd38 100644 --- a/include/dlaf/multiplication/triangular.h +++ b/include/dlaf/multiplication/triangular.h @@ -29,16 +29,21 @@ namespace dlaf { /// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans, /// \a ConjTrans, /// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit), +/// /// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper or /// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the /// elements are not modified), +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the /// elements of the result. -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a and mat_b have equal tile and block sizes, -/// @pre mat_a and mat_b are not distributed, -/// @pre mat_a and mat_b are multipliable. +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) template void triangular_multiplication(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a, Matrix& mat_b) { @@ -103,16 +108,21 @@ void triangular_multiplication(blas::Side side, blas::Uplo uplo, blas::Op op, bl /// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans, /// \a ConjTrans, /// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit), +/// /// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper or /// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the /// elements are not modified), +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the /// elements of the result. -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a and mat_b have equal tile and block sizes, -/// @pre mat_a and mat_b are distributed according to the grid, -/// @pre mat_a and mat_b are multipliable.
+/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) template void triangular_multiplication(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a, diff --git a/include/dlaf/permutations/general.h b/include/dlaf/permutations/general.h index b898a9e03e..5dca25704e 100644 --- a/include/dlaf/permutations/general.h +++ b/include/dlaf/permutations/general.h @@ -30,11 +30,23 @@ namespace dlaf::permutations { /// the range [0, n) where `n` is the size of the submatrix (i.e. the indices are local to the /// submatrix, they are not global). Only tiles whose row tile coords are in the range /// [i_begin,i_end) are accessed in read-only mode. +/// @pre @p perms is not distributed +/// @pre @p perms has blocksize (NB x MB) +/// @pre @p perms has tilesize (NB x MB) +/// /// @param mat_in is the input matrix. Only tiles whose both row and col tile coords are in /// the range [i_begin,i_end) are accessed in read-only mode. +/// @pre @p mat_in is not distributed +/// @pre @p mat_in has size (N x N) +/// @pre @p mat_in has blocksize (NB x NB) +/// @pre @p mat_in has tilesize (NB x NB) +/// /// @param mat_out is the output matrix. Only tiles whose both row and col tile coords are in /// the range [i_begin,i_end) are accessed in write-only mode. -/// +/// @pre @p mat_out is not distributed +/// @pre @p mat_out has size (N x N) +/// @pre @p mat_out has blocksize (NB x NB) +/// @pre @p mat_out has tilesize (NB x NB) template void permute(SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out) { @@ -75,14 +87,28 @@ void permute(SizeType i_begin, SizeType i_end, Matrix& perms, /// @param sub_task_chain orders non-blocking collective calls used internally. If @tparam coord is Coord::Col, /// a row communicator pipeline is expected, otherwise if @tparam is Coord::Row a column communicator /// pipeline is expected. +/// /// @param perms is the index map of permutations represented as a local tiled column vector. Indices are in /// the range [0, n) where `n` is the global size of the submatrix (i.e. submatrix indices are used /// instead of the full matrix indices). Only tiles whose row tile coords are in the range /// [i_begin,i_end) are accessed in read-only mode. +/// @pre @p perms is not distributed +/// @pre @p perms has blocksize (NB x MB) +/// @pre @p perms has tilesize (NB x MB) +/// /// @param mat_in is the distributed input matrix. Only tiles whose both global row and col tile coords are in /// the range [i_begin,i_end) are accessed in readwrite-mode. +/// @pre @p mat_in is distributed according to @p grid +/// @pre @p mat_in has size (N x N) +/// @pre @p mat_in has blocksize (NB x NB) +/// @pre @p mat_in has tilesize (NB x NB) +/// /// @param mat_out is the distributed output matrix. Only tiles whose both global row and col tile coords are in /// the range [i_begin,i_end) are accessed in readwrite-mode. +/// @pre @p mat_out is distributed according to @p grid +/// @pre @p mat_out has size (N x N) +/// @pre @p mat_out has blocksize (NB x NB) +/// @pre @p mat_out has tilesize (NB x NB) /// /// Note: The Pipeline<> API allows to use permute() within other algorithms without having to clone communicators /// internally. @@ -122,7 +148,6 @@ void permute(comm::CommunicatorGrid grid, common::Pipeline& /// /// This overload clones the row communicator (if Coord::Col) or column communicator (if Coord::Row) of /// @p grid internally.
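As a concrete reference for the shapes required by the local `permute` overload documented above: a sketch assuming `<Backend, Device, T, coord>` template parameters and a `SizeType`-valued index matrix (both elided in this diff); `perms` is allocated here as an (N x 1) column vector, one index per row of the submatrix.

```cpp
#include <dlaf/permutations/general.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/types.h>

using dlaf::Backend;
using dlaf::Coord;
using dlaf::Device;
using dlaf::SizeType;

void permute_columns(const SizeType n, const SizeType nb) {
  // Index map: one permutation index per row of the submatrix.
  dlaf::matrix::Matrix<SizeType, Device::CPU> perms(dlaf::LocalElementSize(n, 1),
                                                    dlaf::TileElementSize(nb, 1));
  dlaf::matrix::Matrix<double, Device::CPU> mat_in(dlaf::LocalElementSize(n, n),
                                                   dlaf::TileElementSize(nb, nb));
  dlaf::matrix::Matrix<double, Device::CPU> mat_out(dlaf::LocalElementSize(n, n),
                                                    dlaf::TileElementSize(nb, nb));

  // ... fill perms with a permutation of [0, n) and mat_in with data ...

  // Permute the columns of the whole matrix: [i_begin, i_end) covers every tile row.
  const SizeType i_begin = 0;
  const SizeType i_end = mat_in.nrTiles().rows();
  dlaf::permutations::permute<Backend::MC, Device::CPU, double, Coord::Col>(i_begin, i_end, perms,
                                                                            mat_in, mat_out);
}
```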
-/// template void permute(comm::CommunicatorGrid grid, SizeType i_begin, SizeType i_end, Matrix& perms, Matrix& mat_in, Matrix& mat_out) { diff --git a/include/dlaf/sender/transform_mpi.h b/include/dlaf/sender/transform_mpi.h index 50f23cb0b7..c953b20c2f 100644 --- a/include/dlaf/sender/transform_mpi.h +++ b/include/dlaf/sender/transform_mpi.h @@ -34,7 +34,7 @@ void consumeCommunicatorWrapper(T&) {} /// /// Wrapper type around calls to MPI functions. Provides a call operator that /// creates an MPI request and passes it as the last argument to the provided -/// callable. The wrapper then waits for the the request to complete with +/// callable. The wrapper then waits for the request to complete with /// yield_while. /// /// This could in theory be a lambda inside transformMPI. However, clang at diff --git a/include/dlaf/solver/triangular.h b/include/dlaf/solver/triangular.h index f4e43e092d..b4087134a3 100644 --- a/include/dlaf/solver/triangular.h +++ b/include/dlaf/solver/triangular.h @@ -29,16 +29,21 @@ namespace dlaf { /// @param op specifies the form of op(A) to be used in the matrix multiplication: \a NoTrans, \a Trans, /// \a ConjTrans, /// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a NonUnit), +/// /// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper or /// the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the /// elements are not modified), +/// @pre @p mat_a is not distributed +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with the /// elements of the matrix X, -/// @pre mat_a has a square size, -/// @pre mat_a has a square block size, -/// @pre mat_a and mat_b have equal tile and block size, -/// @pre mat_a and mat_b are not distributed, -/// @pre mat_a and mat_b are multipliable. +/// @pre @p mat_b is not distributed +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) template void triangular_solver(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a, Matrix& mat_b) { @@ -101,16 +106,21 @@ void triangular_solver(blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag /// Trans, \a ConjTrans, /// @param diag specifies if the matrix A is assumed to be unit triangular (\a Unit) or not (\a /// NonUnit), +/// /// @param mat_a contains the triangular matrix A. Only the tiles of the matrix which contain the upper /// or the lower triangular part (depending on the value of uplo) are accessed in read-only mode (the /// elements are not modified), +/// @pre @p mat_a is distributed according to @p grid +/// @pre @p mat_a has size (N x N) +/// @pre @p mat_a has blocksize (NB x NB) +/// @pre @p mat_a has tilesize (NB x NB) +/// /// @param mat_b on entry it contains the matrix B, on exit the matrix elements are overwritten with /// the elements of the matrix X, -/// @pre matrix A has a square size, -/// @pre matrix A has a square block size, -/// @pre matrix A and matrix B have equal tile and block sizes, -/// @pre matrix A and matrix B are distributed according to the grid, -/// @pre matrix A and matrix B are multipliable.
+/// @pre @p mat_b is distributed according to @p grid +/// @pre @p mat_b has size (N x M) +/// @pre @p mat_b has blocksize (NB x MB) +/// @pre @p mat_b has tilesize (NB x MB) template void triangular_solver(comm::CommunicatorGrid grid, blas::Side side, blas::Uplo uplo, blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a,
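Finally, a usage sketch matching the corrected preconditions of the local `triangular_solver` above (side == Left, lower triangular, no transpose). As with the other sketches, the explicit template arguments and the matrix constructors are assumptions, since the diff elides the full signatures:

```cpp
#include <dlaf/solver/triangular.h>
#include <dlaf/matrix/matrix.h>
#include <dlaf/types.h>

using dlaf::Backend;
using dlaf::Device;
using dlaf::SizeType;

void solve_lower(const SizeType n, const SizeType m, const SizeType nb, const SizeType mb) {
  using MatrixT = dlaf::matrix::Matrix<double, Device::CPU>;
  MatrixT mat_a(dlaf::LocalElementSize(n, n), dlaf::TileElementSize(nb, nb));  // A: (N x N), blocksize (NB x NB)
  MatrixT mat_b(dlaf::LocalElementSize(n, m), dlaf::TileElementSize(nb, mb));  // B: (N x M), blocksize (NB x MB)

  // ... fill the lower triangle of mat_a (non-singular) and all of mat_b ...

  // Solves op(A) X = alpha B; the solution X overwrites mat_b.
  dlaf::triangular_solver<Backend::MC, Device::CPU, double>(blas::Side::Left, blas::Uplo::Lower,
                                                            blas::Op::NoTrans, blas::Diag::NonUnit,
                                                            1.0, mat_a, mat_b);
}
```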