diff --git a/examples/sparse/CMakeLists.txt b/examples/sparse/CMakeLists.txt index 5fc2eaf8..1ee36d54 100644 --- a/examples/sparse/CMakeLists.txt +++ b/examples/sparse/CMakeLists.txt @@ -2,6 +2,8 @@ add_executable(testPoisson2d EXCLUDE_FROM_ALL testPoisson2d.cpp) add_executable(testMMdouble EXCLUDE_FROM_ALL testMMdouble.cpp) add_executable(testPoisson3d EXCLUDE_FROM_ALL testPoisson3d.cpp) add_executable(testMixedPrecision EXCLUDE_FROM_ALL testMixedPrecision.cpp) +add_executable(testMixedPrecisionSymmetricPositiveDefinite EXCLUDE_FROM_ALL testMixedPrecisionSymmetricPositiveDefinite.cpp) +add_executable(testSymmetricPositiveDefinite EXCLUDE_FROM_ALL testSymmetricPositiveDefinite.cpp) add_executable(sexample EXCLUDE_FROM_ALL sexample.c) add_executable(dexample EXCLUDE_FROM_ALL dexample.c) add_executable(cexample EXCLUDE_FROM_ALL cexample.c) @@ -13,6 +15,8 @@ target_link_libraries(testPoisson2d strumpack) target_link_libraries(testMMdouble strumpack) target_link_libraries(testPoisson3d strumpack) target_link_libraries(testMixedPrecision strumpack) +target_link_libraries(testMixedPrecisionSymmetricPositiveDefinite strumpack) +target_link_libraries(testSymmetricPositiveDefinite strumpack) target_link_libraries(sexample strumpack) target_link_libraries(dexample strumpack) target_link_libraries(cexample strumpack) @@ -24,6 +28,8 @@ add_dependencies(examples testMMdouble testPoisson3d testMixedPrecision + testMixedPrecisionSymmetricPositiveDefinite + testSymmetricPositiveDefinite sexample dexample cexample diff --git a/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp b/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..d2476f68 --- /dev/null +++ b/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp @@ -0,0 +1,181 @@ +// +// Created by tingxuan on 2023/12/24. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). 
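 *
 * (Note: the two drivers added to the CMakeLists above are EXCLUDE_FROM_ALL,
 * so they are only built on request. A minimal sketch of building and running
 * this one, with a placeholder path to a symmetric positive definite matrix
 * in Matrix Market format:
 *
 *   cmake --build . --target testMixedPrecisionSymmetricPositiveDefinite
 *   ./examples/sparse/testMixedPrecisionSymmetricPositiveDefinite /path/to/spd_matrix.mtx
 * )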
+ * + */ +#include +#include +#include +#include + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + + +/** + * Test the STRUMPACK sparse solver, and the mixed precision sparse + * solver. + * + * For working_t == float, the mixed precision solver will + * compute the factorization in single precision, but do iterative + * refinement in double precision to give a more accurate results than + * the standard single precision solver. + * + * For working_t == double, the mixed precision solver will compute + * the factorization in single precision and perform the iterative + * refinement in double precision. If the problem is not too + * ill-conditioned, this should be about as accurate, and about twice + * as fast as the standard double precision solver. The speedup + * depends on the relative cost of the sparse triangular solver phase + * compared to the sparse LU factorization phase. + * + * TODO long double + */ +template +void test(CSRMatrix& A, + DenseMatrix& b, DenseMatrix& x_exact, + int argc, char* argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " << + (std::is_same::value ? "single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + } + + std::cout << std::endl; +} + + +int main(int argc, char* argv[]) { + + std::cout << "long double size in bytes: " + << sizeof(long double) << " " + << std::endl; + + std::string f; + if (argc > 1) f = std::string(argv[1]); + + CSRMatrix A_d; + A_d.read_matrix_market(f); + auto A_f = cast_matrix(A_d); + + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + + + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + 
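  // A minimal sketch of this setup step (the spmv call is an assumption about
  // the CSRMatrix interface): fill x_true_d with uniform random entries as in
  // LAWN 165, then form the right-hand side with a sparse matrix-vector
  // product so that x_true_d is the exact solution of A x = b:
  //
  //   for (int j=0; j<m; j++)
  //     for (int i=0; i<N; i++)
  //       x_true_d(i, j) = dist(gen);
  //   A_d.spmv(x_true_d, b_d);   // b = A * x_exact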
for (int j=0; j x(N, m); + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.set_matrix(A_d); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solve(b_d, x); + + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A_d.max_scaled_residual(x.data(), b_d.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_true_d.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_true_d.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + + } + + // cast RHS and true solution to single precision + DenseMatrix b_f(N, m), x_true_f(N, m); + copy(x_true_d, x_true_f); + copy(b_d, b_f); + + test(A_d, b_d, x_true_d, argc, argv); + test(A_f, b_f, x_true_f, argc, argv); + + return 0; +} diff --git a/examples/sparse/testSymmetricPositiveDefinite.cpp b/examples/sparse/testSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..39383e01 --- /dev/null +++ b/examples/sparse/testSymmetricPositiveDefinite.cpp @@ -0,0 +1,181 @@ +// +// Created by tingxuan on 2023/12/24. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include +#include +#include +#include + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + + +/** + * Test the STRUMPACK sparse solver, and the mixed precision sparse + * solver. + * + * For working_t == float, the mixed precision solver will + * compute the factorization in single precision, but do iterative + * refinement in double precision to give a more accurate results than + * the standard single precision solver. + * + * For working_t == double, the mixed precision solver will compute + * the factorization in single precision and perform the iterative + * refinement in double precision. 
If the problem is not too + * ill-conditioned, this should be about as accurate, and about twice + * as fast as the standard double precision solver. The speedup + * depends on the relative cost of the sparse triangular solver phase + * compared to the sparse LU factorization phase. + * + * TODO long double + */ +template +void test(CSRMatrix& A, + DenseMatrix& b, DenseMatrix& x_exact, + int argc, char* argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " << + (std::is_same::value ? "single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + } + + std::cout << std::endl; +} + + +int main(int argc, char* argv[]) { + + CSRMatrix A_d; + if(argc > 1){ + A_d.read_matrix_market(argv[1]); + }else{ + int n =3; + int ptr[4] = {0,2,3,5}; + int Index[5] = {0,2,1,0,2}; + double val[5] = {2.1,1,3.5,1,5.2}; + A_d = CSRMatrix(n, ptr, Index, val); + } + + + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + + + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + for (int j=0; j x(N, m); + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.set_matrix(A_d); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solve(b_d, x); + + std::cout<<"x_true_d="; + for(int r=0; r> - (s, sbegin, send, upd); + if (is_symmetric(opts) && is_positive_definite(opts)){ + front.reset + (new FrontalMatrixGPUSymmetricPositiveDefinite(s, sbegin, send, upd)); + } else { + front.reset + (new FrontalMatrixGPU(s, sbegin, send, upd)); + } #endif #endif if (root) fc.dense++; diff --git 
a/src/sparse/fronts/FrontFactory.hpp b/src/sparse/fronts/FrontFactory.hpp index f5e3bc57..d5749ca0 100644 --- a/src/sparse/fronts/FrontFactory.hpp +++ b/src/sparse/fronts/FrontFactory.hpp @@ -65,6 +65,22 @@ namespace strumpack { #endif } + template bool is_symmetric + (const SPOptions& opts) { +#if defined(STRUMPACK_USE_CUDA) || defined(STRUMPACK_USE_HIP) || defined(STRUMPACK_USE_SYCL) + return opts.use_symmetric() && opts.compression() == CompressionType::NONE; +#endif + return false; + } + + template bool is_positive_definite + (const SPOptions& opts) { +#if defined(STRUMPACK_USE_CUDA) || defined(STRUMPACK_USE_HIP) || defined(STRUMPACK_USE_SYCL) + return opts.use_positive_definite(); +#endif + return false; + } + template bool is_HSS (int dsep, int dupd, const SPOptions& opts) { return opts.compression() == CompressionType::HSS && diff --git a/src/sparse/fronts/FrontalMatrix.hpp b/src/sparse/fronts/FrontalMatrix.hpp index 97ef067b..d48ba00f 100644 --- a/src/sparse/fronts/FrontalMatrix.hpp +++ b/src/sparse/fronts/FrontalMatrix.hpp @@ -107,6 +107,13 @@ namespace strumpack { VectorPool workspace; return factor(A, opts, workspace, etree_level, task_depth); } + + // TODO(Jie): make it pure virtual + virtual ReturnCode + multifrontal_factorization_symmetric(const SpMat_t& A, const Opts_t& opts, + int etree_level=0, int task_depth=0) { abort(); }; + + virtual ReturnCode factor(const SpMat_t& A, const Opts_t& opts, VectorPool& workspace, int etree_level=0, int task_depth=0) { @@ -197,6 +204,15 @@ namespace strumpack { int task_depth) { extend_add_to_dense(paF11, paF12, paF21, paF22, p, task_depth); } + // TODO(Jie): make it pure virtual + /* + * This is for symmetric + */ + virtual void + extend_add_to_dense(DenseM_t& paF11, + DenseM_t& paF21, DenseM_t& paF22, + const FrontalMatrix* p, + int task_depth) { abort(); } virtual void extend_add_to_blr(BLRM_t& paF11, BLRM_t& paF12, diff --git a/src/sparse/fronts/FrontalMatrixGPUKernels.hpp b/src/sparse/fronts/FrontalMatrixGPUKernels.hpp index ffd1633d..3b9942f1 100644 --- a/src/sparse/fronts/FrontalMatrixGPUKernels.hpp +++ b/src/sparse/fronts/FrontalMatrixGPUKernels.hpp @@ -83,10 +83,18 @@ namespace strumpack { template void assemble(unsigned int, AssembleData*, AssembleData*); + template void + assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template::value_type> void factor_block_batch(unsigned int, FrontData*, bool, real_t, int*); + template::value_type> + void factor_symmetric_block_batch(unsigned int count, FrontData* dat, + bool replace, real_t thresh, int* dinfo); + template::value_type> void replace_pivots(int, T*, real_t, gpu::Stream* = nullptr); diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..693c76a2 --- /dev/null +++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp @@ -0,0 +1,1094 @@ +// +// Created by tingxuan on 23-6-19. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. 
Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include + +#include "FrontalMatrixGPUSymmetricPositiveDefinite.hpp" +#include "FrontalMatrixGPUKernels.hpp" + +#if defined(STRUMPACK_USE_MPI) +#include "ExtendAdd.hpp" +#include "FrontalMatrixMPI.hpp" +#endif + +namespace strumpack { + + template class LevelInfoUnified { + using F_t = FrontalMatrix; + using FG_t = FrontalMatrixGPUSymmetricPositiveDefinite; + using DenseMW_t = DenseMatrixWrapper; + using SpMat_t = CompressedSparseMatrix; + + public: + LevelInfoUnified() {} + + LevelInfoUnified(const std::vector& fronts, gpu::Handle& handle, + int max_streams, const SpMat_t* A) { + if (!A->symm_sparse()) { + f.reserve(fronts.size()); + for (auto& F : fronts) + f.push_back(dynamic_cast(F)); + std::size_t max_dsep = 0; + // This pragma causes "internal error: null pointer" on the + // intel compiler. It seems to be because the variables are + // class members. + // #pragma omp parallel for reduction(+:L_size,U_size,Schur_size,piv_size,total_upd_size,N8,N16,N24,N32,factors_small) reduction(max:max_dsep) + for (std::size_t i=0; idim_sep(); + const std::size_t dupd = F->dim_upd(); + diagonal_size += dsep * dsep; + off_diagonal_size += 2 * dsep * dupd; + L_size += dsep*(dsep + dupd); + U_size += dsep*dupd; + Schur_size += dupd*dupd; + piv_size += dsep; + total_upd_size += dupd; + if (dsep <= 32) { + factors_diagonal_small += dsep * dsep; + factors_off_diagonal_small += 2 * dsep * dupd; + factors_small += dsep*(dsep + 2*dupd); + if (dsep <= 8) N8++; + else if (dsep <= 16) N16++; + else if (dsep <= 24) N24++; + else N32++; + } + if (dsep > max_dsep) max_dsep = dsep; + } + small_fronts = N8 + N16 + N24 + N32; + if (small_fronts && small_fronts != f.size()) + std::partition + (f.begin(), f.end(), [](const FG_t* const& a) -> bool { + return a->dim_sep() <= 32; }); + { + auto N = f.size(); + elems11.resize(N+1); + elems12.resize(N+1); + elems21.resize(N+1); + Isize.resize(N+1); +#pragma omp parallel for + for (std::size_t i=0; icount_front_elements + (F.sep_begin(), F.sep_end(), F.upd(), + elems11[i+1], elems12[i+1], elems21[i+1]); + if (F.lchild_) Isize[i+1] += F.lchild_->dim_upd(); + if (F.rchild_) Isize[i+1] += F.rchild_->dim_upd(); + } + for (std::size_t i=0; i(handle, max_dsep); + getrf_work_size = gpu::potrf_buffersize(handle, UpLo::L, max_dsep); + + factor_bytes = sizeof(scalar_t) * factor_size; + factor_bytes = gpu::round_up(factor_bytes); + + work_bytes = sizeof(scalar_t) * (Schur_size + getrf_work_size * max_streams); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(int) * (piv_size + f.size()); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(gpu::FrontData) * (N8 + N16 + N24 + N32); + 
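        // At this point work_bytes covers, in order: the Schur complements
        // (F22) of all fronts in this level, one potrf work buffer per
        // stream, the pivot and per-front info integers, and the FrontData
        // descriptors for the small (dim_sep <= 32) fronts.
        // set_work_pointers() later carves these sub-buffers out of a single
        // device allocation, which is why each segment is padded with
        // gpu::round_up().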
work_bytes = gpu::round_up(work_bytes); + + ea_bytes = sizeof(gpu::AssembleData) * f.size(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(std::size_t) * Isize.back(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(Triplet) * (elems11.back() + elems12.back() + elems21.back()); + ea_bytes = gpu::round_up(ea_bytes); + } else { + f.reserve(fronts.size()); + for (auto& F : fronts) + f.push_back(dynamic_cast(F)); + std::size_t max_dsep = 0; + // This pragma causes "internal error: null pointer" on the + // intel compiler. It seems to be because the variables are + // class members. + // #pragma omp parallel for reduction(+:L_size,U_size,Schur_size,piv_size,total_upd_size,N8,N16,N24,N32,factors_small) reduction(max:max_dsep) + for (std::size_t i=0; idim_sep(); + const std::size_t dupd = F->dim_upd(); + diagonal_size += dsep * dsep; + off_diagonal_size += dsep * dupd; + L_size += dsep*dsep; + U_size += dsep*dupd; + Schur_size += dupd*dupd; + piv_size += dsep; + total_upd_size += dupd; + if (dsep <= 32) { + factors_diagonal_small += dsep * dsep; + factors_off_diagonal_small += dsep * dupd; + factors_small += dsep*(dsep + dupd); + if (dsep <= 8) N8++; + else if (dsep <= 16) N16++; + else if (dsep <= 24) N24++; + else N32++; + } + if (dsep > max_dsep) max_dsep = dsep; + } + small_fronts = N8 + N16 + N24 + N32; + if (small_fronts && small_fronts != f.size()) + std::partition + (f.begin(), f.end(), [](const FG_t* const& a) -> bool { + return a->dim_sep() <= 32; }); + { + auto N = f.size(); + elems11.resize(N+1); + elems21.resize(N+1); + Isize.resize(N+1); +#pragma omp parallel for + for (std::size_t i=0; icount_front_elements_symmetric + (F.sep_begin(), F.sep_end(), F.upd(), + elems11[i+1], elems21[i+1]); + if (F.lchild_) Isize[i+1] += F.lchild_->dim_upd(); + if (F.rchild_) Isize[i+1] += F.rchild_->dim_upd(); + } + for (std::size_t i=0; i(handle, max_dsep); + getrf_work_size = gpu::potrf_buffersize(handle, UpLo::L, max_dsep); + + factor_bytes = sizeof(scalar_t) * factor_size; + factor_bytes = gpu::round_up(factor_bytes); + + work_bytes = sizeof(scalar_t) * (Schur_size + getrf_work_size * max_streams); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(int) * (piv_size + f.size()); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(gpu::FrontData) * (N8 + N16 + N24 + N32); + work_bytes = gpu::round_up(work_bytes); + + ea_bytes = sizeof(gpu::AssembleData) * f.size(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(std::size_t) * Isize.back(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(Triplet) * (elems11.back() + elems21.back()); + ea_bytes = gpu::round_up(ea_bytes); + } + } + + void print_info(int l, int lvls) { + std::cout << "# level " << l << " of " << lvls + << " has " << f.size() << " nodes and " + << N8 << " <=8, " << N16 << " <=16, " + << N24 << " <=24, " << N32 << " <=32, needs " + << factor_bytes / 1.e6 + << " MB for factors, " + << Schur_size * sizeof(scalar_t) / 1.e6 + << " MB for Schur complements" << std::endl; + } + + void flops(long long& level_flops, long long& small_flops) { + level_flops = small_flops = 0; + auto N = f.size(); +#pragma omp parallel for reduction(+: level_flops, small_flops) + for (std::size_t i=0; iF11_) + + gemm_flops(Trans::N, Trans::N, scalar_t(-1.), + F->F21_, F->F12_, scalar_t(1.)) + + trsm_flops(Side::L, scalar_t(1.), F->F11_, F->F12_) + + trsm_flops(Side::R, scalar_t(1.), F->F11_, F->F21_); + level_flops += flops; + if (F->dim_sep() <= 32) + small_flops += flops; + } + } + + /* + * 
first store L factors, then U factors, + * F11, F21, F11, F21, ..., F12, F12, ... + */ + void set_factor_pointers(scalar_t* factors) { + for (auto F : f) { + const std::size_t dsep = F->dim_sep(); + const std::size_t dupd = F->dim_upd(); + F->F11_ = DenseMW_t(dsep, dsep, factors, dsep); factors += dsep*dsep; + F->F12_ = DenseMW_t(dsep, dupd, factors, dsep); factors += dsep*dupd; + F->F21_ = DenseMW_t(dupd, dsep, factors, dupd); factors += dupd*dsep; + } + } + + void set_factor_pointers(scalar_t* factors_diagonal, scalar_t* factors_off_diagonal) { + for (auto F : f) { + const std::size_t dsep = F->dim_sep(); + const std::size_t dupd = F->dim_upd(); + F->F11_ = DenseMW_t(dsep, dsep, factors_diagonal, dsep); factors_diagonal += dsep*dsep; + F->F21_ = DenseMW_t(dupd, dsep, factors_off_diagonal, dupd); factors_off_diagonal += dupd*dsep; + } + } + + void set_pivot_pointers(int* pmem) { + for (auto F : f) { + F->piv_ = pmem; + pmem += F->dim_sep(); + } + } + + void set_work_pointers(void* wmem, int max_streams) { + auto schur = gpu::aligned_ptr(wmem); + for (auto F : f) { + const int dupd = F->dim_upd(); + if (dupd) { + F->F22_ = DenseMW_t(dupd, dupd, schur, dupd); + schur += dupd*dupd; + } + } + dev_getrf_work = schur; + schur += max_streams * getrf_work_size; + auto imem = gpu::aligned_ptr(schur); + for (auto F : f) { + F->piv_ = imem; + imem += F->dim_sep(); + } + dev_getrf_err = imem; imem += f.size(); + auto fdat = gpu::aligned_ptr>(imem); + f8 = fdat; fdat += N8; + f16 = fdat; fdat += N16; + f24 = fdat; fdat += N24; + f32 = fdat; fdat += N32; + } + + int align = 0; + std::vector f; + std::size_t diagonal_size = 0, off_diagonal_size = 0, L_size = 0, U_size = 0, + factors_diagonal_small = 0, factors_off_diagonal_small = 0, + factor_size = 0, factors_small = 0, Schur_size = 0, piv_size = 0, + total_upd_size = 0; + std::size_t N8 = 0, N16 = 0, N24 = 0, N32 = 0, small_fronts = 0; + std::size_t work_bytes = 0, ea_bytes = 0, factor_bytes = 0; + std::vector elems11, elems12, elems21, Isize; + scalar_t* dev_getrf_work = nullptr; + int* dev_getrf_err = nullptr; + int getrf_work_size = 0; + gpu::FrontData *f8 = nullptr, *f16 = nullptr, + *f24 = nullptr, *f32 = nullptr; + }; + + + template + FrontalMatrixGPUSymmetricPositiveDefinite::FrontalMatrixGPUSymmetricPositiveDefinite + (integer_t sep, integer_t sep_begin, integer_t sep_end, + std::vector& upd) + : F_t(nullptr, nullptr, sep, sep_begin, sep_end, upd) {} + + template + FrontalMatrixGPUSymmetricPositiveDefinite::~FrontalMatrixGPUSymmetricPositiveDefinite() { +#if defined(STRUMPACK_COUNT_FLOPS) + const std::size_t dupd = dim_upd(); + const std::size_t dsep = dim_sep(); + STRUMPACK_SUB_MEMORY(dsep*(dsep+2*dupd)*sizeof(scalar_t)); +#endif + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::release_work_memory() { + F22_.clear(); + host_Schur_.reset(nullptr); + } + +#if defined(STRUMPACK_USE_MPI) + template void + FrontalMatrixGPUSymmetricPositiveDefinite::extend_add_copy_to_buffers + (std::vector>& sbuf, + const FrontalMatrixMPI* pa) const { + ExtendAdd::extend_add_seq_copy_to_buffers + (F22_, sbuf, pa, this); + } +#endif + + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::extend_add_to_dense + (DenseM_t& paF11, DenseM_t& paF21, DenseM_t& paF22, + const F_t* p, int task_depth) { + const std::size_t pdsep = paF11.rows(); + const std::size_t dupd = dim_upd(); + std::size_t upd2sep; + auto I = this->upd_to_parent(p, upd2sep); +#if defined(STRUMPACK_USE_OPENMP_TASKLOOP) +#pragma omp taskloop default(shared) grainsize(64) \ + 
if(task_depth < params::task_recursion_cutoff_level) +#endif + for (std::size_t c=0; c()?2:1) * dupd * dupd); + STRUMPACK_FULL_RANK_FLOPS((is_complex()?2:1) * dupd * dupd); + release_work_memory(); + } + + template + std::size_t peak_device_memory + (const std::vector>& ldata) { + std::size_t peak_dmem = 0; + for (std::size_t l=0; l void + FrontalMatrixGPUSymmetricPositiveDefinite::front_assembly + (const SpMat_t& A, LInfo_t& L, char* hea_mem, char* dea_mem) { + using Trip_t = Triplet; + auto N = L.f.size(); + auto hasmbl = gpu::aligned_ptr>(hea_mem); + auto Iptr = gpu::aligned_ptr(hasmbl + N); + auto e11 = gpu::aligned_ptr(Iptr + L.Isize.back()); + auto e21 = e11 + L.elems11.back(); + auto dasmbl = gpu::aligned_ptr>(dea_mem); + auto dIptr = gpu::aligned_ptr(dasmbl + N); + auto de11 = gpu::aligned_ptr(dIptr + L.Isize.back()); + auto de21 = de11 + L.elems11.back(); + +#pragma omp parallel for + for (std::size_t n=0; n + (f.dim_sep(), f.dim_upd(), f.F11_.data(), nullptr, + f.F21_.data(), f.F22_.data(), + L.elems11[n+1]-L.elems11[n], 0, + L.elems21[n+1]-L.elems21[n], + de11+L.elems11[n], nullptr, de21+L.elems21[n]); + auto fIptr = Iptr + L.Isize[n]; + auto fdIptr = dIptr + L.Isize[n]; + if (f.lchild_) { + auto c = dynamic_cast(f.lchild_.get()); + hasmbl[n].set_ext_add_left(c->dim_upd(), c->F22_.data(), fdIptr); + c->upd_to_parent(&f, fIptr); + fIptr += c->dim_upd(); + fdIptr += c->dim_upd(); + } + if (f.rchild_) { + auto c = dynamic_cast(f.rchild_.get()); + hasmbl[n].set_ext_add_right(c->dim_upd(), c->F22_.data(), fdIptr); + c->upd_to_parent(&f, fIptr); + } + } + gpu::copy_host_to_device(dea_mem, hea_mem, L.ea_bytes); + gpu::assemble_symmetric(N, hasmbl, dasmbl); + } + + // TODO(Jie): fix for un-symmetric + template long long + FrontalMatrixGPUSymmetricPositiveDefinite::dense_node_factor_nonzeros() const { + long long dsep = dim_sep(), dupd = dim_upd(); + return dsep * (dsep + dupd); + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::factor_small_fronts + (LInfo_t& L, gpu::FrontData* fdata, int* dinfo, + const SPOptions& opts) { + if (!L.small_fronts) return; + for (std::size_t n=0, n8=0, n16=L.N8, n24=n16+L.N16, n32=n24+L.N24; + n + t(dsep, f.dim_upd(), f.F11_.data(), nullptr, + f.F21_.data(), f.F22_.data(), f.piv_); + if (dsep <= 8) fdata[n8++] = t; + else if (dsep <= 16) fdata[n16++] = t; + else if (dsep <= 24) fdata[n24++] = t; + else fdata[n32++] = t; + } + gpu::copy_host_to_device(L.f8, fdata, L.small_fronts); + auto replace = opts.replace_tiny_pivots(); + auto thresh = opts.pivot_threshold(); + gpu::factor_symmetric_block_batch(L.N8, L.f8, replace, thresh, dinfo); + gpu::factor_symmetric_block_batch(L.N16, L.f16, replace, thresh, dinfo+L.N8); + gpu::factor_symmetric_block_batch(L.N24, L.f24, replace, thresh, dinfo+L.N8+L.N16); + gpu::factor_symmetric_block_batch(L.N32, L.f32, replace, thresh, dinfo+L.N8+L.N16+L.N24); + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::split_smaller + (const SpMat_t& A, const SPOptions& opts, + int etree_level, int task_depth) { + if (opts.verbose()) + std::cout << "# Factorization does not fit in GPU memory, " + "splitting in smaller traversals." 
<< std::endl; + const std::size_t dupd = dim_upd(), dsep = dim_sep(); + ReturnCode err_code = ReturnCode::SUCCESS; + if (lchild_) { + auto el = lchild_->multifrontal_factorization_symmetric + (A, opts, etree_level+1, task_depth); + if (el != ReturnCode::SUCCESS) err_code = el; + } + if (rchild_) { + auto er = rchild_->multifrontal_factorization_symmetric + (A, opts, etree_level+1, task_depth); + if (er != ReturnCode::SUCCESS) err_code = er; + } + STRUMPACK_ADD_MEMORY(dsep*(dsep+dupd)*sizeof(scalar_t)); + STRUMPACK_ADD_MEMORY(dupd*dupd*sizeof(scalar_t)); + host_factors_diagonal_.reset(new scalar_t[dsep*dsep]); + host_factors_off_diagonal_.reset(new scalar_t[dsep * dupd]); + host_Schur_.reset(new scalar_t[dupd*dupd]); + { + auto fmem_diagonal = host_factors_diagonal_.get(); + auto fmem_off_diagonal = host_factors_off_diagonal_.get(); + F11_ = DenseMW_t(dsep, dsep, fmem_diagonal, dsep); + F21_ = DenseMW_t(dupd, dsep, fmem_off_diagonal, dupd); + } + F22_ = DenseMW_t(dupd, dupd, host_Schur_.get(), dupd); + F11_.zero(); + F21_.zero(); F22_.zero(); + A.extract_front + (F11_, F12_, F21_, this->sep_begin_, this->sep_end_, + this->upd_, task_depth); + if (lchild_) { +#pragma omp parallel +#pragma omp single + lchild_->extend_add_to_dense(F11_, F21_, F22_, this, 0); + } + if (rchild_) { +#pragma omp parallel +#pragma omp single + rchild_->extend_add_to_dense(F11_, F21_, F22_, this, 0); + } + // TaskTimer tl(""); + // tl.start(); + if (dsep) { + gpu::Handle sh; + auto workSize = gpu::potrf_buffersize(sh, UpLo::L, dsep); + gpu::DeviceMemory dm11(dsep*dsep + workSize); + gpu::DeviceMemory dpiv(dsep+1); // and ierr + DenseMW_t dF11(dsep, dsep, dm11, dsep); + gpu::copy_host_to_device(dF11, F11_); + gpu::potrf(sh, UpLo::L, dF11, dm11 + dsep*dsep, workSize, dpiv+dsep); + if (opts.replace_tiny_pivots()) + gpu::replace_pivots + (F11_.rows(), dF11.data(), opts.pivot_threshold()); + int info; + gpu::copy_device_to_host(&info, dpiv+dsep, 1); + if (info) err_code = ReturnCode::ZERO_PIVOT; + pivot_mem_.resize(dsep); + piv_ = pivot_mem_.data(); + gpu::copy_device_to_host(piv_, dpiv.as(), dsep); + gpu::copy_device_to_host(F11_, dF11); + if (dupd) { + gpu::Handle bh; + gpu::DeviceMemory dm21(dsep*dupd); + DenseMW_t dF21(dupd, dsep, dm21, dupd); + gpu::copy_host_to_device(dF21, F21_); + gpu::trsm(bh, Side::R, UpLo::L, Trans::T, Diag::N, scalar_t(1.), dF11, dF21); + gpu::copy_device_to_host(F21_, dF21); + dm11.release(); + gpu::DeviceMemory dm22((dsep+dupd)*dupd); + DenseMW_t dF22(dupd, dupd, dm22, dupd); + gpu::copy_host_to_device(dF22, host_Schur_.get()); + gpu::syrk(bh, UpLo::L, Trans::N, + scalar_t(-1.), dF21, scalar_t(1.), dF22); + gpu::copy_device_to_host(host_Schur_.get(), dF22); + } + } + // count flops + auto level_flops = LU_flops(F11_) + + gemm_flops(Trans::N, Trans::N, scalar_t(-1.), F21_, F12_, scalar_t(1.)) + + trsm_flops(Side::L, scalar_t(1.), F11_, F21_); + STRUMPACK_FULL_RANK_FLOPS(level_flops); + // if (opts.verbose()) { + // auto level_time = tl.elapsed(); + // std::cout << "# GPU Factorization complete, took: " + // << level_time << " seconds, " + // << level_flops / 1.e9 << " GFLOPS, " + // << (float(level_flops) / level_time) / 1.e9 + // << " GFLOP/s" << std::endl; + // } + return err_code; + } + + template + ReturnCode FrontalMatrixGPUSymmetricPositiveDefinite::multifrontal_factorization_symmetric( + const SpMat_t& A, + const SPOptions& opts, + int etree_level, int task_depth) { + ReturnCode err_code = ReturnCode::SUCCESS; + const int max_streams = opts.gpu_streams(); + std::vector 
handles(max_streams); + const int lvls = this->levels(); + std::vector ldata(lvls); + { + std::vector> fp(lvls); + try { + this->get_level_fronts_gpu(fp); + } catch (...) { + return split_smaller(A, opts, etree_level, task_depth); + } + for (int l=lvls-1; l>=0; l--) + ldata[l] = LInfo_t(fp[l], handles[0], max_streams, &A); + } + auto peak_dmem = peak_device_memory(ldata); + if (peak_dmem >= 0.9 * gpu::available_memory()) + return split_smaller(A, opts, etree_level, task_depth); + + std::vector streams(max_streams); + gpu::Stream copy_stream; + for (int i=0; i=0; l--) { + auto& L = ldata[l]; + max_small_fronts = std::max(max_small_fronts, L.N8+L.N16+L.N24+L.N32); + for (auto& f : L.f) { + const std::size_t dsep = f->dim_sep(); + const std::size_t dupd = f->dim_upd(); + std::size_t fs = dsep*(dsep + dupd); + max_pinned = std::max(max_pinned, fs); + } + max_pinned = std::max(max_pinned, L.factors_small); + } + gpu::HostMemory pinned(2*max_pinned); + gpu::HostMemory> fdata(max_small_fronts); + std::size_t peak_hea_mem = 0; + for (int l=lvls-1; l>=0; l--) + peak_hea_mem = std::max(peak_hea_mem, ldata[l].ea_bytes); + gpu::HostMemory hea_mem(peak_hea_mem); + gpu::DeviceMemory all_dmem(peak_dmem); + if (opts.verbose()) { + std::size_t factor_bytes = 0, ea_bytes = 0, work_bytes = 0; + for (auto &l: ldata) { + factor_bytes = std::max(l.factor_bytes, factor_bytes); + ea_bytes = std::max(l.ea_bytes, ea_bytes); + work_bytes = std::max(l.work_bytes, work_bytes); + } + + std::cout << "# - working space memory (host) = " << double(peak_hea_mem) / 1024 / 1024 << " MB" + << std::endl; + printf("# - working space memory (device) = factor + ea + working = %f MB + %f MB + %f MB = %f MB\n", + double(factor_bytes) / 1024 / 1024, double(ea_bytes) / 1024 / 1024, double(work_bytes) / 1024 / 1024, + double(peak_dmem) / 1024 / 1024); + std::cout << "# - working space memory (pinned) = " + << double(2 * max_pinned) * sizeof(scalar_t) / 1024 / 1024 << " MB" << std::endl; + } + char* old_work = nullptr; + for (int l=lvls-1; l>=0; l--) { + // TaskTimer tl(""); + // tl.start(); + LInfo_t& L = ldata[l]; + // if (opts.verbose()) L.print_info(l, lvls); + try { + char *work_mem = nullptr, *dea_mem = nullptr; + scalar_t* dev_factors = nullptr; + if (l % 2) { + work_mem = all_dmem; + dea_mem = work_mem + L.work_bytes; + dev_factors = gpu::aligned_ptr(dea_mem + L.ea_bytes); + } else { + work_mem = all_dmem + peak_dmem - L.work_bytes; + dea_mem = work_mem - L.ea_bytes; + dev_factors = gpu::aligned_ptr(dea_mem - L.factor_bytes); + } + gpu::memset(work_mem, 0, L.Schur_size); + gpu::memset(dev_factors, 0, L.factor_size); +// L.set_factor_pointers(dev_factors); + L.set_factor_pointers(dev_factors, dev_factors + L.diagonal_size); + L.set_work_pointers(work_mem, max_streams); + old_work = work_mem; + + // default stream + gpu_check(cudaDeviceSynchronize()); + front_assembly(A, L, hea_mem, dea_mem); + gpu::Event e_assemble; + e_assemble.record(); + gpu_check(cudaDeviceSynchronize()); + + // default stream + factor_small_fronts(L, fdata, L.dev_getrf_err, opts); + gpu::Event e_small; + e_small.record(); + + for (auto& s : streams) + e_assemble.wait(s); + + // larger fronts in multiple streams. 
Copy back in nchunks + // chunks, but a single chunk cannot be larger than the pinned + // buffer + const int nchunks = 16; + std::size_t Bf = (L.f.size()-L.small_fronts + nchunks - 1) / nchunks; + std::vector chunks; +// std::vector factors_chunk; + std::vector factors_diagonal_chunk, factors_off_diagonal_chunk; + for (std::size_t n=L.small_fronts, fc=0, c=0, fdc = 0, fodc = 0; n max_pinned) { + chunks.push_back(c); +// factors_chunk.push_back(fc); + factors_diagonal_chunk.push_back(fdc); + factors_off_diagonal_chunk.push_back(fodc); + c = fc = 0; + fdc = fodc = 0; + } + c++; + fc += size_front; + fdc += size_factors_diagonal; + fodc += size_factors_off_diagonal; + if (n == L.f.size()-1) { // final chunk + chunks.push_back(c); +// factors_chunk.push_back(fc); + factors_diagonal_chunk.push_back(fdc); + factors_off_diagonal_chunk.push_back(fodc); + } + } + + e_small.wait(copy_stream); +// gpu::copy_device_to_host_async +// (pinned, dev_factors, L.factors_small, copy_stream)); + gpu::copy_device_to_host_async + (pinned, dev_factors, L.factors_diagonal_small, copy_stream); + gpu::copy_device_to_host_async + (pinned + L.factors_diagonal_small, + dev_factors + L.diagonal_size, + L.factors_off_diagonal_small, copy_stream); + + STRUMPACK_ADD_MEMORY(L.factor_bytes); +// L.f[0]->host_factors_.reset(new scalar_t[L.factor_size]); +// scalar_t* host_factors = L.f[0]->host_factors_.get(); + L.f[0]->host_factors_diagonal_.reset(new scalar_t[L.diagonal_size]); + L.f[0]->host_factors_off_diagonal_.reset(new scalar_t[L.off_diagonal_size]); + scalar_t* host_factors_diagonal = L.f[0]->host_factors_diagonal_.get(); + scalar_t* host_factors_off_diagonal = L.f[0]->host_factors_off_diagonal_.get(); + copy_stream.synchronize(); +//#pragma omp parallel for +// for (std::size_t i=0; i(), + pinned.template as() + max_pinned}; + std::vector events(chunks.size()); + + for (std::size_t c=0, n=L.small_fronts; c +// (pin[c % 2], f.F11_.data(), +// factors_chunk[c], copy_stream); + auto fdc = factors_diagonal_chunk[c]; + auto fodc = factors_off_diagonal_chunk[c]; + gpu::copy_device_to_host_async + (pin[c % 2], f.F11_.data(), + fdc, copy_stream); + gpu::copy_device_to_host_async + (pin[c % 2] + fdc, f.F21_.data(), + fodc, copy_stream); + } + } + } + copy_stream.synchronize(); +// auto fc = factors_chunk.back(); +//#pragma omp parallel for +// for (std::size_t i=0; ipivot_mem_.resize(L.piv_size); + copy_stream.synchronize(); +// gpu::copy_device_to_host +// (L.f[0]->pivot_mem_.data(), L.f[0]->piv_, L.piv_size); +// L.set_factor_pointers(L.f[0]->host_factors_.get(); + L.set_factor_pointers(L.f[0]->host_factors_diagonal_.get(), L.f[0]->host_factors_off_diagonal_.get()); +// L.set_pivot_pointers(L.f[0]->pivot_mem_.data()); + + std::vector getrf_infos(L.f.size()); + gpu::copy_device_to_host + (getrf_infos.data(), L.dev_getrf_err, L.f.size()); + for (auto ierr : getrf_infos) + if (ierr) { + err_code = ReturnCode::ZERO_PIVOT; + break; + } + } catch (const std::bad_alloc& e) { + std::cerr << "Out of memory" << std::endl; + abort(); + } + long long level_flops, small_flops; + L.flops(level_flops, small_flops); + STRUMPACK_FULL_RANK_FLOPS(level_flops); + STRUMPACK_FLOPS(small_flops); + // if (opts.verbose()) { + // auto level_time = tl.elapsed(); + // std::cout << "# GPU Factorization complete, took: " + // << level_time << " seconds, " + // << level_flops / 1.e9 << " GFLOPS, " + // << (float(level_flops) / level_time) / 1.e9 + // << " GFLOP/s" << std::endl; + // } + } + const std::size_t dupd = dim_upd(); + if (dupd) { // get the 
contribution block from the device + host_Schur_.reset(new scalar_t[dupd*dupd]); + gpu::copy_device_to_host + (host_Schur_.get(), + reinterpret_cast(old_work), dupd*dupd); + F22_ = DenseMW_t(dupd, dupd, host_Schur_.get(), dupd); + } + return err_code; + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::multifrontal_factorization + (const SpMat_t& A, const SPOptions& opts, + int etree_level, int task_depth) { + if (!A.symm_sparse()) { + std::cerr << "The Matrix is not symmetric, please unable_symmetric in option settings" << std::endl; + exit(EXIT_FAILURE); // stop + }else{ + return multifrontal_factorization_symmetric(A, opts, etree_level, task_depth); + } + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::fwd_solve_phase2 + (DenseM_t& b, DenseM_t& bupd, int etree_level, int task_depth) const { + if (dim_sep()) { + DenseMW_t bloc(dim_sep(), b.cols(), b, this->sep_begin_, 0); + trsv(UpLo::L, Trans::N, Diag::N, F11_, bloc, task_depth); +// F11_.solve_LU_in_place(bloc, piv_, task_depth); + if (dim_upd()) { + if (b.cols() == 1) + gemv(Trans::N, scalar_t(-1.), F21_, bloc, + scalar_t(1.), bupd, task_depth); + else + gemm(Trans::N, Trans::N, scalar_t(-1.), F21_, bloc, + scalar_t(1.), bupd, task_depth); + } + } + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::bwd_solve_phase1 + (DenseM_t& y, DenseM_t& yupd, int etree_level, int task_depth) const { + if (dim_sep()) { + DenseMW_t yloc(dim_sep(), y.cols(), y, this->sep_begin_, 0); + if (y.cols() == 1) { + if (dim_upd()) + gemv(Trans::T, scalar_t(-1.), F21_, yupd, + scalar_t(1.), yloc, task_depth); + } else { + if (dim_upd()) + gemm(Trans::T, Trans::N, scalar_t(-1.), F21_, yupd, + scalar_t(1.), yloc, task_depth); + } + trsv(UpLo::L, Trans::T, Diag::N, F11_, yloc, params::task_recursion_cutoff_level); + } + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::node_inertia + (integer_t& neg, integer_t& zero, integer_t& pos) const { + using real_t = typename RealType::value_type; + for (std::size_t i=0; i real_t(0.)) pos++; + else if (absFii < real_t(0.)) neg++; + else if (absFii == real_t(0.)) zero++; + else std::cerr << "F(" << i << "," << i << ")=" << F11_(i,i) << std::endl; + } + return ReturnCode::SUCCESS; + } + + template + class MatrixWrapperForSparseF11 : public DenseMatrix { + public: + /** + * Default constructor. Creates an empty, 0x0 matrix. + */ + MatrixWrapperForSparseF11() : DenseMatrix() {} + struct { + size_t rowTotal, colTotal, nnz; + scalar_t *value{nullptr}; + size_t *innerIndex{nullptr}, *outerIndex{nullptr}; + } sparseF11; + + /** + * Constructor. Create an m x n matrix wrapper using already + * allocated memory, pointed to by D, with leading dimension ld. + * + * \param m number of rows of the new (sub) matrix + * \param n number of columns of the new matrix + * \param D pointer to memory representing matrix, this should + * point to at least ld*n bytes of allocated memory + * \param ld leading dimension of matrix allocated at D. ld >= m + */ + MatrixWrapperForSparseF11(std::size_t m, std::size_t n, + scalar_t* D, std::size_t ld) { + this->data_ = D; this->rows_ = m; this->cols_ = n; + this->ld_ = std::max(std::size_t(1), ld); + } + + /** + * Constructor. Create a DenseMatrixWrapper as a submatrix of size + * m x n, of a DenseMatrix (or DenseMatrixWrapper) D, at position + * i,j in D. The constructed DenseMatrixWrapper will be the + * submatrix D(i:i+m,j:j+n). 
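     * For example (a sketch, the names are illustrative only):
     *   DenseMatrix<double> D(8, 8);
     *   MatrixWrapperForSparseF11<double> W(3, 3, D, 2, 2);
     * creates a 3x3 view of the block D(2:5,2:5) without copying any data.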
+ * + * \param m number of rows of the new (sub) matrix + * \param n number of columns of the new matrix + * \param D matrix from which to take a submatrix + * \param i row offset in D of the top left corner of the submatrix + * \param j columns offset in D of the top left corner of the + * submatrix + */ + MatrixWrapperForSparseF11(std::size_t m, std::size_t n, DenseMatrix& D, + std::size_t i, std::size_t j) + : MatrixWrapperForSparseF11(m, n, &D(i, j), D.ld()) { + assert(i+m <= D.rows()); + assert(j+n <= D.cols()); + } + + /** + * Virtual destructor. Since a DenseMatrixWrapper does not + * actually own it's memory, put just keeps a pointer, this will + * not free any memory. + */ + virtual ~MatrixWrapperForSparseF11() { this->data_ = nullptr; } + + /** + * Clear the MatrixWrapperForSparseF11. Ie, set to an empty matrix. This + * will not affect the original matrix, to which this is a + * wrapper, only the wrapper itself is reset. No memory is + * released. + */ + void clear() override { + this->rows_ = 0; this->cols_ = 0; + this->ld_ = 1; this->data_ = nullptr; + } + + /** + * Return the amount of memory taken by this wrapper, ie, + * 0. (since the wrapper itself does not own the memory). The + * memory will likely be owned by a DenseMatrix, while this + * MatrixWrapperForSparseF11 is just a submatrix of that existing + * matrix. Returning 0 here avoids counting memory double. + * + * \see nonzeros + */ + std::size_t memory() const override { return 0; } + + /** + * Return the number of nonzeros taken by this wrapper, ie, + * 0. (since the wrapper itself does not own the memory). The + * memory will likely be owned by a DenseMatrix, while this + * MatrixWrapperForSparseF11 is just a submatrix of that existing + * matrix. Returning 0 here avoids counting nonzeros double. + * + * \see memory + */ + std::size_t nonzeros() const override { return 0; } + + /** + * Default copy constructor, from another DenseMatrixWrapper. + */ + MatrixWrapperForSparseF11(const MatrixWrapperForSparseF11&) = default; + + /** + * Constructing a MatrixWrapperForSparseF11 from a MatrixWrapperForSparseF11 is + * not allowed. + * TODO Why not??!! just delegate to MatrixWrapperForSparseF11(m, n, D, i, j)?? + */ + MatrixWrapperForSparseF11(const DenseMatrix&) = delete; + + /** + * Default move constructor. + */ + MatrixWrapperForSparseF11(MatrixWrapperForSparseF11&&) = default; + + /** + * Moving from a DenseMatrix is not allowed. + */ + MatrixWrapperForSparseF11(DenseMatrix&&) = delete; + + // /** + // * Assignment operator. Shallow copy only. This only copies the + // * wrapper object. Does not copy matrix elements. + // * + // * \param D matrix wrapper to copy from, this will be duplicated + // */ + // MatrixWrapperForSparseF11& + // operator=(const MatrixWrapperForSparseF11& D) { + // this->data_ = D.data(); + // this->rows_ = D.rows(); + // this->cols_ = D.cols(); + // this->ld_ = D.ld(); + // return *this; + // } + + /** + * Move assignment. This moves only the wrapper. + * + * \param D matrix wrapper to move from. This will not be + * modified. + */ + MatrixWrapperForSparseF11& + operator=(MatrixWrapperForSparseF11&& D) { + this->data_ = D.data(); this->rows_ = D.rows(); + this->cols_ = D.cols(); this->ld_ = D.ld(); return *this; } + + /** + * Assignment operator, from a DenseMatrix. Assign the memory of + * the DenseMatrix to the matrix wrapped by this + * MatrixWrapperForSparseF11 object. 
+ * + * \param a matrix to copy from, should be a.rows() == + * this->rows() and a.cols() == this->cols() + */ + MatrixWrapperForSparseF11& + operator=(const DenseMatrix& a) override { + assert(a.rows()==this->rows() && a.cols()==this->cols()); + for (std::size_t j=0; jcols(); j++) + for (std::size_t i=0; irows(); i++) + this->operator()(i, j) = a(i, j); + return *this; + } + }; + + // explicit template instantiations + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,int>; + + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long int>; + + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long long int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long long int>; + +} // end namespace strumpack diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu new file mode 100644 index 00000000..3007867a --- /dev/null +++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu @@ -0,0 +1,728 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#define STRUMPACK_NO_TRIPLET_MPI +#include "FrontalMatrixGPUKernels.hpp" +#include "dense/CUDAWrapper.hpp" +#include "dense/GPUWrapper.hpp" + +#include +#include +#include + + +namespace strumpack { + namespace gpu { + + /** + * Get the real T type corresponding to a scalar, for instance T, + * std::complex or thrust::complex, to be used for instance + * to compute norms or absolute value. 
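     * A kernel templated on scalar_t typically pulls in the matching types
     * as, for example:
     *   using real_t = typename real_type<scalar_t>::value_type;
     *   using cuda_t = typename cuda_type<scalar_t>::value_type;
     * so that complex arithmetic on the device goes through thrust::complex
     * (see the cuda_type and primitive_type helpers declared below).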
+ */ + template struct real_type { typedef T value_type; }; + template struct real_type> { typedef T value_type; }; + template struct real_type> { typedef T value_type; }; + + /** + * The types float2 and double2 are binary the same as + * std::complex or thrust::complex, but they can be used as + * __shared__ variables, whereas thrust::complex cannot because it + * doesn't have a no-argument default constructor. + */ + template struct primitive_type { typedef T value_type; }; + template<> struct primitive_type> { typedef float2 value_type; }; + template<> struct primitive_type> { typedef double2 value_type; }; + template<> struct primitive_type> { typedef float2 value_type; }; + template<> struct primitive_type> { typedef double2 value_type; }; + + /** + * Get the corresponding thrust::complex for std::complex + */ + template struct cuda_type { typedef T value_type; }; + template struct cuda_type> { typedef thrust::complex value_type; }; + + __device__ float inline real_part(float& a) { return a; } + __device__ double inline real_part(double& a) { return a; } + __device__ float inline real_part(thrust::complex& a) { return a.real(); } + __device__ double inline real_part(thrust::complex& a) { return a.real(); } + + __device__ float inline absolute_value(float& a) { return fabsf(a); } + __device__ double inline absolute_value(double& a) { return fabs(a); } + __device__ float inline absolute_value(thrust::complex& a) { return thrust::abs(a); } + __device__ double inline absolute_value(thrust::complex& a) { return thrust::abs(a); } + + + /** + * Put elements of the sparse matrix in the F11, F12 and F21 parts + * of the front. The sparse elements are taken from F.e11, F.e12, + * F.e21, which are lists of triplets {r,c,v}. The front is + * assumed to be initialized to zero. + * + */ + template __global__ void + assemble_kernel(unsigned int nf, AssembleData* dat) { + int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x, + op = blockIdx.y * blockDim.y + threadIdx.y; + if (op >= nf) return; + auto& F = dat[op]; + for (int i=0, j=idx; i= F.n11) break; + auto& t = F.e11[j]; + F.F11[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n12) break; + auto& t = F.e12[j]; + F.F12[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n21) break; + auto& t = F.e21[j]; + F.F21[t.r + t.c*F.d2] = t.v; + } + } + + template __global__ void + assemble_symmetric_kernel(unsigned int nf, AssembleData* dat) { + int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x, + op = blockIdx.y * blockDim.y + threadIdx.y; + if (op >= nf) return; + auto& F = dat[op]; + for (int i=0, j=idx; i= F.n11) break; + auto& t = F.e11[j]; + F.F11[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n21) break; + auto& t = F.e21[j]; + F.F21[t.r + t.c*F.d2] = t.v; + } + } + + /** + * Single extend-add operation from one contribution block into + * the parent front. d1 is the size of F11, d2 is the size of F22. + */ + template + __global__ void extend_add_kernel + (unsigned int by0, unsigned int nf, AssembleData* dat) { + int y = blockIdx.x * blockDim.x + threadIdx.x, + x0 = (blockIdx.y + by0) * unroll, + z = blockIdx.z * blockDim.z + threadIdx.z; + if (z >= nf) return; + auto& f = dat[z]; + auto CB = left ? f.CB1 : f.CB2; + if (!CB) return; + auto dCB = left ? f.dCB1 : f.dCB2; + if (y >= dCB) return; + auto I = left ? 
f.I1 : f.I2; + auto Iy = I[y]; + CB += y + x0*dCB; + int d1 = f.d1, d2 = f.d2; + int ld; + T* F[2]; + if (Iy < d1) { + ld = d1; + F[0] = f.F11+Iy; + F[1] = f.F12+Iy-d1*d1; + } else { + ld = d2; + F[0] = f.F21+Iy-d1; + F[1] = f.F22+Iy-d1-d1*d2; + } +#pragma unroll + for (int i=0; i= dCB) break; + auto Ix = I[x]; + F[Ix >= d1][Ix*ld] += CB[i*dCB]; + } + } + + template + __global__ void extend_add_symmetric_kernel + (unsigned int by0, unsigned int nf, AssembleData* dat) { + int y = blockIdx.x * blockDim.x + threadIdx.x, + x0 = (blockIdx.y + by0) * unroll, + z = blockIdx.z * blockDim.z + threadIdx.z; + if (z >= nf) return; + auto& f = dat[z]; + auto CB = left ? f.CB1 : f.CB2; + if (!CB) return; + auto dCB = left ? f.dCB1 : f.dCB2; + if (y >= dCB) return; + auto I = left ? f.I1 : f.I2; + auto Iy = I[y]; + int d1 = f.d1, d2 = f.d2; + int ld; + T* F[2]; + if (Iy < d1) { + ld = d1; + F[0] = f.F11+Iy; + F[1] = nullptr; + } else { + ld = d2; + F[0] = f.F21+Iy-d1; + F[1] = f.F22+Iy-d1-d1*d2; + } +#pragma unroll + for (int i=0; i= dCB) break; + auto Ix = I[x]; + F[Ix >= d1][Ix*ld] += CB[row + col*dCB]; + } + } + + template void + assemble(unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + { // front assembly from sparse matrix + std::size_t nnz = 0; + for (unsigned int f=0; f nnz && nt > 8 && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (nnz + nt*unroll - 1) / (nt*unroll), + nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + for (unsigned int f=0; f<<>>(nf-f*ops, ddat+f*ops); + } + } + } + gpu_check(cudaPeekAtLastError()); + { // extend-add + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nbx = (du + nt - 1) / nt, + nby = (du + unroll - 1) / unroll, + nbf = (nf + ops - 1) / ops; + dim3 block(nt, 1, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int y=0; y<<>> + (y, nf-f*ops, dat_+f*ops); + extend_add_kernel<<>> + (y, nf-f*ops, dat_+f*ops); + } + } + } + } + gpu_check(cudaPeekAtLastError()); + } + + template void + assemble_symmetric(unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + { // front assembly from sparse matrix + std::size_t nnz = 0; + for (unsigned int f=0; f nnz && nt > 8 && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (nnz + nt*unroll - 1) / (nt*unroll), + nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + for (unsigned int f=0; f<<>>(nf-f*ops, ddat+f*ops); + } + } + } + gpu_check(cudaPeekAtLastError()); + { // extend-add + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nbx = (du + nt - 1) / nt, + nby = (du + unroll - 1) / unroll, + nbf = (nf + ops - 1) / ops; + dim3 block(nt, 1, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int y=0; y<<>> + (y, nf-f*ops, dat_+f*ops); + extend_add_symmetric_kernel<<>> + (y, nf-f*ops, dat_+f*ops); + } + } + } + } + gpu_check(cudaPeekAtLastError()); + } + + + // /** + // * This only works if value >= 0. + // * It's assuming two's complement for the int. + // * __float_as_int is like reinterpret_cast(value) + // */ + // __device__ __forceinline__ void atomicAbsMax(float* data, float value) { + // atomicMax((int *)data, __float_as_int(value)); + // } + // __device__ __forceinline__ void atomicAbsMax(double* addr, double value) { + // // why does this not compile? 
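    // // Likely because the 64-bit overloads of atomicMax() are only declared
    // // when compiling for compute capability 3.5 or higher; note also that
    // // reinterpreting an IEEE double as a signed integer is only
    // // order-preserving for non-negative values. A portable (if slower)
    // // alternative would be an atomicCAS loop, roughly (a sketch, not used
    // // by this code):
    // //   __device__ __forceinline__ void atomicAbsMax(double* addr, double value) {
    // //     auto a = reinterpret_cast<unsigned long long int*>(addr);
    // //     unsigned long long int old = *a, assumed;
    // //     do {
    // //       assumed = old;
    // //       if (__longlong_as_double(assumed) >= value) break;
    // //       old = atomicCAS(a, assumed,
    // //                       (unsigned long long int)__double_as_longlong(value));
    // //     } while (assumed != old);
    // //   }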
+ // atomicMax((long long int *)addr, __double_as_longlong(value)); + // } + + + /** + * LU with row pivoting, with a single NTxNT thread block. The + * matrix size n must be less than NT. + * + * This is a naive implementation. The goal here is to reduce + * kernel launch overhead by batching many small LU + * factorizations. + * + * Use thrust::complex instead of std::complex. + */ + template __device__ void + LLT_block_kernel(int n, T* F, int* info) { + using cuda_primitive_t = typename primitive_type::value_type; + using real_t = typename real_type::value_type; + __shared__ cuda_primitive_t M_[NT*NT]; + T* M = reinterpret_cast(M_); + int j = threadIdx.x, i = threadIdx.y; + *info = 0; + + // copy F from global device storage into shared memory + if (i < n && j < n) + M[i+j*NT] = F[i+j*n]; + __syncthreads(); + + for (int k=0; k= k && i < n) + M[i+k*NT] /= diagonal; + __syncthreads(); + // Schur update + if (j > k && i > k && j < n && i < n) + M[i+j*NT] -= M[i+k*NT] * M[j+k*NT]; + __syncthreads(); + } + // write back from shared to global device memory + if (i < n && j < n) + F[i+j*n] = M[i+j*NT]; + } + + template __global__ void + LLT_block_kernel_batched(FrontData* dat, bool replace, + real_t thresh, int* dinfo) { + FrontData& A = dat[blockIdx.x]; + LLT_block_kernel(A.n1, A.F11, &dinfo[blockIdx.x]); + if (replace) { + int i = threadIdx.x, j = threadIdx.y; + if (i == j && i < A.n1) { + std::size_t k = i + i*A.n1; + if (absolute_value(A.F11[k]) < thresh) + A.F11[k] = (real_part(A.F11[k]) < 0) ? -thresh : thresh; + } + } + } + + template __global__ void + replace_pivots_kernel(int n, T* A, real_t thresh) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + std::size_t k = i + i*n; + if (absolute_value(A[k]) < thresh) + A[k] = (real_part(A[k]) < 0) ? -thresh : thresh; + } + } + + template + void replace_pivots(int n, T* A, real_t thresh, gpu::Stream* s) { + if (!n) return; + using T_ = typename cuda_type::value_type; + int NT = 128; + if (s) + replace_pivots_kernel<<<(n+NT-1)/NT, NT, 0, get_cuda_stream(*s)>>> + (n, (T_*)(A), thresh); + else + replace_pivots_kernel<<<(n+NT-1)/NT, NT>>> + (n, (T_*)(A), thresh); + gpu_check(cudaPeekAtLastError()); + } + + template __global__ void + replace_pivots_vbatch_kernel(int* dn, T** dA, int* lddA, real_t thresh, + unsigned int batchCount) { + int i = blockIdx.x * blockDim.x + threadIdx.x, + f = blockIdx.y * blockDim.y + threadIdx.y; + if (f >= batchCount) return; + if (i >= dn[f]) return; + auto A = dA[f]; + auto ldA = lddA[f]; + std::size_t ii = i + i*ldA; + if (absolute_value(A[ii]) < thresh) + A[ii] = (real_part(A[ii]) < 0) ? -thresh : thresh; + } + + /** + * LU solve with matrix F factor in LU, with pivot vector piv. F + * is n x n, and n <= NT. X is the right hand side, and is n x + * m. Both F and X have leading dimension n. + * + * NTxNT is the dimension of the thread block. + * + * This doesn't work for T = std::complex, use + * T=thrust::complex instead. 
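+   *
+   * (The symmetric variant below works on the Cholesky factor, so no
+   * pivot vector is applied during the solve.)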
+ */ + template __device__ void + solve_symmetric_block_kernel(int n, int m, T* F, T* X) { + using primitive_t = typename primitive_type::value_type; + __shared__ primitive_t A_[NT*NT], B_[NT*NT]; + T *B = reinterpret_cast(B_), *A = reinterpret_cast(A_); + int i = threadIdx.x, j = threadIdx.y; + // put matrix F in shared memory + if (i < n && j < n) + A[i+j*NT] = F[i+j*n]; + __syncthreads(); + + // loop over blocks of NT columns of X + for (int b=0; b k && i < n && c < m) + B[i + j * NT] -= A[i + k * NT] * B[k + j * NT]; + __syncthreads(); + } + + // write from shared back to global device memory + if (i < n && c < m) + X[c+i*m] = B[i+j*NT]; + } + } + + template __global__ void + solve_symmetric_block_kernel_batched(FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + solve_symmetric_block_kernel(A.n1, A.n2, A.F11, A.F21); + } + + + /** + * Compute F -= F21 * F12, where F is d2 x d2 and F12 is d1 x d2. + * d1 is <= NT. This should be called with a single NT x NT thread + * block. + */ + template __device__ void + Schur_symmetric_block_kernel(int d1, int d2, T* F21, T* F22) { + using cuda_primitive_t = typename primitive_type::value_type; + __shared__ cuda_primitive_t B_[NT*NT], A_[NT*NT]; + T *B = reinterpret_cast(B_), *A = reinterpret_cast(A_); + int j = threadIdx.x, i = threadIdx.y; + A[j+i*NT] = B[j+i*NT] = 0.; + for (int cb=0; cb __global__ void + Schur_symmetric_block_kernel_batched(FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + Schur_symmetric_block_kernel(A.n1, A.n2, A.F21, A.F22); + } + + + template + void factor_symmetric_block_batch(unsigned int count, FrontData* dat, + bool replace, real_t thresh, int* dinfo) { + if (!count) return; + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(dat); + dim3 block(NT, NT); //, grid(count, 1, 1); + LLT_block_kernel_batched<<>> + (dat_, replace, thresh, dinfo); + gpu_check(cudaPeekAtLastError()); + solve_symmetric_block_kernel_batched<<>>(dat_); + gpu_check(cudaPeekAtLastError()); + Schur_symmetric_block_kernel_batched<<>>(dat_); + gpu_check(cudaPeekAtLastError()); + } + + + template __global__ void + solve_block_kernel_batched(int nrhs, FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + solve_symmetric_block_kernel(A.n1, nrhs, A.F11, A.F12, A.piv); + } + + /** + * Single extend-add operation along the column dimension, for the + * solve. d1 is the size of F11, d2 is the size of F22. 
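+   * Each row r of the contribution block CB is scattered into the
+   * separator part b or the update part bupd of the parent right-hand
+   * side, according to the index I[r].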
+ */ + template __device__ void + ea_rhs_kernel(int r, int N, int nrhs, + int dsep, int dupd, int dCB, + T* b, T* bupd, T* CB, std::size_t* I) { + if (r >= dCB) return; + auto Ir = I[r]; + for (int c=0; c __global__ void + extend_add_rhs_kernel_left + (int N, int nrhs, unsigned int nf, AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB1) + ea_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB1, + f.F11, f.F21, f.CB1, f.I1); + } + template __global__ void + extend_add_rhs_kernel_right + (int N, int nrhs, unsigned int nf, AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB2) + ea_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB2, + f.F11, f.F21, f.CB2, f.I2); + } + + template void + extend_add_rhs(int N, int nrhs, unsigned int nf, + AssembleData* dat, AssembleData* ddat) { + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (du + nt - 1) / nt, nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int f=0; f>> + (N, nrhs, nf-f*ops, dat_+f*ops); + extend_add_rhs_kernel_right<<>> + (N, nrhs, nf-f*ops, dat_+f*ops); + } + gpu_check(cudaPeekAtLastError()); + } + + + /** + * Single extend-add operation along the column dimension, for the + * solve. d1 is the size of F11, d2 is the size of F22. + */ + template __device__ void + extract_rhs_kernel(int r, int N, int nrhs, + int dsep, int dupd, int dCB, + T* b, T* bupd, T* CB, std::size_t* I) { + if (r >= dCB) return; + auto Ir = I[r]; + for (int c=0; c __global__ void + extract_rhs_kernel(int N, int nrhs, unsigned int nf, + AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB1) + extract_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB1, + f.F11, f.F21, f.CB1, f.I1); + if (f.CB2) + extract_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB2, + f.F11, f.F21, f.CB2, f.I2); + } + + template void + extract_rhs(int N, int nrhs, unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (du + nt - 1) / nt, nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int f=0; f>> + (N, nrhs, nf-f*ops, dat_+f*ops); + } + } + + + + // explicit template instantiations + template void assemble(unsigned int, AssembleData*, AssembleData*); + template void assemble(unsigned int, AssembleData*, AssembleData*); + template void assemble(unsigned int, AssembleData>*, AssembleData>*); + template void assemble(unsigned int, AssembleData>*, AssembleData>*); + + template void assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template void assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template void assemble_symmetric(unsigned int, AssembleData>*, AssembleData>*); + template void assemble_symmetric(unsigned int, AssembleData>*, AssembleData>*); + + template void extend_add_rhs(int, int, unsigned int, AssembleData*, AssembleData*); + template void extend_add_rhs(int, int, unsigned int, AssembleData*, AssembleData*); + template void 
extend_add_rhs(int, int, unsigned int, AssembleData<std::complex<float>>*, AssembleData<std::complex<float>>*);
+  template void extend_add_rhs(int, int, unsigned int, AssembleData<std::complex<double>>*, AssembleData<std::complex<double>>*);
+
+  template void extract_rhs(int, int, unsigned int, AssembleData<float>*, AssembleData<float>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<double>*, AssembleData<double>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<std::complex<float>>*, AssembleData<std::complex<float>>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<std::complex<double>>*, AssembleData<std::complex<double>>*);
+
+
+  template void factor_symmetric_block_batch<float,8,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,8,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,8,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,8,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,16,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,16,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,16,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,16,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,24,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,24,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,24,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,24,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,32,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,32,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,32,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,32,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void replace_pivots(int, float*, float, gpu::Stream*);
+  template void replace_pivots(int, double*, double, gpu::Stream*);
+  template void replace_pivots(int, std::complex<float>*, float, gpu::Stream*);
+  template void replace_pivots(int, std::complex<double>*, double, gpu::Stream*);
+
+  } // end namespace gpu
+} // end namespace strumpack
diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp
new file mode 100644
index 00000000..3fbf3b86
--- /dev/null
+++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp
@@ -0,0 +1,111 @@
+//
+// Created by tingxuan on 23-6-19.
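+//
+// Declares FrontalMatrixGPUSymmetricPositiveDefinite, a GPU frontal
+// matrix type for symmetric positive definite problems, meant to use
+// the batched Cholesky (LL^T) kernels added in this patch rather than
+// LU with pivoting.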
+//
+
+#pragma once
+
+#include "FrontalMatrixDense.hpp"
+
+#if defined(STRUMPACK_USE_CUDA)
+#include "dense/CUDAWrapper.hpp"
+#endif
+#if defined(STRUMPACK_USE_HIP)
+#include "dense/HIPWrapper.hpp"
+#endif
+#if defined(STRUMPACK_USE_SYCL)
+#include "dense/DPCPPWrapper.hpp"
+#endif
+
+namespace strumpack {
+
+  template<typename scalar_t, typename integer_t> class LevelInfoUnified;
+
+  namespace gpu {
+    template<typename T> struct FrontData;
+    // template<typename T> struct FwdSolveData;
+  }
+
+
+  template<typename scalar_t, typename integer_t> class FrontalMatrixGPUSymmetricPositiveDefinite
+    : public FrontalMatrix<scalar_t,integer_t> {
+    using F_t = FrontalMatrix<scalar_t,integer_t>;
+    using FG_t = FrontalMatrixGPUSymmetricPositiveDefinite<scalar_t,integer_t>;
+    using DenseM_t = DenseMatrix<scalar_t>;
+    using DenseMW_t = DenseMatrixWrapper<scalar_t>;
+    using SpMat_t = CompressedSparseMatrix<scalar_t,integer_t>;
+    using LInfo_t = LevelInfoUnified<scalar_t,integer_t>;
+
+  public:
+    FrontalMatrixGPUSymmetricPositiveDefinite(integer_t sep, integer_t sep_begin, integer_t sep_end,
+                                              std::vector<integer_t>& upd);
+    ~FrontalMatrixGPUSymmetricPositiveDefinite();
+
+    long long dense_node_factor_nonzeros() const override;
+
+    void release_work_memory() override;
+
+    void extend_add_to_dense(DenseM_t& paF11, DenseM_t& paF21, DenseM_t& paF22,
+                             const F_t* p, int task_depth) override;
+
+    ReturnCode multifrontal_factorization(const SpMat_t& A,
+                                          const SPOptions<scalar_t>& opts,
+                                          int etree_level=0,
+                                          int task_depth=0) override;
+
+    ReturnCode multifrontal_factorization_symmetric(const SpMat_t& A,
+                                                    const SPOptions<scalar_t>& opts,
+                                                    int etree_level=0,
+                                                    int task_depth=0);
+
+    void extract_CB_sub_matrix(const std::vector<std::size_t>& I,
+                               const std::vector<std::size_t>& J,
+                               DenseM_t& B, int task_depth) const override {}
+
+    std::string type() const override { return "FrontalMatrixGPU"; }
+    bool isGPU() const override { return true; }
+
+#if defined(STRUMPACK_USE_MPI)
+    void
+    extend_add_copy_to_buffers(std::vector<std::vector<scalar_t>>& sbuf,
+                               const FrontalMatrixMPI<scalar_t,integer_t>* pa)
+      const override;
+#endif
+
+  private:
+    std::unique_ptr<scalar_t[]> host_factors_, host_Schur_;
+    std::unique_ptr<scalar_t[], std::default_delete<scalar_t[]>> host_factors_diagonal_{nullptr, std::default_delete<scalar_t[]>{}}, host_factors_off_diagonal_{nullptr, std::default_delete<scalar_t[]>{}};
+    DenseMW_t F11_, F12_, F21_, F22_;
+    std::vector<int> pivot_mem_;
+    int* piv_ = nullptr;
+
+    FrontalMatrixGPUSymmetricPositiveDefinite(const FrontalMatrixGPUSymmetricPositiveDefinite&) = delete;
+    FrontalMatrixGPUSymmetricPositiveDefinite& operator=(FrontalMatrixGPUSymmetricPositiveDefinite const&) = delete;
+
+    void front_assembly(const SpMat_t& A, LInfo_t& L,
+                        char* hea_mem, char* dea_mem);
+
+    void factor_small_fronts(LInfo_t& L, gpu::FrontData<scalar_t>* fdata,
+                             int* dinfo, const SPOptions<scalar_t>& opts);
+
+    ReturnCode split_smaller(const SpMat_t& A, const SPOptions<scalar_t>& opts,
+                             int etree_level=0, int task_depth=0);
+
+    void fwd_solve_phase2(DenseM_t& b, DenseM_t& bupd,
+                          int etree_level, int task_depth) const override;
+    void bwd_solve_phase1(DenseM_t& y, DenseM_t& yupd,
+                          int etree_level, int task_depth) const override;
+
+    ReturnCode node_inertia(integer_t& neg,
+                            integer_t& zero,
+                            integer_t& pos) const override;
+
+    using F_t::lchild_;
+    using F_t::rchild_;
+    using F_t::dim_sep;
+    using F_t::dim_upd;
+
+    template<typename T, typename I> friend class LevelInfoUnified;
+  };
+
+} // end namespace strumpack
+
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 676a4075..4fedf1e7 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,17 +2,25 @@ add_executable(test_HSS_seq test_HSS_seq.cpp)
 add_executable(test_sparse_seq test_sparse_seq.cpp)
 add_executable(test_BLR_seq test_BLR_seq.cpp)
 add_executable(test_matrix_IO test_matrix_IO.cpp)
+add_executable(test_SPD_seq test_SPD_seq.cpp)
+add_executable(test_SPD_mixedPrecision
test_SPD_mixedPrecision.cpp) target_link_libraries(test_HSS_seq strumpack) target_link_libraries(test_sparse_seq strumpack) target_link_libraries(test_BLR_seq strumpack) target_link_libraries(test_matrix_IO strumpack) +target_link_libraries(test_SPD_seq strumpack) +target_link_libraries(test_SPD_mixedPrecision strumpack) add_test("user_test_HSS_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_HSS_seq T 100) add_test("user_test_sparse_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_sparse_seq ${PROJECT_SOURCE_DIR}/examples/sparse/data/pde900.mtx) add_test("user_matrix_IO" ${CMAKE_CURRENT_BINARY_DIR}/test_matrix_IO T 1000) add_test("user_test_BLR_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_BLR_seq 300) +add_test("user_test_SPD_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_SPD_seq + ${PROJECT_SOURCE_DIR}/examples/sparse/data/bcsstm08.mtx) +add_test("user_test_SPD_mixedPrecision" ${CMAKE_CURRENT_BINARY_DIR}/test_SPD_mixedPrecision + ${PROJECT_SOURCE_DIR}/examples/sparse/data/bcsstm08.mtx) if(STRUMPACK_USE_MPI) add_executable(test_HSS_mpi test_HSS_mpi.cpp) diff --git a/test/test_SPD_mixedPrecision.cpp b/test/test_SPD_mixedPrecision.cpp new file mode 100644 index 00000000..b94add1b --- /dev/null +++ b/test/test_SPD_mixedPrecision.cpp @@ -0,0 +1,200 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include +#include +using namespace std; + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "misc/RandomWrapper.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + +#define ERROR_TOLERANCE 1e2 +#define SOLVE_TOLERANCE 1e-12 + +template +void test(CSRMatrix &A, DenseMatrix &b, + DenseMatrix &x_exact, int argc, char *argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " + << (std::is_same::value ? 
"single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); + // spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); + // spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_lower_triangle_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) + << std::endl; + } + + { + std::cout << std::endl; + std::cout << "### STANDARD solver ###########################" << std::endl; + + SparseSolver spss; + + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_from_command_line(argc, argv); + + spss.set_lower_triangle_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) + << std::endl; + } + + std::cout << std::endl << std::endl; +} + +template +int test_sparse_solver(int argc, char *argv[], + CSRMatrix &A_d) { + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + auto A_f = cast_matrix(A_d); + + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + for (int j = 0; j < m; j++) { + // step 4, use a different tau for each RHS + double tau = std::pow(dist(gen), 2.); + for (int i = 0; i < N; i++) + // step 4c + x_true_d(i, j) = std::pow(tau, -double(i) / (N - 1)); + } + + // step 6, but in double, not double-double + A_d.spmv(x_true_d, b_d); + { + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; +// spss.set_lower_triangle_matrix(A_d); + spss.set_matrix(A_d); + spss.solve(b_d, x_true_d); + } + + // cast RHS and true solution to single precision + DenseMatrix b_f(N, m), x_true_f(N, m); + copy(x_true_d, x_true_f); + copy(b_d, b_f); + + test(A_d, b_d, x_true_d, argc, argv); + test(A_f, b_f, x_true_f, argc, argv); 
+ + return 0; +} + +template +int read_matrix_and_run_tests(int argc, char *argv[]) { + string f(argv[1]); + CSRMatrix A; + if (A.read_matrix_market(f) == 0) + return test_sparse_solver(argc, argv, A); + else { + CSRMatrix, integer_t> Acomplex; + if (Acomplex.read_matrix_market(f)) { + std::cerr << "Could not read matrix from file." << std::endl; + return 1; + } + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + cout + << "Solve a linear system with a matrix given in matrix market format\n" + << "using the sequential/multithreaded C++ STRUMPACK interface.\n\n" + << "Usage: \n\t./testMMdouble pde900.mtx" << endl; + return 1; + } + cout << "# Running with:\n# "; +#if defined(_OPENMP) + cout << "OMP_NUM_THREADS=" << omp_get_max_threads() << " "; +#endif + for (int i = 0; i < argc; i++) + cout << argv[i] << " "; + cout << endl; + + int ierr = 0; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + return ierr; +} diff --git a/test/test_SPD_seq.cpp b/test/test_SPD_seq.cpp new file mode 100644 index 00000000..1b86d4d6 --- /dev/null +++ b/test/test_SPD_seq.cpp @@ -0,0 +1,130 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). 
+ * + */ +#include +#include +using namespace std; + +#include "StrumpackSparseSolver.hpp" +#include "misc/RandomWrapper.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + +#define ERROR_TOLERANCE 1e2 +#define SOLVE_TOLERANCE 1e-12 + +template +int test_SPD_solver(int argc, const char *const argv[], + CSRMatrix &A) { + using real_t = typename RealType::value_type; + StrumpackSparseSolver spss; + spss.options().set_from_command_line(argc, argv); + + int N = A.size(); + vector b(N), x(N), x_exact(N); + { + auto rgen = random::make_default_random_generator(); + for (auto &xi : x_exact) + xi = rgen->get(); + } + A.spmv(x_exact.data(), b.data()); + + spss.set_lower_triangle_matrix(A); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + if (spss.reorder() != ReturnCode::SUCCESS) { + cout << "problem with reordering of the matrix." << endl; + return 1; + } + if (spss.factor() != ReturnCode::SUCCESS) { + cout << "problem during factorization of the matrix." << endl; + return 1; + } + spss.solve(b.data(), x.data()); + + auto comp_scal_res = A.max_scaled_residual(x.data(), b.data()); + cout << "# COMPONENTWISE SCALED RESIDUAL = " << comp_scal_res << endl; + + blas::axpy(N, scalar_t(-1.), x_exact.data(), 1, x.data(), 1); + auto nrm_error = blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = blas::nrm2(N, x_exact.data(), 1); + cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) << endl; + + if (comp_scal_res > ERROR_TOLERANCE * spss.options().rel_tol()) { + cout << "RESIDUAL TOO LARGE!" << endl; + return 1; + } + return 0; +} + +template +int read_matrix_and_run_tests(int argc, const char *const argv[]) { + string f(argv[1]); + CSRMatrix A; + if (A.read_matrix_market(f) == 0) + return test_SPD_solver(argc, argv, A); + else { + CSRMatrix, integer_t> Acomplex; + if (Acomplex.read_matrix_market(f)) { + std::cerr << "Could not read matrix from file." << std::endl; + return 1; + } + return test_SPD_solver(argc, argv, Acomplex); + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + cout + << "Solve a linear system with a matrix given in matrix market format\n" + << "using the sequential/multithreaded C++ STRUMPACK interface.\n\n" + << "Usage: \n\t./testMMdouble bcsstm08.mtx" << endl; + return 1; + } + cout << "# Running with:\n# "; +#if defined(_OPENMP) + cout << "OMP_NUM_THREADS=" << omp_get_max_threads() << " "; +#endif + for (int i = 0; i < argc; i++) + cout << argv[i] << " "; + cout << endl; + + int ierr = 0; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + if (ierr) + return ierr; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + return ierr; +}
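+
+// Example invocation (the CTest entry added in test/CMakeLists.txt passes the
+// bcsstm08.mtx matrix from examples/sparse/data):
+//   ./test_SPD_seq bcsstm08.mtx
+// Any additional solver options given on the command line are forwarded to
+// the solver through set_from_command_line().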