From 6b971cadbb27f8c5e6568dd0d41f23e2597ae21c Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Wed, 21 Feb 2024 16:10:56 -0800
Subject: [PATCH] Fix bug which surfaced with wave ports running on many
 processors (BC elimination is intended only for square matrices)

---
 palace/fem/bilinearform.cpp     |  33 ++----
 palace/fem/libceed/operator.cpp | 171 +++++++++++++++++---------------
 palace/fem/libceed/operator.hpp |   4 +-
 palace/linalg/rap.cpp           |  33 ++++--
 4 files changed, 128 insertions(+), 113 deletions(-)

diff --git a/palace/fem/bilinearform.cpp b/palace/fem/bilinearform.cpp
index 307ffa784..2a6876249 100644
--- a/palace/fem/bilinearform.cpp
+++ b/palace/fem/bilinearform.cpp
@@ -52,11 +52,6 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace,
   PalacePragmaOmp(parallel if (nt > 1))
   {
     Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()];
-
-    // Initialize the composite operator on each thread.
-    CeedOperator loc_op;
-    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op));
-
     for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed))
     {
       const auto trial_map_type =
@@ -80,8 +75,7 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace,
           integ->SetMapTypes(trial_map_type, test_map_type);
           integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis,
                           data.geom_data, data.geom_data_restr, &sub_op);
-          PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op));
-          PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
+          op->AddOper(sub_op);  // Sub-operator owned by ceed::Operator
         }
       }
       else if (mfem::Geometry::Dimension[geom] == mesh.Dimension() - 1 &&
@@ -101,15 +95,15 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace,
           integ->SetMapTypes(trial_map_type, test_map_type);
           integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis,
                           data.geom_data, data.geom_data_restr, &sub_op);
-          PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op));
-          PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
+          op->AddOper(sub_op);  // Sub-operator owned by ceed::Operator
        }
      }
    }
-    PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op));
-    op->AddOper(loc_op);
  }
 
+  // Finalize the operator (call CeedOperatorCheckReady).
+  op->Finalize();
+
   return op;
 }
 
@@ -222,12 +216,6 @@ std::unique_ptr<ceed::Operator> DiscreteLinearOperator::PartialAssemble() const
   PalacePragmaOmp(parallel if (nt > 1))
   {
     Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()];
-
-    // Initialize the composite operators for each thread.
-    CeedOperator loc_op, loc_op_t;
-    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op));
-    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op_t));
-
     for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed))
     {
       if (mfem::Geometry::Dimension[geom] == mesh.Dimension() && !domain_interps.empty())
@@ -253,21 +241,18 @@ std::unique_ptr<ceed::Operator> DiscreteLinearOperator::PartialAssemble() const
        {
          CeedOperator sub_op, sub_op_t;
          interp->Assemble(ceed, trial_restr, test_restr, interp_basis, &sub_op,
                           &sub_op_t);
-        PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op));
-        PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op_t, sub_op_t));
-        PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
-        PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t));
+        op->AddOper(sub_op, sub_op_t);  // Sub-operator owned by ceed::Operator
        }
        // Basis is owned by the operator.
        PalaceCeedCall(ceed, CeedBasisDestroy(&interp_basis));
      }
    }
-    PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op));
-    PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op_t));
-    op->AddOper(loc_op, loc_op_t);
  }
 
+  // Finalize the operator (call CeedOperatorCheckReady).
+  op->Finalize();
+
   // Construct dof multiplicity vector for scaling to account for dofs shared between
   // elements (on host, then copy to device).
   Vector test_multiplicity(test_fespace.GetVSize());
diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp
index da0fe2a70..4fafb72d4 100644
--- a/palace/fem/libceed/operator.cpp
+++ b/palace/fem/libceed/operator.cpp
@@ -21,6 +21,22 @@ Operator::Operator(int h, int w) : palace::Operator(h, w)
   op_t.resize(nt, nullptr);
   u.resize(nt, nullptr);
   v.resize(nt, nullptr);
+  PalacePragmaOmp(parallel if (op.size() > 1))
+  {
+    const int id = utils::GetThreadNum();
+    MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()];
+    CeedOperator loc_op, loc_op_t;
+    CeedVector loc_u, loc_v;
+    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op));
+    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op_t));
+    PalaceCeedCall(ceed, CeedVectorCreate(ceed, width, &loc_u));
+    PalaceCeedCall(ceed, CeedVectorCreate(ceed, height, &loc_v));
+    op[id] = loc_op;
+    op_t[id] = loc_op_t;
+    u[id] = loc_u;
+    v[id] = loc_v;
+  }
   temp.UseDevice(true);
 }
 
@@ -28,10 +44,9 @@ Operator::~Operator()
 {
   PalacePragmaOmp(parallel if (op.size() > 1))
   {
-    Ceed ceed;
     const int id = utils::GetThreadNum();
-    MFEM_ASSERT(id < op.size() && op[id],
-                "Out of bounds access for thread number " << id << "!");
+    MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
     PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
     PalaceCeedCall(ceed, CeedOperatorDestroy(&op[id]));
     PalaceCeedCall(ceed, CeedOperatorDestroy(&op_t[id]));
@@ -40,32 +55,45 @@ Operator::~Operator()
   }
 }
 
-void Operator::AddOper(CeedOperator op_, CeedOperator op_t_)
+void Operator::AddOper(CeedOperator sub_op, CeedOperator sub_op_t)
 {
+  // This should be called from within an OpenMP parallel region.
+  const int id = utils::GetThreadNum();
+  MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
   Ceed ceed;
+  PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op, &ceed));
   CeedSize l_in, l_out;
-  CeedVector loc_u, loc_v;
-  PalaceCeedCallBackend(CeedOperatorGetCeed(op_, &ceed));
-  PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op_, &l_in, &l_out));
-  MFEM_VERIFY((l_in == 0 && l_out == 0) || (mfem::internal::to_int(l_in) == width &&
-                                            mfem::internal::to_int(l_out) == height),
+  PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op, &l_in, &l_out));
+  MFEM_VERIFY(mfem::internal::to_int(l_in) == width &&
+              mfem::internal::to_int(l_out) == height,
              "Dimensions mismatch for CeedOperator!");
-  if (op_t_)
+  PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(op[id], sub_op));
+  PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
+  if (sub_op_t)
   {
+    Ceed ceed_t;
+    PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op_t, &ceed_t));
+    MFEM_VERIFY(ceed_t == ceed, "Ceed context mismatch for transpose CeedOperator!");
     CeedSize l_in_t, l_out_t;
-    PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op_t_, &l_in_t, &l_out_t));
-    MFEM_VERIFY((l_in_t == 0 && l_out_t == 0) || (l_in_t == l_out && l_out_t == l_in),
+    PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op_t, &l_in_t, &l_out_t));
+    MFEM_VERIFY(l_in_t == l_out && l_out_t == l_in,
                 "Dimensions mismatch for transpose CeedOperator!");
+    PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(op_t[id], sub_op_t));
+    PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t));
   }
-  PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_in, &loc_u));
-  PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_out, &loc_v));
+}
 
-  const int id = utils::GetThreadNum();
-  MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
-  op[id] = op_;
-  op_t[id] = op_t_;
-  u[id] = loc_u;
-  v[id] = loc_v;
+void Operator::Finalize()
+{
+  PalacePragmaOmp(parallel if (op.size() > 1))
+  {
+    const int id = utils::GetThreadNum();
+    MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
+    PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
+    PalaceCeedCall(ceed, CeedOperatorCheckReady(op[id]));
+    PalaceCeedCall(ceed, CeedOperatorCheckReady(op_t[id]));
+  }
 }
 
 void Operator::AssembleDiagonal(Vector &diag) const
@@ -84,10 +112,9 @@ void Operator::AssembleDiagonal(Vector &diag) const
 
   PalacePragmaOmp(parallel if (op.size() > 1))
   {
-    Ceed ceed;
     const int id = utils::GetThreadNum();
-    MFEM_ASSERT(id < op.size() && op[id],
-                "Out of bounds access for thread number " << id << "!");
+    MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
     PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
     PalaceCeedCall(ceed, CeedVectorSetArray(v[id], mem, CEED_USE_POINTER, diag_data));
     PalaceCeedCall(
@@ -116,10 +143,9 @@ inline void CeedAddMult(const std::vector<CeedOperator> &op,
 
   PalacePragmaOmp(parallel if (op.size() > 1))
   {
-    Ceed ceed;
     const int id = utils::GetThreadNum();
-    MFEM_ASSERT(id < op.size() && op[id],
-                "Out of bounds access for thread number " << id << "!");
+    MFEM_ASSERT(id < op.size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
     PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
     PalaceCeedCall(ceed, CeedVectorSetArray(u[id], mem, CEED_USE_POINTER,
                                             const_cast<CeedScalar *>(x_data)));
@@ -407,32 +433,36 @@ std::unique_ptr<hypre::HypreCSRMatrix> OperatorCOOtoCSR(Ceed ceed, CeedInt m, Ce
 std::unique_ptr<hypre::HypreCSRMatrix> CeedOperatorFullAssemble(const Operator &op,
                                                                 bool skip_zeros, bool set)
 {
-  if (op.Size() == 0)
-  {
-    return std::make_unique<hypre::HypreCSRMatrix>(op.Height(), op.Width(), 0);
-  }
-
   // Assemble operators on each thread.
   std::vector<std::unique_ptr<hypre::HypreCSRMatrix>> loc_mat(op.Size());
   PalacePragmaOmp(parallel if (op.Size() > 1))
   {
-    Ceed ceed;
     const int id = utils::GetThreadNum();
-    MFEM_ASSERT(id < op.Size() && op[id],
-                "Out of bounds access for thread number " << id << "!");
+    MFEM_ASSERT(id < op.Size(), "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
     PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
 
-    // First, get matrix on master thread in COO format, withs rows/cols always on host and
-    // vals potentially on the device. Process skipping zeros if desired.
-    CeedSize nnz;
-    CeedInt *rows, *cols;
-    CeedVector vals;
-    CeedMemType mem;
-    CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem);
-
-    // Convert COO to CSR (on each thread). The COO memory is free'd internally.
-    loc_mat[id] =
-        OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set);
+    // Check if the operator is empty, otherwise assemble.
+    CeedInt nsub_ops;
+    PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op[id], &nsub_ops));
+    if (nsub_ops == 0)
+    {
+      loc_mat[id] = std::make_unique<hypre::HypreCSRMatrix>(op.Height(), op.Width(), 0);
+    }
+    else
+    {
+      // First, get matrix on master thread in COO format, with rows/cols always on host
+      // and vals potentially on the device. Process skipping zeros if desired.
+      CeedSize nnz;
+      CeedInt *rows, *cols;
+      CeedVector vals;
+      CeedMemType mem;
+      CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem);
+
+      // Convert COO to CSR (on each thread). The COO memory is freed internally.
+      loc_mat[id] =
+          OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set);
+    }
   }
 
   // Add CSR matrix objects from each thread (HYPRE's hypre_CSRMatrixAdd uses threads
@@ -443,19 +473,19 @@ std::unique_ptr<hypre::HypreCSRMatrix> CeedOperatorFullAssemble(const Operator &
   {
     b_mat = std::make_unique<hypre::HypreCSRMatrix>(hypre_CSRMatrixClone(*mat, 0));
     hypre_CSRMatrixSetConstantValues(*b_mat, 1.0);
-    for (std::size_t i = 1; i < op.Size(); i++)
+    for (std::size_t id = 1; id < op.Size(); id++)
     {
-      hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[i], 0);
+      hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[id], 0);
       hypre_CSRMatrixSetConstantValues(b_loc_mat, 1.0);
       b_mat = std::make_unique<hypre::HypreCSRMatrix>(
           hypre_CSRMatrixAdd(1.0, *b_mat, 1.0, b_loc_mat));
       hypre_CSRMatrixDestroy(b_loc_mat);
     }
   }
-  for (std::size_t i = 1; i < op.Size(); i++)
+  for (std::size_t id = 1; id < op.Size(); id++)
   {
     mat = std::make_unique<hypre::HypreCSRMatrix>(
-        hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[i]));
+        hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[id]));
   }
   if (set && op.Size() > 1)
   {
@@ -498,10 +528,10 @@ std::unique_ptr<Operator> CeedOperatorCoarsen(const Operator &op_fine,
   // types, integrators) of the original fine operator.
   PalacePragmaOmp(parallel if (op_fine.Size() > 1))
   {
-    Ceed ceed;
     const int id = utils::GetThreadNum();
-    MFEM_ASSERT(id < op_fine.Size() && op_fine[id],
+    MFEM_ASSERT(id < op_fine.Size(),
                 "Out of bounds access for thread number " << id << "!");
+    Ceed ceed;
     PalaceCeedCallBackend(CeedOperatorGetCeed(op_fine[id], &ceed));
     {
       Ceed ceed_parent;
@@ -511,38 +541,21 @@ std::unique_ptr<Operator> CeedOperatorCoarsen(const Operator &op_fine,
        ceed = ceed_parent;
      }
    }
-
-    // Initialize the composite operator on each thread.
-    CeedOperator loc_op;
-    PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op));
-
-    bool composite;
-    PalaceCeedCall(ceed, CeedOperatorIsComposite(op_fine[id], &composite));
-    if (composite)
-    {
-      CeedInt nloc_ops_fine;
-      CeedOperator *loc_ops_fine;
-      PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op_fine[id], &nloc_ops_fine));
-      PalaceCeedCall(ceed, CeedCompositeOperatorGetSubList(op_fine[id], &loc_ops_fine));
-      for (CeedInt k = 0; k < nloc_ops_fine; k++)
-      {
-        CeedOperator sub_op;
-        SingleOperatorCoarsen(ceed, loc_ops_fine[k], &sub_op);
-        PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op));
-        PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
-      }
-    }
-    else
+    CeedInt nsub_ops_fine;
+    CeedOperator *sub_ops_fine;
+    PalaceCeedCall(ceed, CeedCompositeOperatorGetNumSub(op_fine[id], &nsub_ops_fine));
+    PalaceCeedCall(ceed, CeedCompositeOperatorGetSubList(op_fine[id], &sub_ops_fine));
+    for (CeedInt k = 0; k < nsub_ops_fine; k++)
     {
-      CeedOperator sub_op;
-      SingleOperatorCoarsen(ceed, op_fine[id], &sub_op);
-      PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op));
-      PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op));
+      CeedOperator sub_op_coarse;
+      SingleOperatorCoarsen(ceed, sub_ops_fine[k], &sub_op_coarse);
+      op_coarse->AddOper(sub_op_coarse);  // Sub-operator owned by ceed::Operator
     }
-    PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op));
-    op_coarse->AddOper(loc_op);  // Thread-safe
   }
 
+  // Finalize the operator (call CeedOperatorCheckReady).
+  op_coarse->Finalize();
+
   return op_coarse;
 }
 
diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp
index fe44d85ad..72848fcea 100644
--- a/palace/fem/libceed/operator.hpp
+++ b/palace/fem/libceed/operator.hpp
@@ -44,7 +44,9 @@ class Operator : public palace::Operator
   CeedOperator operator[](std::size_t i) const { return op[i]; }
   auto Size() const { return op.size(); }
 
-  void AddOper(CeedOperator op_, CeedOperator op_t_ = nullptr);
+  void AddOper(CeedOperator sub_op, CeedOperator sub_op_t = nullptr);
+
+  void Finalize();
 
   void SetDofMultiplicity(Vector &&mult) { dof_multiplicity = std::move(mult); }
 
diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp
index f76fe40ad..070358e2f 100644
--- a/palace/linalg/rap.cpp
+++ b/palace/linalg/rap.cpp
@@ -106,25 +106,40 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const
 
   if (!use_R)
   {
     const mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix();
-    RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*Rt, hA, *P), true);
+    RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAPKT(*Rt, hA, *P, 1),
+                                                 true);
   }
   else
   {
-    mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix());
-    mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix(
-        test_fespace.GetComm(), test_fespace.GlobalVSize(), test_fespace.GlobalTrueVSize(),
-        test_fespace.Get().GetDofOffsets(), test_fespace.Get().GetTrueDofOffsets(), sRt);
-    RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatrixRAP(*hRt, hA, *P), true);
-    delete sRt;
-    delete hRt;
+    mfem::HypreParMatrix *hR = new mfem::HypreParMatrix(
+        test_fespace.GetComm(), test_fespace.GlobalTrueVSize(), test_fespace.GlobalVSize(),
+        test_fespace.Get().GetTrueDofOffsets(), test_fespace.Get().GetDofOffsets(),
+        const_cast<mfem::SparseMatrix *>(test_fespace.GetRestrictionMatrix()));
+    hypre_ParCSRMatrix *AP = hypre_ParCSRMatMat(hA, *P);
+    RAP = std::make_unique<mfem::HypreParMatrix>(hypre_ParCSRMatMat(*hR, AP), true);
+    hypre_ParCSRMatrixDestroy(AP);
+    delete hR;
   }
   hypre_ParCSRMatrixDiag(hA) = hA_diag;
   hypre_ParCSRMatrixDestroy(hA);
   hypre_ParCSRMatrixSetNumNonzeros(*RAP);
 
+  if (&trial_fespace == &test_fespace)
+  {
+    // Make sure that the first entry in each row is the diagonal one, for a square matrix.
+    hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag((hypre_ParCSRMatrix *)*RAP));
+  }
   // Eliminate boundary conditions on the assembled (square) matrix.
-  RAP->EliminateBC(dbc_tdof_list, diag_policy);
+  if (&trial_fespace == &test_fespace)
+  {
+    RAP->EliminateBC(dbc_tdof_list, diag_policy);
+  }
+  else
+  {
+    MFEM_VERIFY(dbc_tdof_list.Size() == 0,
+                "Essential BC elimination is only available for square ParOperator!");
+  }
 
   return *RAP;
 }
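Note for readers (illustration only, not part of the patch): the functional core of the
fix is the guard around essential BC elimination in ParOperator::ParallelAssemble().
Eliminating a Dirichlet true dof zeroes row i and column i and writes 1 to the diagonal
entry A(i, i), an operation that is only well defined when the trial and test spaces
coincide, i.e. when the assembled matrix is square. With wave ports running on many
processors, rectangular ParOperator instances coupling different trial/test spaces get
assembled, so elimination must be skipped for them and it is instead verified that no
essential dofs were marked. The standalone C++ sketch below illustrates this invariant;
the names in it (Matrix, EliminateBC) are hypothetical stand-ins, not the Palace, MFEM,
or hypre APIs.

#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Minimal dense stand-in for an assembled parallel matrix (row-major storage).
struct Matrix
{
  std::size_t h, w;
  std::vector<double> a;
  Matrix(std::size_t h_, std::size_t w_) : h(h_), w(w_), a(h_ * w_, 0.0) {}
  double &operator()(std::size_t i, std::size_t j) { return a[i * w + j]; }
};

// Essential (Dirichlet) BC elimination: zero row i and column i and set the diagonal
// entry to 1. The operation references A(i, i), so it is only defined for a square
// matrix, which is exactly the invariant the patch enforces.
void EliminateBC(Matrix &A, const std::vector<std::size_t> &dbc_tdof_list)
{
  assert(A.h == A.w && "BC elimination is intended only for square matrices!");
  for (auto i : dbc_tdof_list)
  {
    for (std::size_t j = 0; j < A.w; j++)
    {
      A(i, j) = 0.0;  // Zero the eliminated row.
    }
    for (std::size_t j = 0; j < A.h; j++)
    {
      A(j, i) = 0.0;  // Zero the eliminated column.
    }
    A(i, i) = 1.0;  // Keep the diagonal nonsingular.
  }
}

int main()
{
  // Square operator (trial space == test space): elimination is allowed.
  Matrix A(4, 4);
  EliminateBC(A, {1, 3});

  // Rectangular operator (different trial/test spaces, as for wave port operators
  // distributed over many processes): mirror the patch and require that no essential
  // dofs were marked for elimination at all.
  Matrix B(4, 6);
  std::vector<std::size_t> dbc_tdof_list;  // Must stay empty for rectangular B.
  if (!dbc_tdof_list.empty())
  {
    throw std::runtime_error(
        "Essential BC elimination is only available for square ParOperator!");
  }
  return 0;
}

In the same spirit, the rectangular branch of ParallelAssemble() above moves from an
explicitly transposed restriction matrix with hypre_ParCSRMatrixRAP() to
hypre_ParCSRMatrixRAPKT() and hypre_ParCSRMatMat(), which handle non-square R and P
without ever forming the transpose of the restriction operator.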