From ee10c84d9fef7811f830a7aa9e3b5a2701088d6f Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Mon, 29 Jan 2024 21:59:20 -0800 Subject: [PATCH] Add HypreCSRMatrix wrapper class to replace mfem::SparseMatrix This allows OpenMP and better GPU acceleration for ceed::Operator full assembly into a CSR matrix using Hypre's functionality. --- palace/fem/bilinearform.cpp | 4 +- palace/fem/bilinearform.hpp | 17 +- palace/fem/libceed/operator.cpp | 284 +++++++++++++---------------- palace/fem/libceed/operator.hpp | 14 +- palace/linalg/divfree.cpp | 5 +- palace/linalg/rap.cpp | 217 +++++++++++----------- palace/linalg/rap.hpp | 14 +- palace/models/curlcurloperator.cpp | 5 +- palace/models/laplaceoperator.cpp | 5 +- palace/models/spaceoperator.cpp | 7 +- test/unit/test-libceed.cpp | 106 +++++++---- 11 files changed, 338 insertions(+), 340 deletions(-) diff --git a/palace/fem/bilinearform.cpp b/palace/fem/bilinearform.cpp index 0463c8ed5..307ffa784 100644 --- a/palace/fem/bilinearform.cpp +++ b/palace/fem/bilinearform.cpp @@ -113,8 +113,8 @@ BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace, return op; } -std::unique_ptr BilinearForm::FullAssemble(const ceed::Operator &op, - bool skip_zeros, bool set) +std::unique_ptr BilinearForm::FullAssemble(const ceed::Operator &op, + bool skip_zeros, bool set) { return ceed::CeedOperatorFullAssemble(op, skip_zeros, set); } diff --git a/palace/fem/bilinearform.hpp b/palace/fem/bilinearform.hpp index 9059785e3..fef50f91f 100644 --- a/palace/fem/bilinearform.hpp +++ b/palace/fem/bilinearform.hpp @@ -9,6 +9,7 @@ #include #include "fem/integrator.hpp" #include "fem/libceed/operator.hpp" +#include "linalg/hypre.hpp" namespace palace { @@ -69,19 +70,19 @@ class BilinearForm return PartialAssemble(GetTrialSpace(), GetTestSpace()); } - std::unique_ptr FullAssemble(bool skip_zeros) const + std::unique_ptr FullAssemble(bool skip_zeros) const { return FullAssemble(*PartialAssemble(), skip_zeros, false); } - static std::unique_ptr FullAssemble(const ceed::Operator &op, - bool skip_zeros) + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros) { return FullAssemble(op, skip_zeros, false); } - static std::unique_ptr FullAssemble(const ceed::Operator &op, - bool skip_zeros, bool set); + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros, bool set); std::unique_ptr Assemble(bool skip_zeros) const; @@ -120,13 +121,13 @@ class DiscreteLinearOperator std::unique_ptr PartialAssemble() const; - std::unique_ptr FullAssemble(bool skip_zeros) const + std::unique_ptr FullAssemble(bool skip_zeros) const { return BilinearForm::FullAssemble(*PartialAssemble(), skip_zeros, true); } - static std::unique_ptr FullAssemble(const ceed::Operator &op, - bool skip_zeros) + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros) { return BilinearForm::FullAssemble(op, skip_zeros, true); } diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp index df642e94e..da0fe2a70 100644 --- a/palace/fem/libceed/operator.cpp +++ b/palace/fem/libceed/operator.cpp @@ -5,8 +5,10 @@ #include #include +#include #include #include "fem/fespace.hpp" +#include "linalg/hypre.hpp" #include "utils/omp.hpp" namespace palace::ceed @@ -211,174 +213,71 @@ int CeedInternalFree(void *p) #define CeedInternalCalloc(n, p) CeedInternalCallocArray((n), sizeof(**(p)), p) -void CeedOperatorAssembleCOORemoveZeros(Ceed ceed, CeedSize *nnz, CeedInt **rows, - CeedInt **cols, CeedVector *vals, CeedMemType *mem) -{ 
- // Filter out zero entries. For now, eliminating zeros happens all on the host. - // XX TODO: Use Thrust for this (thrust::copy_if and thrust::zip_iterator) - CeedInt *new_rows, *new_cols; - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_rows)); - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_cols)); - - CeedVector new_vals; - PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, &new_vals)); - - CeedSize q = 0; - const CeedScalar *vals_array; - CeedScalar *new_vals_array; - PalaceCeedCall(ceed, CeedVectorGetArrayRead(*vals, CEED_MEM_HOST, &vals_array)); - PalaceCeedCall(ceed, CeedVectorGetArrayWrite(new_vals, CEED_MEM_HOST, &new_vals_array)); - for (CeedSize k = 0; k < *nnz; k++) - { - if (vals_array[k] != 0.0) - { - new_rows[q] = (*rows)[k]; - new_cols[q] = (*cols)[k]; - new_vals_array[q] = vals_array[k]; - q++; - } - } - PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(*vals, &vals_array)); - PalaceCeedCall(ceed, CeedVectorRestoreArray(new_vals, &new_vals_array)); - - PalaceCeedCall(ceed, CeedInternalFree(rows)); - PalaceCeedCall(ceed, CeedInternalFree(cols)); - PalaceCeedCall(ceed, CeedVectorDestroy(vals)); - - *rows = new_rows; - *cols = new_cols; - *vals = new_vals; - *nnz = q; -} - -void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz, +void CeedOperatorAssembleCOO(Ceed ceed, CeedOperator op, bool skip_zeros, CeedSize *nnz, CeedInt **rows, CeedInt **cols, CeedVector *vals, CeedMemType *mem) { - const std::size_t nt = op.Size(); - if (nt == 0) - { - *nnz = 0; - *rows = nullptr; - *cols = nullptr; - *vals = nullptr; - *mem = CEED_MEM_HOST; - return; - } - - Ceed ceed; - CeedScalar *vals_array; - std::vector loc_nnz(nt), loc_offsets(nt + 1); - std::vector loc_rows(nt), loc_cols(nt); - std::vector loc_vals(nt); - - PalaceCeedCallBackend(CeedOperatorGetCeed(op[0], &ceed)); PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, mem)); - if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) - { - *mem = CEED_MEM_HOST; - } - - PalacePragmaOmp(parallel if (nt > 1)) - { - Ceed ceed; - const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.Size() && op[id], - "Out of bounds access for thread number " << id << "!"); - PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); - // Assemble sparsity pattern (rows, cols are always host pointers). - PalaceCeedCall(ceed, CeedOperatorLinearAssembleSymbolic(op[id], &loc_nnz[id], - &loc_rows[id], &loc_cols[id])); + // Assemble sparsity pattern (rows, cols are always host pointers). + PalaceCeedCall(ceed, CeedOperatorLinearAssembleSymbolic(op, nnz, rows, cols)); - // Assemble values. - PalaceCeedCall(ceed, CeedVectorCreate(ceed, loc_nnz[id], &loc_vals[id])); - PalaceCeedCall(ceed, CeedOperatorLinearAssemble(op[id], loc_vals[id])); - } + // Assemble values. + PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, vals)); + PalaceCeedCall(ceed, CeedOperatorLinearAssemble(op, *vals)); - loc_offsets[0] = 0; - std::inclusive_scan(loc_nnz.begin(), loc_nnz.end(), loc_offsets.begin() + 1); - *nnz = loc_offsets.back(); - if (nt == 1) - { - // Assemble values. - *rows = loc_rows[0]; - *cols = loc_cols[0]; - *vals = loc_vals[0]; - } - else + // Filter out zero entries. For now, eliminating zeros happens all on the host. + // std::cout << " Operator full assembly (COO) has " << *nnz << " NNZ"; + if (skip_zeros && *nnz > 0) { - // Global assembly. 
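The XX TODO above names the intended device-side alternative: compact the (row, col, value) triplets with thrust::copy_if over a zip_iterator instead of the host loop. A minimal sketch of that approach, using plain int/double arrays in place of the CeedInt/CeedScalar types; the helper and its name are illustrative only and not part of this patch:

#include <cstddef>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// Predicate: keep a COO triplet only if its value is an explicit nonzero.
struct IsNonZero
{
  __host__ __device__ bool operator()(const thrust::tuple<int, int, double> &t) const
  {
    return thrust::get<2>(t) != 0.0;
  }
};

// rows, cols, vals are device pointers with nnz entries; the new_* arrays
// receive the compacted triplets. Returns the new number of nonzeros.
std::size_t CompactCOO(const int *rows, const int *cols, const double *vals,
                       std::size_t nnz, int *new_rows, int *new_cols, double *new_vals)
{
  auto in = thrust::make_zip_iterator(thrust::make_tuple(
      thrust::device_pointer_cast(rows), thrust::device_pointer_cast(cols),
      thrust::device_pointer_cast(vals)));
  auto out = thrust::make_zip_iterator(thrust::make_tuple(
      thrust::device_pointer_cast(new_rows), thrust::device_pointer_cast(new_cols),
      thrust::device_pointer_cast(new_vals)));
  return thrust::copy_if(in, in + nnz, out, IsNonZero()) - out;
}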
- PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, rows)); - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, cols)); - PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, vals)); - PalaceCeedCall(ceed, CeedVectorGetArrayWrite(*vals, *mem, &vals_array)); + // XX TODO: Use Thrust for this (thrust::copy_if and thrust::zip_iterator) + CeedInt *new_rows, *new_cols; + PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_rows)); + PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_cols)); + + CeedVector new_vals; + PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, &new_vals)); - PalacePragmaOmp(parallel if (nt > 1)) + CeedSize q = 0; + const CeedScalar *vals_array; + CeedScalar *new_vals_array; + PalaceCeedCall(ceed, CeedVectorGetArrayRead(*vals, CEED_MEM_HOST, &vals_array)); + PalaceCeedCall(ceed, CeedVectorGetArrayWrite(new_vals, CEED_MEM_HOST, &new_vals_array)); + for (CeedSize k = 0; k < *nnz; k++) { - const int id = utils::GetThreadNum(); - MFEM_ASSERT(id < op.Size() && op[id], - "Out of bounds access for thread number " << id << "!"); - const auto start = loc_offsets[id]; - const auto end = loc_offsets[id + 1]; - for (auto k = start; k < end; k++) + if (vals_array[k] != 0.0) { - (*rows)[k] = loc_rows[id][k - start]; - (*cols)[k] = loc_cols[id][k - start]; + new_rows[q] = (*rows)[k]; + new_cols[q] = (*cols)[k]; + new_vals_array[q] = vals_array[k]; + q++; } - - // The CeedVector is on only on device when MFEM is also using the device. This is - // also correctly a non-OpenMP loop when executed on the CPU (OpenMP is handled in - // outer scope above). - Ceed ceed; - const CeedScalar *loc_vals_array; - PalaceCeedCallBackend(CeedVectorGetCeed(loc_vals[id], &ceed)); - PalaceCeedCall(ceed, CeedVectorGetArrayRead(loc_vals[id], *mem, &loc_vals_array)); - mfem::forall_switch(*mem == CEED_MEM_DEVICE, end - start, - [=] MFEM_HOST_DEVICE(int k) - { vals_array[k + start] = loc_vals_array[k]; }); - PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(loc_vals[id], &loc_vals_array)); - PalaceCeedCall(ceed, CeedInternalFree(&loc_rows[id])); - PalaceCeedCall(ceed, CeedInternalFree(&loc_cols[id])); - PalaceCeedCall(ceed, CeedVectorDestroy(&loc_vals[id])); } + PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(*vals, &vals_array)); + PalaceCeedCall(ceed, CeedVectorRestoreArray(new_vals, &new_vals_array)); - PalaceCeedCall(ceed, CeedVectorRestoreArray(*vals, &vals_array)); - } + PalaceCeedCall(ceed, CeedInternalFree(rows)); + PalaceCeedCall(ceed, CeedInternalFree(cols)); + PalaceCeedCall(ceed, CeedVectorDestroy(vals)); + + *nnz = q; + *rows = new_rows; + *cols = new_cols; + *vals = new_vals; - // std::cout << " Operator full assembly (COO) has " << *nnz << " NNZ"; - if (skip_zeros && *nnz > 0) - { - CeedOperatorAssembleCOORemoveZeros(ceed, nnz, rows, cols, vals, mem); // std::cout << " (new NNZ after removal: " << *nnz << ")"; } // std::cout << "\n"; } -} // namespace - -std::unique_ptr CeedOperatorFullAssemble(const Operator &op, - bool skip_zeros, bool set) +std::unique_ptr OperatorCOOtoCSR(Ceed ceed, CeedInt m, CeedInt n, + CeedSize nnz, CeedInt *rows, + CeedInt *cols, CeedVector vals, + CeedMemType mem, bool set) { - // First, get matrix on master thread in COO format, withs rows/cols always on host and - // vals potentially on the device. Process skipping zeros if desired. - CeedSize nnz; - CeedInt *rows, *cols; - CeedVector vals; - CeedMemType mem; - CeedOperatorAssembleCOO(op, skip_zeros, &nnz, &rows, &cols, &vals, &mem); - // Preallocate CSR memory on host (like PETSc's MatSetValuesCOO). 
- auto mat = std::make_unique();
- mat->OverrideSize(op.Height(), op.Width());
- mat->GetMemoryI().New(op.Height() + 1);
- auto *I = mat->GetI();
- mfem::Array J(nnz), perm(nnz), Jmap(nnz + 1);
-
- for (int i = 0; i < op.Height() + 1; i++)
- {
- I[i] = 0;
- }
+ mfem::Array I(m + 1), J(nnz), perm(nnz), Jmap(nnz + 1);
+ I = 0;
 for (int k = 0; k < nnz; k++)
 {
 perm[k] = k;
@@ -419,11 +318,13 @@ std::unique_ptr CeedOperatorFullAssemble(const Operator &op,
 }
 }
 }
- const int nnz_new = q + 1;
+ PalaceCeedCall(ceed, CeedInternalFree(&rows));
+ PalaceCeedCall(ceed, CeedInternalFree(&cols));

 // Finalize I, Jmap.
+ const int nnz_new = q + 1;
 I[0] = 0;
- for (int i = 0; i < op.Height(); i++)
+ for (int i = 0; i < m; i++)
 {
 I[i + 1] += I[i];
 }
@@ -433,23 +334,25 @@ std::unique_ptr CeedOperatorFullAssemble(const Operator &op,
 Jmap[k + 1] += Jmap[k];
 }

- mat->GetMemoryJ().New(nnz_new, mat->GetMemoryJ().GetMemoryType());
- mat->GetMemoryData().New(nnz_new, mat->GetMemoryJ().GetMemoryType());
+ // Construct and fill the final CSR matrix. On GPU, MFEM and Hypre share the same memory
+ // space. On CPU, the inner nested OpenMP loop (if enabled in MFEM) should be ignored.
+ auto mat = std::make_unique(m, n, nnz_new);
+ {
+ const auto *d_I_old = I.Read();
+ auto *d_I = mat->GetI();
+ mfem::forall(m + 1, [=] MFEM_HOST_DEVICE(int i) { d_I[i] = d_I_old[i]; });
+ }
 {
- // This always executes on the device.
 const auto *d_J_old = J.Read();
- auto *d_J = mfem::Write(mat->GetMemoryJ(), nnz_new);
+ auto *d_J = mat->GetJ();
 mfem::forall(nnz_new, [=] MFEM_HOST_DEVICE(int k) { d_J[k] = d_J_old[k]; });
 }
-
- // Fill the values (also always on device).
- if (vals)
 {
 auto FillValues = [&](const double *vals_array)
 {
 const auto *d_perm = perm.Read();
 const auto *d_Jmap = Jmap.Read();
- auto *d_A = mfem::Write(mat->GetMemoryData(), nnz_new);
+ auto *d_A = mat->GetData();
 if (set)
 {
 mfem::forall(nnz_new, [=] MFEM_HOST_DEVICE(int k)
@@ -477,7 +380,6 @@ std::unique_ptr CeedOperatorFullAssemble(const Operator &op,
 {
 // Copy values to device before filling.
 Vector d_vals(nnz);
- d_vals.UseDevice(true);
 {
 auto *d_vals_array = d_vals.HostWrite();
 PalacePragmaOmp(parallel for schedule(static))
@@ -495,8 +397,72 @@ std::unique_ptr CeedOperatorFullAssemble(const Operator &op,
 }
 PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(vals, &vals_array));
 PalaceCeedCall(ceed, CeedVectorDestroy(&vals));
- PalaceCeedCall(ceed, CeedInternalFree(&rows));
- PalaceCeedCall(ceed, CeedInternalFree(&cols));
+ }
+
+ return mat;
+}
+
+} // namespace
+
+std::unique_ptr CeedOperatorFullAssemble(const Operator &op,
+ bool skip_zeros, bool set)
+{
+ if (op.Size() == 0)
+ {
+ return std::make_unique(op.Height(), op.Width(), 0);
+ }
+
+ // Assemble operators on each thread.
+ std::vector> loc_mat(op.Size());
+ PalacePragmaOmp(parallel if (op.Size() > 1))
+ {
+ Ceed ceed;
+ const int id = utils::GetThreadNum();
+ MFEM_ASSERT(id < op.Size() && op[id],
+ "Out of bounds access for thread number " << id << "!");
+ PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed));
+
+ // First, get the matrix on each thread in COO format, with rows/cols always on host
+ // and vals potentially on the device. Zeros are filtered out here if requested.
+ CeedSize nnz;
+ CeedInt *rows, *cols;
+ CeedVector vals;
+ CeedMemType mem;
+ CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem);
+
+ // Convert COO to CSR (on each thread); see the serial sketch below. The COO memory is
+ // freed internally.
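For orientation, the sort-and-merge conversion performed by OperatorCOOtoCSR, reduced to a serial, host-only sketch with standard containers (the patch builds I, J, a sorting permutation, and the duplicate map Jmap with mfem::Array and fills values on the device; the duplicate handling here only approximates the set/add behavior, and this stand-in is illustrative, not the patch's code):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

struct CSRMatrix
{
  std::vector<int> I, J;
  std::vector<double> A;
};

CSRMatrix COOtoCSR(int m, const std::vector<int> &rows, const std::vector<int> &cols,
                   const std::vector<double> &vals, bool set)
{
  // Sort a permutation of the entry indices by (row, col) instead of the data.
  std::vector<int> perm(vals.size());
  std::iota(perm.begin(), perm.end(), 0);
  std::sort(perm.begin(), perm.end(), [&](int a, int b)
            { return std::make_pair(rows[a], cols[a]) < std::make_pair(rows[b], cols[b]); });

  CSRMatrix mat;
  mat.I.assign(m + 1, 0);
  for (std::size_t k = 0; k < perm.size(); k++)
  {
    const int i = rows[perm[k]], j = cols[perm[k]];
    if (k > 0 && i == rows[perm[k - 1]] && j == cols[perm[k - 1]])
    {
      // Duplicate (i, j) entry: accumulate, or keep a single value when set = true.
      if (!set)
      {
        mat.A.back() += vals[perm[k]];
      }
      continue;
    }
    mat.J.push_back(j);
    mat.A.push_back(vals[perm[k]]);
    mat.I[i + 1]++;  // Row counts, turned into row offsets by the scan below
  }
  std::partial_sum(mat.I.begin(), mat.I.end(), mat.I.begin());
  return mat;
}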
+ loc_mat[id] = + OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set); + } + + // Add CSR matrix objects from each thread (HYPRE's hypre_CSRMatrixAdd uses threads + // internally as available). We have to scale the duplicated nonzeros when set = true. + auto mat = std::move(loc_mat[0]); + std::unique_ptr b_mat; + if (set && op.Size() > 1) + { + b_mat = std::make_unique(hypre_CSRMatrixClone(*mat, 0)); + hypre_CSRMatrixSetConstantValues(*b_mat, 1.0); + for (std::size_t i = 1; i < op.Size(); i++) + { + hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[i], 0); + hypre_CSRMatrixSetConstantValues(b_loc_mat, 1.0); + b_mat = std::make_unique( + hypre_CSRMatrixAdd(1.0, *b_mat, 1.0, b_loc_mat)); + hypre_CSRMatrixDestroy(b_loc_mat); + } + } + for (std::size_t i = 1; i < op.Size(); i++) + { + mat = std::make_unique( + hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[i])); + } + if (set && op.Size() > 1) + { + const auto *d_b_data = b_mat->GetData(); + auto *d_data = mat->GetData(); + mfem::forall(mat->NNZ(), + [=] MFEM_HOST_DEVICE(int i) { d_data[i] *= 1.0 / d_b_data[i]; }); } return mat; diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp index 68dd90585..fe44d85ad 100644 --- a/palace/fem/libceed/operator.hpp +++ b/palace/fem/libceed/operator.hpp @@ -6,7 +6,6 @@ #include #include -#include #include "fem/libceed/ceed.hpp" #include "linalg/operator.hpp" #include "linalg/vector.hpp" @@ -16,6 +15,13 @@ namespace palace class FiniteElementSpace; +namespace hypre +{ + +class HypreCSRMatrix; + +} // namespace hypre + namespace ceed { @@ -67,9 +73,9 @@ class SymmetricOperator : public Operator } }; -// Assemble a ceed::Operator as an mfem::SparseMatrix. -std::unique_ptr CeedOperatorFullAssemble(const Operator &op, - bool skip_zeros, bool set); +// Assemble a ceed::Operator as a CSR matrix. +std::unique_ptr CeedOperatorFullAssemble(const Operator &op, + bool skip_zeros, bool set); // Construct a coarse-level ceed::Operator, reusing the quadrature data and quadrature // function from the fine-level operator. Only available for square, symmetric operators diff --git a/palace/linalg/divfree.cpp b/palace/linalg/divfree.cpp index af158c517..853a26d82 100644 --- a/palace/linalg/divfree.cpp +++ b/palace/linalg/divfree.cpp @@ -64,6 +64,10 @@ DivFreeSolver::DivFreeSolver( const auto &h1_fespace_l = h1_fespaces.GetFESpaceAtLevel(l); auto M_l = BuildLevelParOperator(std::move(m_vec[l]), h1_fespace_l); M_l->SetEssentialTrueDofs(h1_bdr_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); + if (l == h1_fespaces.GetNumLevels() - 1) + { + bdr_tdof_list_M = M_l->GetEssentialTrueDofs(); + } M_mg->AddOperator(std::move(M_l)); } M = std::move(M_mg); @@ -76,7 +80,6 @@ DivFreeSolver::DivFreeSolver( h1_fespaces.GetFinestFESpace(), false); } Grad = &h1_fespaces.GetFinestFESpace().GetDiscreteInterpolator(); - bdr_tdof_list_M = &h1_bdr_tdof_lists.back(); // The system matrix for the projection is real and SPD. auto amg = std::make_unique>( diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp index 3b8611da9..f76fe40ad 100644 --- a/palace/linalg/rap.cpp +++ b/palace/linalg/rap.cpp @@ -4,6 +4,7 @@ #include "rap.hpp" #include "fem/bilinearform.hpp" +#include "linalg/hypre.hpp" namespace palace { @@ -14,7 +15,7 @@ ParOperator::ParOperator(std::unique_ptr &&dA, const Operator *pA, : Operator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()), data_A(std::move(dA)), A((data_A != nullptr) ? 
data_A.get() : pA), trial_fespace(trial_fespace), test_fespace(test_fespace), use_R(test_restrict), - dbc_tdof_list(nullptr), diag_policy(DiagonalPolicy::DIAG_ONE), RAP(nullptr) + diag_policy(DiagonalPolicy::DIAG_ONE), RAP(nullptr) { MFEM_VERIFY(A, "Cannot construct ParOperator from an empty matrix!"); } @@ -40,25 +41,20 @@ void ParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, "only DiagonalPolicy::DIAG_ONE or DiagonalPolicy::DIAG_ZERO!"); MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " "for rectangular ParOperator!"); - dbc_tdof_list = &tdof_list; - dbc_tdof_list->Read(true); // Copy to device + tdof_list.Read(); + dbc_tdof_list.MakeRef(tdof_list); diag_policy = policy; } void ParOperator::EliminateRHS(const Vector &x, Vector &b) const { - if (!dbc_tdof_list) - { - return; - } - MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!"); auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); { auto &tx = trial_fespace.GetTVector(); tx = 0.0; - linalg::SetSubVector(tx, *dbc_tdof_list, x); + linalg::SetSubVector(tx, dbc_tdof_list, x); trial_fespace.GetProlongationMatrix()->Mult(tx, lx); } @@ -70,11 +66,11 @@ void ParOperator::EliminateRHS(const Vector &x, Vector &b) const b.Add(-1.0, ty); if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(b, *dbc_tdof_list, x); + linalg::SetSubVector(b, dbc_tdof_list, x); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(b, *dbc_tdof_list, 0.0); + linalg::SetSubVector(b, dbc_tdof_list, 0.0); } } @@ -86,59 +82,50 @@ mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const } // Build the square or rectangular assembled HypreParMatrix. - const auto *sA = dynamic_cast(A); - std::unique_ptr data_sA; + const auto *sA = dynamic_cast(A); + std::unique_ptr data_sA; if (!sA) { const auto *cA = dynamic_cast(A); - MFEM_VERIFY(cA, "ParOperator::ParallelAssemble requires A as an mfem::SparseMatrix or " - "ceed::Operator!"); + MFEM_VERIFY(cA, + "ParOperator::ParallelAssemble requires A as an hypre::HypreCSRMatrix or " + "ceed::Operator!"); data_sA = BilinearForm::FullAssemble(*cA, skip_zeros, use_R); sA = data_sA.get(); } - if (&trial_fespace == &test_fespace) + + hypre_ParCSRMatrix *hA = hypre_ParCSRMatrixCreate( + trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(), + test_fespace.Get().GetDofOffsets(), trial_fespace.Get().GetDofOffsets(), 0, sA->NNZ(), + 0); + hypre_CSRMatrix *hA_diag = hypre_ParCSRMatrixDiag(hA); + hypre_ParCSRMatrixDiag(hA) = *const_cast(sA); + hypre_ParCSRMatrixInitialize(hA); + + const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix(); + if (!use_R) { - mfem::HypreParMatrix *hA = new mfem::HypreParMatrix( - trial_fespace.GetComm(), trial_fespace.GlobalVSize(), - trial_fespace.Get().GetDofOffsets(), const_cast(sA)); - const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix(); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*P, *hA, *P), true); - delete hA; + const mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix(); + RAP = std::make_unique(hypre_ParCSRMatrixRAP(*Rt, hA, *P), true); } else { - mfem::HypreParMatrix *hA = new mfem::HypreParMatrix( - trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(), - test_fespace.Get().GetDofOffsets(), trial_fespace.Get().GetDofOffsets(), - const_cast(sA)); - const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix(); - if (!use_R) - { - const 
mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix(); - RAP = - std::make_unique(hypre_ParCSRMatrixRAP(*Rt, *hA, *P), true); - } - else - { - mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix()); - mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix( - test_fespace.GetComm(), test_fespace.GlobalVSize(), - test_fespace.GlobalTrueVSize(), test_fespace.Get().GetDofOffsets(), - test_fespace.Get().GetTrueDofOffsets(), sRt); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*hRt, *hA, *P), - true); - delete sRt; - delete hRt; - } - delete hA; + mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix()); + mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix( + test_fespace.GetComm(), test_fespace.GlobalVSize(), test_fespace.GlobalTrueVSize(), + test_fespace.Get().GetDofOffsets(), test_fespace.Get().GetTrueDofOffsets(), sRt); + RAP = std::make_unique(hypre_ParCSRMatrixRAP(*hRt, hA, *P), true); + delete sRt; + delete hRt; } + + hypre_ParCSRMatrixDiag(hA) = hA_diag; + hypre_ParCSRMatrixDestroy(hA); hypre_ParCSRMatrixSetNumNonzeros(*RAP); // Eliminate boundary conditions on the assembled (square) matrix. - if (dbc_tdof_list) - { - RAP->EliminateBC(*dbc_tdof_list, diag_policy); - } + RAP->EliminateBC(dbc_tdof_list, diag_policy); + return *RAP; } @@ -156,14 +143,7 @@ void ParOperator::AssembleDiagonal(Vector &diag) const MFEM_VERIFY(&trial_fespace == &test_fespace, "Diagonal assembly is only available for square ParOperator!"); auto &lx = trial_fespace.GetLVector(); - if (const auto *sA = dynamic_cast(A)) - { - sA->GetDiag(lx); - } - else - { - A->AssembleDiagonal(lx); - } + A->AssembleDiagonal(lx); // Parallel assemble and eliminate essential true dofs. const Operator *P = test_fespace.GetProlongationMatrix(); @@ -175,15 +155,17 @@ void ParOperator::AssembleDiagonal(Vector &diag) const { P->MultTranspose(lx, diag); } - if (dbc_tdof_list) + + // Eliminate essential true dofs. 
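The conditional that follows, and its many repetitions in the Mult, MultTranspose, and AddMult variants of ParOperator and ComplexParOperator below, all instantiate one pattern: after applying the operator on interior dofs, constrained true dofs are overwritten so the operator acts as the identity (DIAG_ONE) or as zero (DIAG_ZERO) on them. Factored out, the pattern would read roughly as follows; this is a sketch for clarity, not a helper the patch adds, and the diagonal assembly variant just below sets the constant 1.0 rather than values from x:

// Hypothetical helper (not part of this patch) capturing the diagonal policy
// applied after each operator application in this file.
inline void ApplyDiagonalPolicy(const mfem::Array<int> &dbc_tdof_list,
                                Operator::DiagonalPolicy diag_policy,
                                const Vector &x, Vector &y)
{
  if (!dbc_tdof_list.Size())
  {
    return;  // No constrained essential true dofs
  }
  if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE)
  {
    linalg::SetSubVector(y, dbc_tdof_list, x);  // y = x on constrained dofs
  }
  else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO)
  {
    linalg::SetSubVector(y, dbc_tdof_list, 0.0);  // y = 0 on constrained dofs
  }
}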
+ if (dbc_tdof_list.Size()) { if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(diag, *dbc_tdof_list, 1.0); + linalg::SetSubVector(diag, dbc_tdof_list, 1.0); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(diag, *dbc_tdof_list, 0.0); + linalg::SetSubVector(diag, dbc_tdof_list, 0.0); } } } @@ -200,11 +182,11 @@ void ParOperator::Mult(const Vector &x, Vector &y) const auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &tx = trial_fespace.GetTVector(); tx = x; - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); trial_fespace.GetProlongationMatrix()->Mult(tx, lx); } else @@ -216,15 +198,15 @@ void ParOperator::Mult(const Vector &x, Vector &y) const A->Mult(lx, ly); RestrictionMatrixMult(ly, y); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(y, *dbc_tdof_list, x); + linalg::SetSubVector(y, dbc_tdof_list, x); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); + linalg::SetSubVector(y, dbc_tdof_list, 0.0); } } } @@ -241,11 +223,11 @@ void ParOperator::MultTranspose(const Vector &x, Vector &y) const auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -257,15 +239,15 @@ void ParOperator::MultTranspose(const Vector &x, Vector &y) const A->MultTranspose(ly, lx); trial_fespace.GetProlongationMatrix()->MultTranspose(lx, y); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(y, *dbc_tdof_list, x); + linalg::SetSubVector(y, dbc_tdof_list, x); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); + linalg::SetSubVector(y, dbc_tdof_list, 0.0); } } } @@ -282,11 +264,11 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &tx = trial_fespace.GetTVector(); tx = x; - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); trial_fespace.GetProlongationMatrix()->Mult(tx, lx); } else @@ -299,15 +281,15 @@ void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const auto &ty = test_fespace.GetTVector(); RestrictionMatrixMult(ly, ty); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(ty, *dbc_tdof_list, x); + linalg::SetSubVector(ty, dbc_tdof_list, x); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); } } y.Add(a, ty); @@ -325,11 +307,11 @@ void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) c auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -342,15 +324,15 @@ void ParOperator::AddMultTranspose(const 
Vector &x, Vector &y, const double a) c auto &tx = trial_fespace.GetTVector(); trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(tx, *dbc_tdof_list, x); + linalg::SetSubVector(tx, dbc_tdof_list, x); } else if (diag_policy == DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); } } y.Add(a, tx); @@ -397,8 +379,7 @@ ComplexParOperator::ComplexParOperator(std::unique_ptr &&dAr, ? std::make_unique(std::move(dAr), std::move(dAi)) : std::make_unique(pAr, pAi)), A(data_A.get()), trial_fespace(trial_fespace), test_fespace(test_fespace), - use_R(test_restrict), dbc_tdof_list(nullptr), - diag_policy(Operator::DiagonalPolicy::DIAG_ONE), + use_R(test_restrict), diag_policy(Operator::DiagonalPolicy::DIAG_ONE), RAPr(A->Real() ? std::make_unique(*A->Real(), trial_fespace, test_fespace, use_R) : nullptr), @@ -441,8 +422,8 @@ void ComplexParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, "DiagonalPolicy::DIAG_ONE specified for ComplexParOperator with no real part!"); MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " "for rectangular ComplexParOperator!"); - dbc_tdof_list = &tdof_list; - dbc_tdof_list->Read(true); // Copy to device + tdof_list.Read(); + dbc_tdof_list.MakeRef(tdof_list); diag_policy = policy; if (RAPr) { @@ -472,13 +453,14 @@ void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const { MFEM_ASSERT(x.Size() == width && y.Size() == height, "Incompatible dimensions for ComplexParOperator::Mult!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &tx = trial_fespace.GetTVector(); tx = x; - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real()); trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag()); } @@ -492,15 +474,15 @@ void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const A->Mult(lx, ly); RestrictionMatrixMult(ly, y); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(y, *dbc_tdof_list, x); + linalg::SetSubVector(y, dbc_tdof_list, x); } else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); + linalg::SetSubVector(y, dbc_tdof_list, 0.0); } } } @@ -509,13 +491,14 @@ void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) { MFEM_ASSERT(x.Size() == height && y.Size() == width, "Incompatible dimensions for ComplexParOperator::MultTranspose!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -528,15 +511,15 @@ void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(y, *dbc_tdof_list, x); + 
linalg::SetSubVector(y, dbc_tdof_list, x); } else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); + linalg::SetSubVector(y, dbc_tdof_list, 0.0); } } } @@ -546,13 +529,14 @@ void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x, { MFEM_ASSERT(x.Size() == height && y.Size() == width, "Incompatible dimensions for ComplexParOperator::MultHermitianTranspose!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -565,15 +549,15 @@ void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x, trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(y, *dbc_tdof_list, x); + linalg::SetSubVector(y, dbc_tdof_list, x); } else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); + linalg::SetSubVector(y, dbc_tdof_list, 0.0); } } } @@ -583,13 +567,14 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, { MFEM_ASSERT(x.Size() == width && y.Size() == height, "Incompatible dimensions for ComplexParOperator::AddMult!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &tx = trial_fespace.GetTVector(); tx = x; - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real()); trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag()); } @@ -604,15 +589,15 @@ void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, auto &ty = test_fespace.GetTVector(); RestrictionMatrixMult(ly, ty); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(ty, *dbc_tdof_list, x); + linalg::SetSubVector(ty, dbc_tdof_list, x); } else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); } } y.AXPY(a, ty); @@ -623,13 +608,14 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector { MFEM_ASSERT(x.Size() == height && y.Size() == width, "Incompatible dimensions for ComplexParOperator::AddMultTranspose!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -643,15 +629,15 @@ void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector auto &tx = trial_fespace.GetTVector(); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(tx, *dbc_tdof_list, x); + linalg::SetSubVector(tx, dbc_tdof_list, x); } else 
if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); } } y.AXPY(a, tx); @@ -662,13 +648,14 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl { MFEM_ASSERT(x.Size() == height && y.Size() == width, "Incompatible dimensions for ComplexParOperator::AddMultHermitianTranspose!"); + auto &lx = trial_fespace.GetLVector(); auto &ly = GetTestLVector(); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { auto &ty = test_fespace.GetTVector(); ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); RestrictionMatrixMultTranspose(ty, ly); } else @@ -682,15 +669,15 @@ void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, Compl auto &tx = trial_fespace.GetTVector(); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); - if (dbc_tdof_list) + if (dbc_tdof_list.Size()) { if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) { - linalg::SetSubVector(tx, *dbc_tdof_list, x); + linalg::SetSubVector(tx, dbc_tdof_list, x); } else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) { - linalg::SetSubVector(tx, *dbc_tdof_list, 0.0); + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); } } y.AXPY(a, tx); diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp index 4c9d36174..1be52fa20 100644 --- a/palace/linalg/rap.hpp +++ b/palace/linalg/rap.hpp @@ -32,7 +32,7 @@ class ParOperator : public Operator const bool use_R; // Lists of constrained essential boundary true dofs for elimination. - const mfem::Array *dbc_tdof_list; + mfem::Array dbc_tdof_list; // Diagonal policy for constrained true dofs. DiagonalPolicy diag_policy; @@ -78,7 +78,10 @@ class ParOperator : public Operator // Get the essential boundary condition true dofs associated with the operator. May be // nullptr. - const mfem::Array *GetEssentialTrueDofs() const { return dbc_tdof_list; } + const mfem::Array *GetEssentialTrueDofs() const + { + return dbc_tdof_list.Size() ? &dbc_tdof_list : nullptr; + } // Eliminate essential true dofs from the RHS vector b, using the essential boundary // condition values in x. @@ -119,7 +122,7 @@ class ComplexParOperator : public ComplexOperator const bool use_R; // Lists of constrained essential boundary true dofs for elimination. - const mfem::Array *dbc_tdof_list; + mfem::Array dbc_tdof_list; // Diagonal policy for constrained true dofs. Operator::DiagonalPolicy diag_policy; @@ -174,7 +177,10 @@ class ComplexParOperator : public ComplexOperator // Get the essential boundary condition true dofs associated with the operator. May be // nullptr. - const mfem::Array *GetEssentialTrueDofs() const { return dbc_tdof_list; } + const mfem::Array *GetEssentialTrueDofs() const + { + return dbc_tdof_list.Size() ? 
&dbc_tdof_list : nullptr; + } void AssembleDiagonal(ComplexVector &diag) const override; diff --git a/palace/models/curlcurloperator.cpp b/palace/models/curlcurloperator.cpp index ce49dae17..0fdd0eb15 100644 --- a/palace/models/curlcurloperator.cpp +++ b/palace/models/curlcurloperator.cpp @@ -8,6 +8,7 @@ #include "fem/integrator.hpp" #include "fem/mesh.hpp" #include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" #include "linalg/rap.hpp" #include "utils/communication.hpp" #include "utils/geodata.hpp" @@ -168,9 +169,9 @@ std::unique_ptr CurlCurlOperator::GetStiffnessMatrix() { Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, nd_fespace_l.GetMaxElementOrder(), nd_fespace_l.GlobalTrueVSize()); - if (const auto *k_spm = dynamic_cast(k_vec[l].get())) + if (const auto *k_spm = dynamic_cast(k_vec[l].get())) { - HYPRE_BigInt nnz = k_spm->NumNonZeroElems(); + HYPRE_BigInt nnz = k_spm->NNZ(); Mpi::GlobalSum(1, &nnz, nd_fespace_l.GetComm()); Mpi::Print(", {:d} NNZ\n", nnz); } diff --git a/palace/models/laplaceoperator.cpp b/palace/models/laplaceoperator.cpp index 265b30fc4..769b26756 100644 --- a/palace/models/laplaceoperator.cpp +++ b/palace/models/laplaceoperator.cpp @@ -7,6 +7,7 @@ #include "fem/integrator.hpp" #include "fem/mesh.hpp" #include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" #include "linalg/rap.hpp" #include "utils/communication.hpp" #include "utils/geodata.hpp" @@ -187,9 +188,9 @@ std::unique_ptr LaplaceOperator::GetStiffnessMatrix() { Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, h1_fespace_l.GetMaxElementOrder(), h1_fespace_l.GlobalTrueVSize()); - if (const auto *k_spm = dynamic_cast(k_vec[l].get())) + if (const auto *k_spm = dynamic_cast(k_vec[l].get())) { - HYPRE_BigInt nnz = k_spm->NumNonZeroElems(); + HYPRE_BigInt nnz = k_spm->NNZ(); Mpi::GlobalSum(1, &nnz, h1_fespace_l.GetComm()); Mpi::Print(", {:d} NNZ\n", nnz); } diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp index 7b94367dd..096e68e55 100644 --- a/palace/models/spaceoperator.cpp +++ b/palace/models/spaceoperator.cpp @@ -9,6 +9,7 @@ #include "fem/integrator.hpp" #include "fem/mesh.hpp" #include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" #include "linalg/rap.hpp" #include "utils/communication.hpp" #include "utils/geodata.hpp" @@ -769,14 +770,14 @@ std::unique_ptr SpaceOperator::GetPreconditionerMatrix(double a0, doub { Mpi::Print(" Level {:d}{} (p = {:d}): {:d} unknowns", l, aux ? 
" (auxiliary)" : "", fespace_l.GetMaxElementOrder(), fespace_l.GlobalTrueVSize()); - const auto *b_spm = dynamic_cast(br_l.get()); + const auto *b_spm = dynamic_cast(br_l.get()); if (!b_spm) { - b_spm = dynamic_cast(bi_l.get()); + b_spm = dynamic_cast(bi_l.get()); } if (b_spm) { - HYPRE_BigInt nnz = b_spm->NumNonZeroElems(); + HYPRE_BigInt nnz = b_spm->NNZ(); Mpi::GlobalSum(1, &nnz, fespace_l.GetComm()); Mpi::Print(", {:d} NNZ\n", nnz); } diff --git a/test/unit/test-libceed.cpp b/test/unit/test-libceed.cpp index 397852e1e..725b716bc 100644 --- a/test/unit/test-libceed.cpp +++ b/test/unit/test-libceed.cpp @@ -12,6 +12,7 @@ #include "fem/fespace.hpp" #include "fem/integrator.hpp" #include "fem/mesh.hpp" +#include "linalg/hypre.hpp" #include "models/materialoperator.hpp" #include "utils/communication.hpp" @@ -253,8 +254,7 @@ void TestCeedOperatorMult(const Operator &op_test, const Operator &op_ref, } } -void TestCeedOperatorFullAssemble(const mfem::SparseMatrix &mat_test, - const mfem::SparseMatrix &mat_ref) +void TestCeedOperatorFullAssemble(mfem::SparseMatrix &mat_test, mfem::SparseMatrix &mat_ref) { // Ensure host memory is up to date (mfem::Add is missing the device to host copy). mat_test.HostReadI(); @@ -270,20 +270,47 @@ void TestCeedOperatorFullAssemble(const mfem::SparseMatrix &mat_test, REQUIRE(mat_diff->MaxNorm() < 1.0e-12 * std::max(mat_ref.MaxNorm(), 1.0)); } +void TestCeedOperatorFullAssemble(hypre::HypreCSRMatrix &mat_test, + mfem::SparseMatrix &mat_ref) +{ + // Copy test matrix into MFEM's sparse matrix data type. + hypre_CSRMatrixMigrate(mat_test, HYPRE_MEMORY_HOST); + mfem::SparseMatrix mat_test_sp(mat_test.GetI(), mat_test.GetJ(), mat_test.GetData(), + mat_test.Height(), mat_test.Width(), false, false, false); + + // Perform the test. + TestCeedOperatorFullAssemble(mat_test_sp, mat_ref); +} + +void TestCeedOperatorFullAssemble(hypre::HypreCSRMatrix &mat_test, + hypre::HypreCSRMatrix &mat_ref) +{ + // Copy test and reference matrix into MFEM's sparse matrix data type. + hypre_CSRMatrixMigrate(mat_test, HYPRE_MEMORY_HOST); + hypre_CSRMatrixMigrate(mat_ref, HYPRE_MEMORY_HOST); + mfem::SparseMatrix mat_test_sp(mat_test.GetI(), mat_test.GetJ(), mat_test.GetData(), + mat_test.Height(), mat_test.Width(), false, false, false); + mfem::SparseMatrix mat_ref_sp(mat_ref.GetI(), mat_ref.GetJ(), mat_ref.GetData(), + mat_ref.Height(), mat_ref.Width(), false, false, false); + + // Perform the test. + TestCeedOperatorFullAssemble(mat_test_sp, mat_ref_sp); +} + template void TestCeedOperator(T1 &a_test, T2 &a_ref, bool test_transpose, bool skip_zeros) { a_ref.Assemble(skip_zeros); a_ref.Finalize(skip_zeros); - const mfem::SparseMatrix *mat_ref = &a_ref.SpMat(); - const Operator *op_ref = mat_ref; + auto *mat_ref = &a_ref.SpMat(); + auto *op_ref = mat_ref; // Test operator application. - const auto op_test = a_test.PartialAssemble(); + auto op_test = a_test.PartialAssemble(); TestCeedOperatorMult(*op_test, *op_ref, test_transpose); // Test full assembly. - const auto mat_test = a_test.FullAssemble(*op_test, skip_zeros); + auto mat_test = a_test.FullAssemble(*op_test, skip_zeros); TestCeedOperatorFullAssemble(*mat_test, *mat_ref); // Test diagonal assembly if possible. 
@@ -350,11 +377,11 @@ void BenchmarkCeedIntegrator(FiniteElementSpace &fespace, T1 AssembleTest, if (!benchmark_no_fa) { constexpr bool bdr_integ = true; - const auto op_test = AssembleTest(fespace, bdr_integ); - const auto op_test_ref = AssembleTestRef(fespace, bdr_integ); - const auto mat_test = BilinearForm::FullAssemble(*op_test, skip_zeros); - const auto mat_test_ref = BilinearForm::FullAssemble(*op_test_ref, skip_zeros); - nnz = mat_test->NumNonZeroElems(); + auto op_test = AssembleTest(fespace, bdr_integ); + auto op_test_ref = AssembleTestRef(fespace, bdr_integ); + auto mat_test = BilinearForm::FullAssemble(*op_test, skip_zeros); + auto mat_test_ref = BilinearForm::FullAssemble(*op_test_ref, skip_zeros); + nnz = mat_test->NNZ(); TestCeedOperatorFullAssemble(*mat_test, *mat_test_ref); } @@ -363,11 +390,11 @@ void BenchmarkCeedIntegrator(FiniteElementSpace &fespace, T1 AssembleTest, { BENCHMARK("Assemble (MFEM Legacy)") { - const auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::LEGACY, skip_zeros); + auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::LEGACY, skip_zeros); return op_ref->Height(); }; { - const auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::LEGACY, skip_zeros); + auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::LEGACY, skip_zeros); y_ref = 0.0; BENCHMARK("AddMult (MFEM Legacy)") { @@ -382,11 +409,11 @@ void BenchmarkCeedIntegrator(FiniteElementSpace &fespace, T1 AssembleTest, { BENCHMARK("Assemble (MFEM Partial)") { - const auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::PARTIAL, skip_zeros); + auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::PARTIAL, skip_zeros); return op_ref->Height(); }; { - const auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::PARTIAL, skip_zeros); + auto op_ref = AssembleRef(fespace, mfem::AssemblyLevel::PARTIAL, skip_zeros); y_ref = 0.0; BENCHMARK("AddMult (MFEM Partial)") { @@ -401,11 +428,11 @@ void BenchmarkCeedIntegrator(FiniteElementSpace &fespace, T1 AssembleTest, // Benchmark libCEED assembly. 
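The BENCHMARK blocks in these tests come from Catch2's microbenchmarking support: the body is run repeatedly under a timer, and returning a value from it keeps the optimizer from discarding the measured work. A minimal standalone example, assuming Catch2 v3 with its benchmark headers (the exact include layout in this test suite may differ):

#include <catch2/benchmark/catch_benchmark.hpp>
#include <catch2/catch_test_macros.hpp>

TEST_CASE("Benchmark example")
{
  BENCHMARK("Sum integers")
  {
    long sum = 0;
    for (int i = 0; i < 1000; i++)
    {
      sum += i;
    }
    return sum;  // Returning the result prevents dead-code elimination
  };
}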
BENCHMARK("Assemble (libCEED)") { - const auto op_test = AssembleTest(fespace); + auto op_test = AssembleTest(fespace); return op_test->Height(); }; { - const auto op_test = AssembleTest(fespace); + auto op_test = AssembleTest(fespace); y_test = 0.0; BENCHMARK("AddMult (libCEED)") { @@ -417,9 +444,9 @@ void BenchmarkCeedIntegrator(FiniteElementSpace &fespace, T1 AssembleTest, { BENCHMARK("Full Assemble (libCEED)") { - const auto op_test = AssembleTest(fespace); - const auto mat_test = BilinearForm::FullAssemble(*op_test, skip_zeros); - return mat_test->NumNonZeroElems(); + auto op_test = AssembleTest(fespace); + auto mat_test = BilinearForm::FullAssemble(*op_test, skip_zeros); + return mat_test->NNZ(); }; } @@ -473,12 +500,12 @@ void BenchmarkCeedInterpolator(FiniteElementSpace &trial_fespace, std::size_t nnz = 0; if (!benchmark_no_fa) { - const auto op_test = AssembleTest(trial_fespace, test_fespace); - const auto op_ref = AssembleRef(trial_fespace, test_fespace, - mfem::AssemblyLevel::LEGACY, skip_zeros_interp); - const auto mat_test = DiscreteLinearOperator::FullAssemble(*op_test, skip_zeros_interp); - const auto *mat_ref = &op_ref->SpMat(); - nnz = mat_test->NumNonZeroElems(); + auto op_test = AssembleTest(trial_fespace, test_fespace); + auto op_ref = AssembleRef(trial_fespace, test_fespace, mfem::AssemblyLevel::LEGACY, + skip_zeros_interp); + auto mat_test = DiscreteLinearOperator::FullAssemble(*op_test, skip_zeros_interp); + auto *mat_ref = &op_ref->SpMat(); + nnz = mat_test->NNZ(); TestCeedOperatorFullAssemble(*mat_test, *mat_ref); } @@ -487,13 +514,13 @@ void BenchmarkCeedInterpolator(FiniteElementSpace &trial_fespace, { BENCHMARK("Assemble (MFEM Legacy)") { - const auto op_ref = AssembleRef(trial_fespace, test_fespace, - mfem::AssemblyLevel::LEGACY, skip_zeros_interp); + auto op_ref = AssembleRef(trial_fespace, test_fespace, mfem::AssemblyLevel::LEGACY, + skip_zeros_interp); return op_ref->Height(); }; { - const auto op_ref = AssembleRef(trial_fespace, test_fespace, - mfem::AssemblyLevel::LEGACY, skip_zeros_interp); + auto op_ref = AssembleRef(trial_fespace, test_fespace, mfem::AssemblyLevel::LEGACY, + skip_zeros_interp); y_ref = 0.0; BENCHMARK("AddMult (MFEM Legacy)") { @@ -509,13 +536,13 @@ void BenchmarkCeedInterpolator(FiniteElementSpace &trial_fespace, { BENCHMARK("Assemble (MFEM Partial)") { - const auto op_ref = AssembleRef(trial_fespace, test_fespace, - mfem::AssemblyLevel::PARTIAL, skip_zeros_interp); + auto op_ref = AssembleRef(trial_fespace, test_fespace, mfem::AssemblyLevel::PARTIAL, + skip_zeros_interp); return op_ref->Height(); }; { - const auto op_ref = AssembleRef(trial_fespace, test_fespace, - mfem::AssemblyLevel::PARTIAL, skip_zeros_interp); + auto op_ref = AssembleRef(trial_fespace, test_fespace, mfem::AssemblyLevel::PARTIAL, + skip_zeros_interp); y_ref = 0.0; BENCHMARK("AddMult (MFEM Partial)") { @@ -530,11 +557,11 @@ void BenchmarkCeedInterpolator(FiniteElementSpace &trial_fespace, // Benchmark libCEED assembly. 
BENCHMARK("Assemble (libCEED)") { - const auto op_test = AssembleTest(trial_fespace, test_fespace); + auto op_test = AssembleTest(trial_fespace, test_fespace); return op_test->Height(); }; { - const auto op_test = AssembleTest(trial_fespace, test_fespace); + auto op_test = AssembleTest(trial_fespace, test_fespace); y_test = 0.0; BENCHMARK("AddMult (libCEED)") { @@ -546,10 +573,9 @@ void BenchmarkCeedInterpolator(FiniteElementSpace &trial_fespace, { BENCHMARK("Full Assemble (libCEED)") { - const auto op_test = AssembleTest(trial_fespace, test_fespace); - const auto mat_test = - DiscreteLinearOperator::FullAssemble(*op_test, skip_zeros_interp); - return mat_test->NumNonZeroElems(); + auto op_test = AssembleTest(trial_fespace, test_fespace); + auto mat_test = DiscreteLinearOperator::FullAssemble(*op_test, skip_zeros_interp); + return mat_test->NNZ(); }; }