diff --git a/examples/sparse/CMakeLists.txt b/examples/sparse/CMakeLists.txt index 5fc2eaf8..1ee36d54 100644 --- a/examples/sparse/CMakeLists.txt +++ b/examples/sparse/CMakeLists.txt @@ -2,6 +2,8 @@ add_executable(testPoisson2d EXCLUDE_FROM_ALL testPoisson2d.cpp) add_executable(testMMdouble EXCLUDE_FROM_ALL testMMdouble.cpp) add_executable(testPoisson3d EXCLUDE_FROM_ALL testPoisson3d.cpp) add_executable(testMixedPrecision EXCLUDE_FROM_ALL testMixedPrecision.cpp) +add_executable(testMixedPrecisionSymmetricPositiveDefinite EXCLUDE_FROM_ALL testMixedPrecisionSymmetricPositiveDefinite.cpp) +add_executable(testSymmetricPositiveDefinite EXCLUDE_FROM_ALL testSymmetricPositiveDefinite.cpp) add_executable(sexample EXCLUDE_FROM_ALL sexample.c) add_executable(dexample EXCLUDE_FROM_ALL dexample.c) add_executable(cexample EXCLUDE_FROM_ALL cexample.c) @@ -13,6 +15,8 @@ target_link_libraries(testPoisson2d strumpack) target_link_libraries(testMMdouble strumpack) target_link_libraries(testPoisson3d strumpack) target_link_libraries(testMixedPrecision strumpack) +target_link_libraries(testMixedPrecisionSymmetricPositiveDefinite strumpack) +target_link_libraries(testSymmetricPositiveDefinite strumpack) target_link_libraries(sexample strumpack) target_link_libraries(dexample strumpack) target_link_libraries(cexample strumpack) @@ -24,6 +28,8 @@ add_dependencies(examples testMMdouble testPoisson3d testMixedPrecision + testMixedPrecisionSymmetricPositiveDefinite + testSymmetricPositiveDefinite sexample dexample cexample diff --git a/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp b/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..d2476f68 --- /dev/null +++ b/examples/sparse/testMixedPrecisionSymmetricPositiveDefinite.cpp @@ -0,0 +1,181 @@ +// +// Created by tingxuan on 2023/12/24. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). 
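 *
 * (Note: the two drivers added to the CMakeLists above are EXCLUDE_FROM_ALL,
 * so they are only built on request. A minimal sketch of building and running
 * this one, with a placeholder path to a symmetric positive definite matrix
 * in Matrix Market format:
 *
 *   cmake --build . --target testMixedPrecisionSymmetricPositiveDefinite
 *   ./examples/sparse/testMixedPrecisionSymmetricPositiveDefinite /path/to/spd_matrix.mtx
 * )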
+ * + */ +#include +#include +#include +#include + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + + +/** + * Test the STRUMPACK sparse solver, and the mixed precision sparse + * solver. + * + * For working_t == float, the mixed precision solver will + * compute the factorization in single precision, but do iterative + * refinement in double precision to give a more accurate results than + * the standard single precision solver. + * + * For working_t == double, the mixed precision solver will compute + * the factorization in single precision and perform the iterative + * refinement in double precision. If the problem is not too + * ill-conditioned, this should be about as accurate, and about twice + * as fast as the standard double precision solver. The speedup + * depends on the relative cost of the sparse triangular solver phase + * compared to the sparse LU factorization phase. + * + * TODO long double + */ +template +void test(CSRMatrix& A, + DenseMatrix& b, DenseMatrix& x_exact, + int argc, char* argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " << + (std::is_same::value ? "single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + } + + std::cout << std::endl; +} + + +int main(int argc, char* argv[]) { + + std::cout << "long double size in bytes: " + << sizeof(long double) << " " + << std::endl; + + std::string f; + if (argc > 1) f = std::string(argv[1]); + + CSRMatrix A_d; + A_d.read_matrix_market(f); + auto A_f = cast_matrix(A_d); + + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + + + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + 
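  // A minimal sketch of this setup step (the spmv call is an assumption about
  // the CSRMatrix interface): fill x_true_d with uniform random entries as in
  // LAWN 165, then form the right-hand side with a sparse matrix-vector
  // product so that x_true_d is the exact solution of A x = b:
  //
  //   for (int j=0; j<m; j++)
  //     for (int i=0; i<N; i++)
  //       x_true_d(i, j) = dist(gen);
  //   A_d.spmv(x_true_d, b_d);   // b = A * x_exact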
for (int j=0; j x(N, m); + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.set_matrix(A_d); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solve(b_d, x); + + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A_d.max_scaled_residual(x.data(), b_d.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_true_d.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_true_d.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + + } + + // cast RHS and true solution to single precision + DenseMatrix b_f(N, m), x_true_f(N, m); + copy(x_true_d, x_true_f); + copy(b_d, b_f); + + test(A_d, b_d, x_true_d, argc, argv); + test(A_f, b_f, x_true_f, argc, argv); + + return 0; +} diff --git a/examples/sparse/testSymmetricPositiveDefinite.cpp b/examples/sparse/testSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..39383e01 --- /dev/null +++ b/examples/sparse/testSymmetricPositiveDefinite.cpp @@ -0,0 +1,181 @@ +// +// Created by tingxuan on 2023/12/24. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include +#include +#include +#include + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + + +/** + * Test the STRUMPACK sparse solver, and the mixed precision sparse + * solver. + * + * For working_t == float, the mixed precision solver will + * compute the factorization in single precision, but do iterative + * refinement in double precision to give a more accurate results than + * the standard single precision solver. + * + * For working_t == double, the mixed precision solver will compute + * the factorization in single precision and perform the iterative + * refinement in double precision. 
If the problem is not too + * ill-conditioned, this should be about as accurate, and about twice + * as fast as the standard double precision solver. The speedup + * depends on the relative cost of the sparse triangular solver phase + * compared to the sparse LU factorization phase. + * + * TODO long double + */ +template +void test(CSRMatrix& A, + DenseMatrix& b, DenseMatrix& x_exact, + int argc, char* argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " << + (std::is_same::value ? "single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); +// spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error/nrm_x_exact) << std::endl; + } + + std::cout << std::endl; +} + + +int main(int argc, char* argv[]) { + + CSRMatrix A_d; + if(argc > 1){ + A_d.read_matrix_market(argv[1]); + }else{ + int n =3; + int ptr[4] = {0,2,3,5}; + int Index[5] = {0,2,1,0,2}; + double val[5] = {2.1,1,3.5,1,5.2}; + A_d = CSRMatrix(n, ptr, Index, val); + } + + + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + + + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + for (int j=0; j x(N, m); + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.set_matrix(A_d); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solve(b_d, x); + + std::cout<<"x_true_d="; + for(int r=0; r> - (s, sbegin, send, upd); + if (is_symmetric(opts) && is_positive_definite(opts)){ + front.reset + (new FrontalMatrixGPUSymmetricPositiveDefinite(s, sbegin, send, upd)); + } else { + front.reset + (new FrontalMatrixGPU(s, sbegin, send, upd)); + } #endif #endif if (root) fc.dense++; diff --git 
a/src/sparse/fronts/FrontFactory.hpp b/src/sparse/fronts/FrontFactory.hpp index f5e3bc57..d5749ca0 100644 --- a/src/sparse/fronts/FrontFactory.hpp +++ b/src/sparse/fronts/FrontFactory.hpp @@ -65,6 +65,22 @@ namespace strumpack { #endif } + template bool is_symmetric + (const SPOptions& opts) { +#if defined(STRUMPACK_USE_CUDA) || defined(STRUMPACK_USE_HIP) || defined(STRUMPACK_USE_SYCL) + return opts.use_symmetric() && opts.compression() == CompressionType::NONE; +#endif + return false; + } + + template bool is_positive_definite + (const SPOptions& opts) { +#if defined(STRUMPACK_USE_CUDA) || defined(STRUMPACK_USE_HIP) || defined(STRUMPACK_USE_SYCL) + return opts.use_positive_definite(); +#endif + return false; + } + template bool is_HSS (int dsep, int dupd, const SPOptions& opts) { return opts.compression() == CompressionType::HSS && diff --git a/src/sparse/fronts/FrontalMatrix.hpp b/src/sparse/fronts/FrontalMatrix.hpp index 97ef067b..d48ba00f 100644 --- a/src/sparse/fronts/FrontalMatrix.hpp +++ b/src/sparse/fronts/FrontalMatrix.hpp @@ -107,6 +107,13 @@ namespace strumpack { VectorPool workspace; return factor(A, opts, workspace, etree_level, task_depth); } + + // TODO(Jie): make it pure virtual + virtual ReturnCode + multifrontal_factorization_symmetric(const SpMat_t& A, const Opts_t& opts, + int etree_level=0, int task_depth=0) { abort(); }; + + virtual ReturnCode factor(const SpMat_t& A, const Opts_t& opts, VectorPool& workspace, int etree_level=0, int task_depth=0) { @@ -197,6 +204,15 @@ namespace strumpack { int task_depth) { extend_add_to_dense(paF11, paF12, paF21, paF22, p, task_depth); } + // TODO(Jie): make it pure virtual + /* + * This is for symmetric + */ + virtual void + extend_add_to_dense(DenseM_t& paF11, + DenseM_t& paF21, DenseM_t& paF22, + const FrontalMatrix* p, + int task_depth) { abort(); } virtual void extend_add_to_blr(BLRM_t& paF11, BLRM_t& paF12, diff --git a/src/sparse/fronts/FrontalMatrixGPUKernels.hpp b/src/sparse/fronts/FrontalMatrixGPUKernels.hpp index ffd1633d..3b9942f1 100644 --- a/src/sparse/fronts/FrontalMatrixGPUKernels.hpp +++ b/src/sparse/fronts/FrontalMatrixGPUKernels.hpp @@ -83,10 +83,18 @@ namespace strumpack { template void assemble(unsigned int, AssembleData*, AssembleData*); + template void + assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template::value_type> void factor_block_batch(unsigned int, FrontData*, bool, real_t, int*); + template::value_type> + void factor_symmetric_block_batch(unsigned int count, FrontData* dat, + bool replace, real_t thresh, int* dinfo); + template::value_type> void replace_pivots(int, T*, real_t, gpu::Stream* = nullptr); diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp new file mode 100644 index 00000000..693c76a2 --- /dev/null +++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cpp @@ -0,0 +1,1094 @@ +// +// Created by tingxuan on 23-6-19. +// +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. 
Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include + +#include "FrontalMatrixGPUSymmetricPositiveDefinite.hpp" +#include "FrontalMatrixGPUKernels.hpp" + +#if defined(STRUMPACK_USE_MPI) +#include "ExtendAdd.hpp" +#include "FrontalMatrixMPI.hpp" +#endif + +namespace strumpack { + + template class LevelInfoUnified { + using F_t = FrontalMatrix; + using FG_t = FrontalMatrixGPUSymmetricPositiveDefinite; + using DenseMW_t = DenseMatrixWrapper; + using SpMat_t = CompressedSparseMatrix; + + public: + LevelInfoUnified() {} + + LevelInfoUnified(const std::vector& fronts, gpu::Handle& handle, + int max_streams, const SpMat_t* A) { + if (!A->symm_sparse()) { + f.reserve(fronts.size()); + for (auto& F : fronts) + f.push_back(dynamic_cast(F)); + std::size_t max_dsep = 0; + // This pragma causes "internal error: null pointer" on the + // intel compiler. It seems to be because the variables are + // class members. + // #pragma omp parallel for reduction(+:L_size,U_size,Schur_size,piv_size,total_upd_size,N8,N16,N24,N32,factors_small) reduction(max:max_dsep) + for (std::size_t i=0; idim_sep(); + const std::size_t dupd = F->dim_upd(); + diagonal_size += dsep * dsep; + off_diagonal_size += 2 * dsep * dupd; + L_size += dsep*(dsep + dupd); + U_size += dsep*dupd; + Schur_size += dupd*dupd; + piv_size += dsep; + total_upd_size += dupd; + if (dsep <= 32) { + factors_diagonal_small += dsep * dsep; + factors_off_diagonal_small += 2 * dsep * dupd; + factors_small += dsep*(dsep + 2*dupd); + if (dsep <= 8) N8++; + else if (dsep <= 16) N16++; + else if (dsep <= 24) N24++; + else N32++; + } + if (dsep > max_dsep) max_dsep = dsep; + } + small_fronts = N8 + N16 + N24 + N32; + if (small_fronts && small_fronts != f.size()) + std::partition + (f.begin(), f.end(), [](const FG_t* const& a) -> bool { + return a->dim_sep() <= 32; }); + { + auto N = f.size(); + elems11.resize(N+1); + elems12.resize(N+1); + elems21.resize(N+1); + Isize.resize(N+1); +#pragma omp parallel for + for (std::size_t i=0; icount_front_elements + (F.sep_begin(), F.sep_end(), F.upd(), + elems11[i+1], elems12[i+1], elems21[i+1]); + if (F.lchild_) Isize[i+1] += F.lchild_->dim_upd(); + if (F.rchild_) Isize[i+1] += F.rchild_->dim_upd(); + } + for (std::size_t i=0; i(handle, max_dsep); + getrf_work_size = gpu::potrf_buffersize(handle, UpLo::L, max_dsep); + + factor_bytes = sizeof(scalar_t) * factor_size; + factor_bytes = gpu::round_up(factor_bytes); + + work_bytes = sizeof(scalar_t) * (Schur_size + getrf_work_size * max_streams); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(int) * (piv_size + f.size()); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(gpu::FrontData) * (N8 + N16 + N24 + N32); + 
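        // At this point work_bytes covers, in order: the Schur complements
        // (F22) of all fronts in this level, one potrf work buffer per
        // stream, the pivot and per-front info integers, and the FrontData
        // descriptors for the small (dim_sep <= 32) fronts.
        // set_work_pointers() later carves these sub-buffers out of a single
        // device allocation, which is why each segment is padded with
        // gpu::round_up().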
work_bytes = gpu::round_up(work_bytes); + + ea_bytes = sizeof(gpu::AssembleData) * f.size(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(std::size_t) * Isize.back(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(Triplet) * (elems11.back() + elems12.back() + elems21.back()); + ea_bytes = gpu::round_up(ea_bytes); + } else { + f.reserve(fronts.size()); + for (auto& F : fronts) + f.push_back(dynamic_cast(F)); + std::size_t max_dsep = 0; + // This pragma causes "internal error: null pointer" on the + // intel compiler. It seems to be because the variables are + // class members. + // #pragma omp parallel for reduction(+:L_size,U_size,Schur_size,piv_size,total_upd_size,N8,N16,N24,N32,factors_small) reduction(max:max_dsep) + for (std::size_t i=0; idim_sep(); + const std::size_t dupd = F->dim_upd(); + diagonal_size += dsep * dsep; + off_diagonal_size += dsep * dupd; + L_size += dsep*dsep; + U_size += dsep*dupd; + Schur_size += dupd*dupd; + piv_size += dsep; + total_upd_size += dupd; + if (dsep <= 32) { + factors_diagonal_small += dsep * dsep; + factors_off_diagonal_small += dsep * dupd; + factors_small += dsep*(dsep + dupd); + if (dsep <= 8) N8++; + else if (dsep <= 16) N16++; + else if (dsep <= 24) N24++; + else N32++; + } + if (dsep > max_dsep) max_dsep = dsep; + } + small_fronts = N8 + N16 + N24 + N32; + if (small_fronts && small_fronts != f.size()) + std::partition + (f.begin(), f.end(), [](const FG_t* const& a) -> bool { + return a->dim_sep() <= 32; }); + { + auto N = f.size(); + elems11.resize(N+1); + elems21.resize(N+1); + Isize.resize(N+1); +#pragma omp parallel for + for (std::size_t i=0; icount_front_elements_symmetric + (F.sep_begin(), F.sep_end(), F.upd(), + elems11[i+1], elems21[i+1]); + if (F.lchild_) Isize[i+1] += F.lchild_->dim_upd(); + if (F.rchild_) Isize[i+1] += F.rchild_->dim_upd(); + } + for (std::size_t i=0; i(handle, max_dsep); + getrf_work_size = gpu::potrf_buffersize(handle, UpLo::L, max_dsep); + + factor_bytes = sizeof(scalar_t) * factor_size; + factor_bytes = gpu::round_up(factor_bytes); + + work_bytes = sizeof(scalar_t) * (Schur_size + getrf_work_size * max_streams); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(int) * (piv_size + f.size()); + work_bytes = gpu::round_up(work_bytes); + work_bytes += sizeof(gpu::FrontData) * (N8 + N16 + N24 + N32); + work_bytes = gpu::round_up(work_bytes); + + ea_bytes = sizeof(gpu::AssembleData) * f.size(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(std::size_t) * Isize.back(); + ea_bytes = gpu::round_up(ea_bytes); + ea_bytes += sizeof(Triplet) * (elems11.back() + elems21.back()); + ea_bytes = gpu::round_up(ea_bytes); + } + } + + void print_info(int l, int lvls) { + std::cout << "# level " << l << " of " << lvls + << " has " << f.size() << " nodes and " + << N8 << " <=8, " << N16 << " <=16, " + << N24 << " <=24, " << N32 << " <=32, needs " + << factor_bytes / 1.e6 + << " MB for factors, " + << Schur_size * sizeof(scalar_t) / 1.e6 + << " MB for Schur complements" << std::endl; + } + + void flops(long long& level_flops, long long& small_flops) { + level_flops = small_flops = 0; + auto N = f.size(); +#pragma omp parallel for reduction(+: level_flops, small_flops) + for (std::size_t i=0; iF11_) + + gemm_flops(Trans::N, Trans::N, scalar_t(-1.), + F->F21_, F->F12_, scalar_t(1.)) + + trsm_flops(Side::L, scalar_t(1.), F->F11_, F->F12_) + + trsm_flops(Side::R, scalar_t(1.), F->F11_, F->F21_); + level_flops += flops; + if (F->dim_sep() <= 32) + small_flops += flops; + } + } + + /* + * 
first store L factors, then U factors, + * F11, F21, F11, F21, ..., F12, F12, ... + */ + void set_factor_pointers(scalar_t* factors) { + for (auto F : f) { + const std::size_t dsep = F->dim_sep(); + const std::size_t dupd = F->dim_upd(); + F->F11_ = DenseMW_t(dsep, dsep, factors, dsep); factors += dsep*dsep; + F->F12_ = DenseMW_t(dsep, dupd, factors, dsep); factors += dsep*dupd; + F->F21_ = DenseMW_t(dupd, dsep, factors, dupd); factors += dupd*dsep; + } + } + + void set_factor_pointers(scalar_t* factors_diagonal, scalar_t* factors_off_diagonal) { + for (auto F : f) { + const std::size_t dsep = F->dim_sep(); + const std::size_t dupd = F->dim_upd(); + F->F11_ = DenseMW_t(dsep, dsep, factors_diagonal, dsep); factors_diagonal += dsep*dsep; + F->F21_ = DenseMW_t(dupd, dsep, factors_off_diagonal, dupd); factors_off_diagonal += dupd*dsep; + } + } + + void set_pivot_pointers(int* pmem) { + for (auto F : f) { + F->piv_ = pmem; + pmem += F->dim_sep(); + } + } + + void set_work_pointers(void* wmem, int max_streams) { + auto schur = gpu::aligned_ptr(wmem); + for (auto F : f) { + const int dupd = F->dim_upd(); + if (dupd) { + F->F22_ = DenseMW_t(dupd, dupd, schur, dupd); + schur += dupd*dupd; + } + } + dev_getrf_work = schur; + schur += max_streams * getrf_work_size; + auto imem = gpu::aligned_ptr(schur); + for (auto F : f) { + F->piv_ = imem; + imem += F->dim_sep(); + } + dev_getrf_err = imem; imem += f.size(); + auto fdat = gpu::aligned_ptr>(imem); + f8 = fdat; fdat += N8; + f16 = fdat; fdat += N16; + f24 = fdat; fdat += N24; + f32 = fdat; fdat += N32; + } + + int align = 0; + std::vector f; + std::size_t diagonal_size = 0, off_diagonal_size = 0, L_size = 0, U_size = 0, + factors_diagonal_small = 0, factors_off_diagonal_small = 0, + factor_size = 0, factors_small = 0, Schur_size = 0, piv_size = 0, + total_upd_size = 0; + std::size_t N8 = 0, N16 = 0, N24 = 0, N32 = 0, small_fronts = 0; + std::size_t work_bytes = 0, ea_bytes = 0, factor_bytes = 0; + std::vector elems11, elems12, elems21, Isize; + scalar_t* dev_getrf_work = nullptr; + int* dev_getrf_err = nullptr; + int getrf_work_size = 0; + gpu::FrontData *f8 = nullptr, *f16 = nullptr, + *f24 = nullptr, *f32 = nullptr; + }; + + + template + FrontalMatrixGPUSymmetricPositiveDefinite::FrontalMatrixGPUSymmetricPositiveDefinite + (integer_t sep, integer_t sep_begin, integer_t sep_end, + std::vector& upd) + : F_t(nullptr, nullptr, sep, sep_begin, sep_end, upd) {} + + template + FrontalMatrixGPUSymmetricPositiveDefinite::~FrontalMatrixGPUSymmetricPositiveDefinite() { +#if defined(STRUMPACK_COUNT_FLOPS) + const std::size_t dupd = dim_upd(); + const std::size_t dsep = dim_sep(); + STRUMPACK_SUB_MEMORY(dsep*(dsep+2*dupd)*sizeof(scalar_t)); +#endif + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::release_work_memory() { + F22_.clear(); + host_Schur_.reset(nullptr); + } + +#if defined(STRUMPACK_USE_MPI) + template void + FrontalMatrixGPUSymmetricPositiveDefinite::extend_add_copy_to_buffers + (std::vector>& sbuf, + const FrontalMatrixMPI* pa) const { + ExtendAdd::extend_add_seq_copy_to_buffers + (F22_, sbuf, pa, this); + } +#endif + + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::extend_add_to_dense + (DenseM_t& paF11, DenseM_t& paF21, DenseM_t& paF22, + const F_t* p, int task_depth) { + const std::size_t pdsep = paF11.rows(); + const std::size_t dupd = dim_upd(); + std::size_t upd2sep; + auto I = this->upd_to_parent(p, upd2sep); +#if defined(STRUMPACK_USE_OPENMP_TASKLOOP) +#pragma omp taskloop default(shared) grainsize(64) \ + 
if(task_depth < params::task_recursion_cutoff_level) +#endif + for (std::size_t c=0; c()?2:1) * dupd * dupd); + STRUMPACK_FULL_RANK_FLOPS((is_complex()?2:1) * dupd * dupd); + release_work_memory(); + } + + template + std::size_t peak_device_memory + (const std::vector>& ldata) { + std::size_t peak_dmem = 0; + for (std::size_t l=0; l void + FrontalMatrixGPUSymmetricPositiveDefinite::front_assembly + (const SpMat_t& A, LInfo_t& L, char* hea_mem, char* dea_mem) { + using Trip_t = Triplet; + auto N = L.f.size(); + auto hasmbl = gpu::aligned_ptr>(hea_mem); + auto Iptr = gpu::aligned_ptr(hasmbl + N); + auto e11 = gpu::aligned_ptr(Iptr + L.Isize.back()); + auto e21 = e11 + L.elems11.back(); + auto dasmbl = gpu::aligned_ptr>(dea_mem); + auto dIptr = gpu::aligned_ptr(dasmbl + N); + auto de11 = gpu::aligned_ptr(dIptr + L.Isize.back()); + auto de21 = de11 + L.elems11.back(); + +#pragma omp parallel for + for (std::size_t n=0; n + (f.dim_sep(), f.dim_upd(), f.F11_.data(), nullptr, + f.F21_.data(), f.F22_.data(), + L.elems11[n+1]-L.elems11[n], 0, + L.elems21[n+1]-L.elems21[n], + de11+L.elems11[n], nullptr, de21+L.elems21[n]); + auto fIptr = Iptr + L.Isize[n]; + auto fdIptr = dIptr + L.Isize[n]; + if (f.lchild_) { + auto c = dynamic_cast(f.lchild_.get()); + hasmbl[n].set_ext_add_left(c->dim_upd(), c->F22_.data(), fdIptr); + c->upd_to_parent(&f, fIptr); + fIptr += c->dim_upd(); + fdIptr += c->dim_upd(); + } + if (f.rchild_) { + auto c = dynamic_cast(f.rchild_.get()); + hasmbl[n].set_ext_add_right(c->dim_upd(), c->F22_.data(), fdIptr); + c->upd_to_parent(&f, fIptr); + } + } + gpu::copy_host_to_device(dea_mem, hea_mem, L.ea_bytes); + gpu::assemble_symmetric(N, hasmbl, dasmbl); + } + + // TODO(Jie): fix for un-symmetric + template long long + FrontalMatrixGPUSymmetricPositiveDefinite::dense_node_factor_nonzeros() const { + long long dsep = dim_sep(), dupd = dim_upd(); + return dsep * (dsep + dupd); + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::factor_small_fronts + (LInfo_t& L, gpu::FrontData* fdata, int* dinfo, + const SPOptions& opts) { + if (!L.small_fronts) return; + for (std::size_t n=0, n8=0, n16=L.N8, n24=n16+L.N16, n32=n24+L.N24; + n + t(dsep, f.dim_upd(), f.F11_.data(), nullptr, + f.F21_.data(), f.F22_.data(), f.piv_); + if (dsep <= 8) fdata[n8++] = t; + else if (dsep <= 16) fdata[n16++] = t; + else if (dsep <= 24) fdata[n24++] = t; + else fdata[n32++] = t; + } + gpu::copy_host_to_device(L.f8, fdata, L.small_fronts); + auto replace = opts.replace_tiny_pivots(); + auto thresh = opts.pivot_threshold(); + gpu::factor_symmetric_block_batch(L.N8, L.f8, replace, thresh, dinfo); + gpu::factor_symmetric_block_batch(L.N16, L.f16, replace, thresh, dinfo+L.N8); + gpu::factor_symmetric_block_batch(L.N24, L.f24, replace, thresh, dinfo+L.N8+L.N16); + gpu::factor_symmetric_block_batch(L.N32, L.f32, replace, thresh, dinfo+L.N8+L.N16+L.N24); + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::split_smaller + (const SpMat_t& A, const SPOptions& opts, + int etree_level, int task_depth) { + if (opts.verbose()) + std::cout << "# Factorization does not fit in GPU memory, " + "splitting in smaller traversals." 
<< std::endl; + const std::size_t dupd = dim_upd(), dsep = dim_sep(); + ReturnCode err_code = ReturnCode::SUCCESS; + if (lchild_) { + auto el = lchild_->multifrontal_factorization_symmetric + (A, opts, etree_level+1, task_depth); + if (el != ReturnCode::SUCCESS) err_code = el; + } + if (rchild_) { + auto er = rchild_->multifrontal_factorization_symmetric + (A, opts, etree_level+1, task_depth); + if (er != ReturnCode::SUCCESS) err_code = er; + } + STRUMPACK_ADD_MEMORY(dsep*(dsep+dupd)*sizeof(scalar_t)); + STRUMPACK_ADD_MEMORY(dupd*dupd*sizeof(scalar_t)); + host_factors_diagonal_.reset(new scalar_t[dsep*dsep]); + host_factors_off_diagonal_.reset(new scalar_t[dsep * dupd]); + host_Schur_.reset(new scalar_t[dupd*dupd]); + { + auto fmem_diagonal = host_factors_diagonal_.get(); + auto fmem_off_diagonal = host_factors_off_diagonal_.get(); + F11_ = DenseMW_t(dsep, dsep, fmem_diagonal, dsep); + F21_ = DenseMW_t(dupd, dsep, fmem_off_diagonal, dupd); + } + F22_ = DenseMW_t(dupd, dupd, host_Schur_.get(), dupd); + F11_.zero(); + F21_.zero(); F22_.zero(); + A.extract_front + (F11_, F12_, F21_, this->sep_begin_, this->sep_end_, + this->upd_, task_depth); + if (lchild_) { +#pragma omp parallel +#pragma omp single + lchild_->extend_add_to_dense(F11_, F21_, F22_, this, 0); + } + if (rchild_) { +#pragma omp parallel +#pragma omp single + rchild_->extend_add_to_dense(F11_, F21_, F22_, this, 0); + } + // TaskTimer tl(""); + // tl.start(); + if (dsep) { + gpu::Handle sh; + auto workSize = gpu::potrf_buffersize(sh, UpLo::L, dsep); + gpu::DeviceMemory dm11(dsep*dsep + workSize); + gpu::DeviceMemory dpiv(dsep+1); // and ierr + DenseMW_t dF11(dsep, dsep, dm11, dsep); + gpu::copy_host_to_device(dF11, F11_); + gpu::potrf(sh, UpLo::L, dF11, dm11 + dsep*dsep, workSize, dpiv+dsep); + if (opts.replace_tiny_pivots()) + gpu::replace_pivots + (F11_.rows(), dF11.data(), opts.pivot_threshold()); + int info; + gpu::copy_device_to_host(&info, dpiv+dsep, 1); + if (info) err_code = ReturnCode::ZERO_PIVOT; + pivot_mem_.resize(dsep); + piv_ = pivot_mem_.data(); + gpu::copy_device_to_host(piv_, dpiv.as(), dsep); + gpu::copy_device_to_host(F11_, dF11); + if (dupd) { + gpu::Handle bh; + gpu::DeviceMemory dm21(dsep*dupd); + DenseMW_t dF21(dupd, dsep, dm21, dupd); + gpu::copy_host_to_device(dF21, F21_); + gpu::trsm(bh, Side::R, UpLo::L, Trans::T, Diag::N, scalar_t(1.), dF11, dF21); + gpu::copy_device_to_host(F21_, dF21); + dm11.release(); + gpu::DeviceMemory dm22((dsep+dupd)*dupd); + DenseMW_t dF22(dupd, dupd, dm22, dupd); + gpu::copy_host_to_device(dF22, host_Schur_.get()); + gpu::syrk(bh, UpLo::L, Trans::N, + scalar_t(-1.), dF21, scalar_t(1.), dF22); + gpu::copy_device_to_host(host_Schur_.get(), dF22); + } + } + // count flops + auto level_flops = LU_flops(F11_) + + gemm_flops(Trans::N, Trans::N, scalar_t(-1.), F21_, F12_, scalar_t(1.)) + + trsm_flops(Side::L, scalar_t(1.), F11_, F21_); + STRUMPACK_FULL_RANK_FLOPS(level_flops); + // if (opts.verbose()) { + // auto level_time = tl.elapsed(); + // std::cout << "# GPU Factorization complete, took: " + // << level_time << " seconds, " + // << level_flops / 1.e9 << " GFLOPS, " + // << (float(level_flops) / level_time) / 1.e9 + // << " GFLOP/s" << std::endl; + // } + return err_code; + } + + template + ReturnCode FrontalMatrixGPUSymmetricPositiveDefinite::multifrontal_factorization_symmetric( + const SpMat_t& A, + const SPOptions& opts, + int etree_level, int task_depth) { + ReturnCode err_code = ReturnCode::SUCCESS; + const int max_streams = opts.gpu_streams(); + std::vector 
handles(max_streams); + const int lvls = this->levels(); + std::vector ldata(lvls); + { + std::vector> fp(lvls); + try { + this->get_level_fronts_gpu(fp); + } catch (...) { + return split_smaller(A, opts, etree_level, task_depth); + } + for (int l=lvls-1; l>=0; l--) + ldata[l] = LInfo_t(fp[l], handles[0], max_streams, &A); + } + auto peak_dmem = peak_device_memory(ldata); + if (peak_dmem >= 0.9 * gpu::available_memory()) + return split_smaller(A, opts, etree_level, task_depth); + + std::vector streams(max_streams); + gpu::Stream copy_stream; + for (int i=0; i=0; l--) { + auto& L = ldata[l]; + max_small_fronts = std::max(max_small_fronts, L.N8+L.N16+L.N24+L.N32); + for (auto& f : L.f) { + const std::size_t dsep = f->dim_sep(); + const std::size_t dupd = f->dim_upd(); + std::size_t fs = dsep*(dsep + dupd); + max_pinned = std::max(max_pinned, fs); + } + max_pinned = std::max(max_pinned, L.factors_small); + } + gpu::HostMemory pinned(2*max_pinned); + gpu::HostMemory> fdata(max_small_fronts); + std::size_t peak_hea_mem = 0; + for (int l=lvls-1; l>=0; l--) + peak_hea_mem = std::max(peak_hea_mem, ldata[l].ea_bytes); + gpu::HostMemory hea_mem(peak_hea_mem); + gpu::DeviceMemory all_dmem(peak_dmem); + if (opts.verbose()) { + std::size_t factor_bytes = 0, ea_bytes = 0, work_bytes = 0; + for (auto &l: ldata) { + factor_bytes = std::max(l.factor_bytes, factor_bytes); + ea_bytes = std::max(l.ea_bytes, ea_bytes); + work_bytes = std::max(l.work_bytes, work_bytes); + } + + std::cout << "# - working space memory (host) = " << double(peak_hea_mem) / 1024 / 1024 << " MB" + << std::endl; + printf("# - working space memory (device) = factor + ea + working = %f MB + %f MB + %f MB = %f MB\n", + double(factor_bytes) / 1024 / 1024, double(ea_bytes) / 1024 / 1024, double(work_bytes) / 1024 / 1024, + double(peak_dmem) / 1024 / 1024); + std::cout << "# - working space memory (pinned) = " + << double(2 * max_pinned) * sizeof(scalar_t) / 1024 / 1024 << " MB" << std::endl; + } + char* old_work = nullptr; + for (int l=lvls-1; l>=0; l--) { + // TaskTimer tl(""); + // tl.start(); + LInfo_t& L = ldata[l]; + // if (opts.verbose()) L.print_info(l, lvls); + try { + char *work_mem = nullptr, *dea_mem = nullptr; + scalar_t* dev_factors = nullptr; + if (l % 2) { + work_mem = all_dmem; + dea_mem = work_mem + L.work_bytes; + dev_factors = gpu::aligned_ptr(dea_mem + L.ea_bytes); + } else { + work_mem = all_dmem + peak_dmem - L.work_bytes; + dea_mem = work_mem - L.ea_bytes; + dev_factors = gpu::aligned_ptr(dea_mem - L.factor_bytes); + } + gpu::memset(work_mem, 0, L.Schur_size); + gpu::memset(dev_factors, 0, L.factor_size); +// L.set_factor_pointers(dev_factors); + L.set_factor_pointers(dev_factors, dev_factors + L.diagonal_size); + L.set_work_pointers(work_mem, max_streams); + old_work = work_mem; + + // default stream + gpu_check(cudaDeviceSynchronize()); + front_assembly(A, L, hea_mem, dea_mem); + gpu::Event e_assemble; + e_assemble.record(); + gpu_check(cudaDeviceSynchronize()); + + // default stream + factor_small_fronts(L, fdata, L.dev_getrf_err, opts); + gpu::Event e_small; + e_small.record(); + + for (auto& s : streams) + e_assemble.wait(s); + + // larger fronts in multiple streams. 
Copy back in nchunks + // chunks, but a single chunk cannot be larger than the pinned + // buffer + const int nchunks = 16; + std::size_t Bf = (L.f.size()-L.small_fronts + nchunks - 1) / nchunks; + std::vector chunks; +// std::vector factors_chunk; + std::vector factors_diagonal_chunk, factors_off_diagonal_chunk; + for (std::size_t n=L.small_fronts, fc=0, c=0, fdc = 0, fodc = 0; n max_pinned) { + chunks.push_back(c); +// factors_chunk.push_back(fc); + factors_diagonal_chunk.push_back(fdc); + factors_off_diagonal_chunk.push_back(fodc); + c = fc = 0; + fdc = fodc = 0; + } + c++; + fc += size_front; + fdc += size_factors_diagonal; + fodc += size_factors_off_diagonal; + if (n == L.f.size()-1) { // final chunk + chunks.push_back(c); +// factors_chunk.push_back(fc); + factors_diagonal_chunk.push_back(fdc); + factors_off_diagonal_chunk.push_back(fodc); + } + } + + e_small.wait(copy_stream); +// gpu::copy_device_to_host_async +// (pinned, dev_factors, L.factors_small, copy_stream)); + gpu::copy_device_to_host_async + (pinned, dev_factors, L.factors_diagonal_small, copy_stream); + gpu::copy_device_to_host_async + (pinned + L.factors_diagonal_small, + dev_factors + L.diagonal_size, + L.factors_off_diagonal_small, copy_stream); + + STRUMPACK_ADD_MEMORY(L.factor_bytes); +// L.f[0]->host_factors_.reset(new scalar_t[L.factor_size]); +// scalar_t* host_factors = L.f[0]->host_factors_.get(); + L.f[0]->host_factors_diagonal_.reset(new scalar_t[L.diagonal_size]); + L.f[0]->host_factors_off_diagonal_.reset(new scalar_t[L.off_diagonal_size]); + scalar_t* host_factors_diagonal = L.f[0]->host_factors_diagonal_.get(); + scalar_t* host_factors_off_diagonal = L.f[0]->host_factors_off_diagonal_.get(); + copy_stream.synchronize(); +//#pragma omp parallel for +// for (std::size_t i=0; i(), + pinned.template as() + max_pinned}; + std::vector events(chunks.size()); + + for (std::size_t c=0, n=L.small_fronts; c +// (pin[c % 2], f.F11_.data(), +// factors_chunk[c], copy_stream); + auto fdc = factors_diagonal_chunk[c]; + auto fodc = factors_off_diagonal_chunk[c]; + gpu::copy_device_to_host_async + (pin[c % 2], f.F11_.data(), + fdc, copy_stream); + gpu::copy_device_to_host_async + (pin[c % 2] + fdc, f.F21_.data(), + fodc, copy_stream); + } + } + } + copy_stream.synchronize(); +// auto fc = factors_chunk.back(); +//#pragma omp parallel for +// for (std::size_t i=0; ipivot_mem_.resize(L.piv_size); + copy_stream.synchronize(); +// gpu::copy_device_to_host +// (L.f[0]->pivot_mem_.data(), L.f[0]->piv_, L.piv_size); +// L.set_factor_pointers(L.f[0]->host_factors_.get(); + L.set_factor_pointers(L.f[0]->host_factors_diagonal_.get(), L.f[0]->host_factors_off_diagonal_.get()); +// L.set_pivot_pointers(L.f[0]->pivot_mem_.data()); + + std::vector getrf_infos(L.f.size()); + gpu::copy_device_to_host + (getrf_infos.data(), L.dev_getrf_err, L.f.size()); + for (auto ierr : getrf_infos) + if (ierr) { + err_code = ReturnCode::ZERO_PIVOT; + break; + } + } catch (const std::bad_alloc& e) { + std::cerr << "Out of memory" << std::endl; + abort(); + } + long long level_flops, small_flops; + L.flops(level_flops, small_flops); + STRUMPACK_FULL_RANK_FLOPS(level_flops); + STRUMPACK_FLOPS(small_flops); + // if (opts.verbose()) { + // auto level_time = tl.elapsed(); + // std::cout << "# GPU Factorization complete, took: " + // << level_time << " seconds, " + // << level_flops / 1.e9 << " GFLOPS, " + // << (float(level_flops) / level_time) / 1.e9 + // << " GFLOP/s" << std::endl; + // } + } + const std::size_t dupd = dim_upd(); + if (dupd) { // get the 
contribution block from the device + host_Schur_.reset(new scalar_t[dupd*dupd]); + gpu::copy_device_to_host + (host_Schur_.get(), + reinterpret_cast(old_work), dupd*dupd); + F22_ = DenseMW_t(dupd, dupd, host_Schur_.get(), dupd); + } + return err_code; + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::multifrontal_factorization + (const SpMat_t& A, const SPOptions& opts, + int etree_level, int task_depth) { + if (!A.symm_sparse()) { + std::cerr << "The Matrix is not symmetric, please unable_symmetric in option settings" << std::endl; + exit(EXIT_FAILURE); // stop + }else{ + return multifrontal_factorization_symmetric(A, opts, etree_level, task_depth); + } + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::fwd_solve_phase2 + (DenseM_t& b, DenseM_t& bupd, int etree_level, int task_depth) const { + if (dim_sep()) { + DenseMW_t bloc(dim_sep(), b.cols(), b, this->sep_begin_, 0); + trsv(UpLo::L, Trans::N, Diag::N, F11_, bloc, task_depth); +// F11_.solve_LU_in_place(bloc, piv_, task_depth); + if (dim_upd()) { + if (b.cols() == 1) + gemv(Trans::N, scalar_t(-1.), F21_, bloc, + scalar_t(1.), bupd, task_depth); + else + gemm(Trans::N, Trans::N, scalar_t(-1.), F21_, bloc, + scalar_t(1.), bupd, task_depth); + } + } + } + + template void + FrontalMatrixGPUSymmetricPositiveDefinite::bwd_solve_phase1 + (DenseM_t& y, DenseM_t& yupd, int etree_level, int task_depth) const { + if (dim_sep()) { + DenseMW_t yloc(dim_sep(), y.cols(), y, this->sep_begin_, 0); + if (y.cols() == 1) { + if (dim_upd()) + gemv(Trans::T, scalar_t(-1.), F21_, yupd, + scalar_t(1.), yloc, task_depth); + } else { + if (dim_upd()) + gemm(Trans::T, Trans::N, scalar_t(-1.), F21_, yupd, + scalar_t(1.), yloc, task_depth); + } + trsv(UpLo::L, Trans::T, Diag::N, F11_, yloc, params::task_recursion_cutoff_level); + } + } + + template ReturnCode + FrontalMatrixGPUSymmetricPositiveDefinite::node_inertia + (integer_t& neg, integer_t& zero, integer_t& pos) const { + using real_t = typename RealType::value_type; + for (std::size_t i=0; i real_t(0.)) pos++; + else if (absFii < real_t(0.)) neg++; + else if (absFii == real_t(0.)) zero++; + else std::cerr << "F(" << i << "," << i << ")=" << F11_(i,i) << std::endl; + } + return ReturnCode::SUCCESS; + } + + template + class MatrixWrapperForSparseF11 : public DenseMatrix { + public: + /** + * Default constructor. Creates an empty, 0x0 matrix. + */ + MatrixWrapperForSparseF11() : DenseMatrix() {} + struct { + size_t rowTotal, colTotal, nnz; + scalar_t *value{nullptr}; + size_t *innerIndex{nullptr}, *outerIndex{nullptr}; + } sparseF11; + + /** + * Constructor. Create an m x n matrix wrapper using already + * allocated memory, pointed to by D, with leading dimension ld. + * + * \param m number of rows of the new (sub) matrix + * \param n number of columns of the new matrix + * \param D pointer to memory representing matrix, this should + * point to at least ld*n bytes of allocated memory + * \param ld leading dimension of matrix allocated at D. ld >= m + */ + MatrixWrapperForSparseF11(std::size_t m, std::size_t n, + scalar_t* D, std::size_t ld) { + this->data_ = D; this->rows_ = m; this->cols_ = n; + this->ld_ = std::max(std::size_t(1), ld); + } + + /** + * Constructor. Create a DenseMatrixWrapper as a submatrix of size + * m x n, of a DenseMatrix (or DenseMatrixWrapper) D, at position + * i,j in D. The constructed DenseMatrixWrapper will be the + * submatrix D(i:i+m,j:j+n). 
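     * For example (a sketch, the names are illustrative only):
     *   DenseMatrix<double> D(8, 8);
     *   MatrixWrapperForSparseF11<double> W(3, 3, D, 2, 2);
     * creates a 3x3 view of the block D(2:5,2:5) without copying any data.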
+ * + * \param m number of rows of the new (sub) matrix + * \param n number of columns of the new matrix + * \param D matrix from which to take a submatrix + * \param i row offset in D of the top left corner of the submatrix + * \param j columns offset in D of the top left corner of the + * submatrix + */ + MatrixWrapperForSparseF11(std::size_t m, std::size_t n, DenseMatrix& D, + std::size_t i, std::size_t j) + : MatrixWrapperForSparseF11(m, n, &D(i, j), D.ld()) { + assert(i+m <= D.rows()); + assert(j+n <= D.cols()); + } + + /** + * Virtual destructor. Since a DenseMatrixWrapper does not + * actually own it's memory, put just keeps a pointer, this will + * not free any memory. + */ + virtual ~MatrixWrapperForSparseF11() { this->data_ = nullptr; } + + /** + * Clear the MatrixWrapperForSparseF11. Ie, set to an empty matrix. This + * will not affect the original matrix, to which this is a + * wrapper, only the wrapper itself is reset. No memory is + * released. + */ + void clear() override { + this->rows_ = 0; this->cols_ = 0; + this->ld_ = 1; this->data_ = nullptr; + } + + /** + * Return the amount of memory taken by this wrapper, ie, + * 0. (since the wrapper itself does not own the memory). The + * memory will likely be owned by a DenseMatrix, while this + * MatrixWrapperForSparseF11 is just a submatrix of that existing + * matrix. Returning 0 here avoids counting memory double. + * + * \see nonzeros + */ + std::size_t memory() const override { return 0; } + + /** + * Return the number of nonzeros taken by this wrapper, ie, + * 0. (since the wrapper itself does not own the memory). The + * memory will likely be owned by a DenseMatrix, while this + * MatrixWrapperForSparseF11 is just a submatrix of that existing + * matrix. Returning 0 here avoids counting nonzeros double. + * + * \see memory + */ + std::size_t nonzeros() const override { return 0; } + + /** + * Default copy constructor, from another DenseMatrixWrapper. + */ + MatrixWrapperForSparseF11(const MatrixWrapperForSparseF11&) = default; + + /** + * Constructing a MatrixWrapperForSparseF11 from a MatrixWrapperForSparseF11 is + * not allowed. + * TODO Why not??!! just delegate to MatrixWrapperForSparseF11(m, n, D, i, j)?? + */ + MatrixWrapperForSparseF11(const DenseMatrix&) = delete; + + /** + * Default move constructor. + */ + MatrixWrapperForSparseF11(MatrixWrapperForSparseF11&&) = default; + + /** + * Moving from a DenseMatrix is not allowed. + */ + MatrixWrapperForSparseF11(DenseMatrix&&) = delete; + + // /** + // * Assignment operator. Shallow copy only. This only copies the + // * wrapper object. Does not copy matrix elements. + // * + // * \param D matrix wrapper to copy from, this will be duplicated + // */ + // MatrixWrapperForSparseF11& + // operator=(const MatrixWrapperForSparseF11& D) { + // this->data_ = D.data(); + // this->rows_ = D.rows(); + // this->cols_ = D.cols(); + // this->ld_ = D.ld(); + // return *this; + // } + + /** + * Move assignment. This moves only the wrapper. + * + * \param D matrix wrapper to move from. This will not be + * modified. + */ + MatrixWrapperForSparseF11& + operator=(MatrixWrapperForSparseF11&& D) { + this->data_ = D.data(); this->rows_ = D.rows(); + this->cols_ = D.cols(); this->ld_ = D.ld(); return *this; } + + /** + * Assignment operator, from a DenseMatrix. Assign the memory of + * the DenseMatrix to the matrix wrapped by this + * MatrixWrapperForSparseF11 object. 
+ * + * \param a matrix to copy from, should be a.rows() == + * this->rows() and a.cols() == this->cols() + */ + MatrixWrapperForSparseF11& + operator=(const DenseMatrix& a) override { + assert(a.rows()==this->rows() && a.cols()==this->cols()); + for (std::size_t j=0; jcols(); j++) + for (std::size_t i=0; irows(); i++) + this->operator()(i, j) = a(i, j); + return *this; + } + }; + + // explicit template instantiations + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,int>; + + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long int>; + + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long long int>; + template class FrontalMatrixGPUSymmetricPositiveDefinite,long long int>; + +} // end namespace strumpack diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu new file mode 100644 index 00000000..3007867a --- /dev/null +++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.cu @@ -0,0 +1,728 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#define STRUMPACK_NO_TRIPLET_MPI +#include "FrontalMatrixGPUKernels.hpp" +#include "dense/CUDAWrapper.hpp" +#include "dense/GPUWrapper.hpp" + +#include +#include +#include + + +namespace strumpack { + namespace gpu { + + /** + * Get the real T type corresponding to a scalar, for instance T, + * std::complex or thrust::complex, to be used for instance + * to compute norms or absolute value. 
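     * A kernel templated on scalar_t typically pulls in the matching types
     * as, for example:
     *   using real_t = typename real_type<scalar_t>::value_type;
     *   using cuda_t = typename cuda_type<scalar_t>::value_type;
     * so that complex arithmetic on the device goes through thrust::complex
     * (see the cuda_type and primitive_type helpers declared below).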
+ */ + template struct real_type { typedef T value_type; }; + template struct real_type> { typedef T value_type; }; + template struct real_type> { typedef T value_type; }; + + /** + * The types float2 and double2 are binary the same as + * std::complex or thrust::complex, but they can be used as + * __shared__ variables, whereas thrust::complex cannot because it + * doesn't have a no-argument default constructor. + */ + template struct primitive_type { typedef T value_type; }; + template<> struct primitive_type> { typedef float2 value_type; }; + template<> struct primitive_type> { typedef double2 value_type; }; + template<> struct primitive_type> { typedef float2 value_type; }; + template<> struct primitive_type> { typedef double2 value_type; }; + + /** + * Get the corresponding thrust::complex for std::complex + */ + template struct cuda_type { typedef T value_type; }; + template struct cuda_type> { typedef thrust::complex value_type; }; + + __device__ float inline real_part(float& a) { return a; } + __device__ double inline real_part(double& a) { return a; } + __device__ float inline real_part(thrust::complex& a) { return a.real(); } + __device__ double inline real_part(thrust::complex& a) { return a.real(); } + + __device__ float inline absolute_value(float& a) { return fabsf(a); } + __device__ double inline absolute_value(double& a) { return fabs(a); } + __device__ float inline absolute_value(thrust::complex& a) { return thrust::abs(a); } + __device__ double inline absolute_value(thrust::complex& a) { return thrust::abs(a); } + + + /** + * Put elements of the sparse matrix in the F11, F12 and F21 parts + * of the front. The sparse elements are taken from F.e11, F.e12, + * F.e21, which are lists of triplets {r,c,v}. The front is + * assumed to be initialized to zero. + * + */ + template __global__ void + assemble_kernel(unsigned int nf, AssembleData* dat) { + int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x, + op = blockIdx.y * blockDim.y + threadIdx.y; + if (op >= nf) return; + auto& F = dat[op]; + for (int i=0, j=idx; i= F.n11) break; + auto& t = F.e11[j]; + F.F11[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n12) break; + auto& t = F.e12[j]; + F.F12[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n21) break; + auto& t = F.e21[j]; + F.F21[t.r + t.c*F.d2] = t.v; + } + } + + template __global__ void + assemble_symmetric_kernel(unsigned int nf, AssembleData* dat) { + int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x, + op = blockIdx.y * blockDim.y + threadIdx.y; + if (op >= nf) return; + auto& F = dat[op]; + for (int i=0, j=idx; i= F.n11) break; + auto& t = F.e11[j]; + F.F11[t.r + t.c*F.d1] = t.v; + } + for (int i=0, j=idx; i= F.n21) break; + auto& t = F.e21[j]; + F.F21[t.r + t.c*F.d2] = t.v; + } + } + + /** + * Single extend-add operation from one contribution block into + * the parent front. d1 is the size of F11, d2 is the size of F22. + */ + template + __global__ void extend_add_kernel + (unsigned int by0, unsigned int nf, AssembleData* dat) { + int y = blockIdx.x * blockDim.x + threadIdx.x, + x0 = (blockIdx.y + by0) * unroll, + z = blockIdx.z * blockDim.z + threadIdx.z; + if (z >= nf) return; + auto& f = dat[z]; + auto CB = left ? f.CB1 : f.CB2; + if (!CB) return; + auto dCB = left ? f.dCB1 : f.dCB2; + if (y >= dCB) return; + auto I = left ? 
f.I1 : f.I2; + auto Iy = I[y]; + CB += y + x0*dCB; + int d1 = f.d1, d2 = f.d2; + int ld; + T* F[2]; + if (Iy < d1) { + ld = d1; + F[0] = f.F11+Iy; + F[1] = f.F12+Iy-d1*d1; + } else { + ld = d2; + F[0] = f.F21+Iy-d1; + F[1] = f.F22+Iy-d1-d1*d2; + } +#pragma unroll + for (int i=0; i= dCB) break; + auto Ix = I[x]; + F[Ix >= d1][Ix*ld] += CB[i*dCB]; + } + } + + template + __global__ void extend_add_symmetric_kernel + (unsigned int by0, unsigned int nf, AssembleData* dat) { + int y = blockIdx.x * blockDim.x + threadIdx.x, + x0 = (blockIdx.y + by0) * unroll, + z = blockIdx.z * blockDim.z + threadIdx.z; + if (z >= nf) return; + auto& f = dat[z]; + auto CB = left ? f.CB1 : f.CB2; + if (!CB) return; + auto dCB = left ? f.dCB1 : f.dCB2; + if (y >= dCB) return; + auto I = left ? f.I1 : f.I2; + auto Iy = I[y]; + int d1 = f.d1, d2 = f.d2; + int ld; + T* F[2]; + if (Iy < d1) { + ld = d1; + F[0] = f.F11+Iy; + F[1] = nullptr; + } else { + ld = d2; + F[0] = f.F21+Iy-d1; + F[1] = f.F22+Iy-d1-d1*d2; + } +#pragma unroll + for (int i=0; i= dCB) break; + auto Ix = I[x]; + F[Ix >= d1][Ix*ld] += CB[row + col*dCB]; + } + } + + template void + assemble(unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + { // front assembly from sparse matrix + std::size_t nnz = 0; + for (unsigned int f=0; f nnz && nt > 8 && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (nnz + nt*unroll - 1) / (nt*unroll), + nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + for (unsigned int f=0; f<<>>(nf-f*ops, ddat+f*ops); + } + } + } + gpu_check(cudaPeekAtLastError()); + { // extend-add + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nbx = (du + nt - 1) / nt, + nby = (du + unroll - 1) / unroll, + nbf = (nf + ops - 1) / ops; + dim3 block(nt, 1, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int y=0; y<<>> + (y, nf-f*ops, dat_+f*ops); + extend_add_kernel<<>> + (y, nf-f*ops, dat_+f*ops); + } + } + } + } + gpu_check(cudaPeekAtLastError()); + } + + template void + assemble_symmetric(unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + { // front assembly from sparse matrix + std::size_t nnz = 0; + for (unsigned int f=0; f nnz && nt > 8 && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (nnz + nt*unroll - 1) / (nt*unroll), + nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + for (unsigned int f=0; f<<>>(nf-f*ops, ddat+f*ops); + } + } + } + gpu_check(cudaPeekAtLastError()); + { // extend-add + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nbx = (du + nt - 1) / nt, + nby = (du + unroll - 1) / unroll, + nbf = (nf + ops - 1) / ops; + dim3 block(nt, 1, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int y=0; y<<>> + (y, nf-f*ops, dat_+f*ops); + extend_add_symmetric_kernel<<>> + (y, nf-f*ops, dat_+f*ops); + } + } + } + } + gpu_check(cudaPeekAtLastError()); + } + + + // /** + // * This only works if value >= 0. + // * It's assuming two's complement for the int. + // * __float_as_int is like reinterpret_cast(value) + // */ + // __device__ __forceinline__ void atomicAbsMax(float* data, float value) { + // atomicMax((int *)data, __float_as_int(value)); + // } + // __device__ __forceinline__ void atomicAbsMax(double* addr, double value) { + // // why does this not compile? 
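    // // Likely because the 64-bit overloads of atomicMax() are only declared
    // // when compiling for compute capability 3.5 or higher; note also that
    // // reinterpreting an IEEE double as a signed integer is only
    // // order-preserving for non-negative values. A portable (if slower)
    // // alternative would be an atomicCAS loop, roughly (a sketch, not used
    // // by this code):
    // //   __device__ __forceinline__ void atomicAbsMax(double* addr, double value) {
    // //     auto a = reinterpret_cast<unsigned long long int*>(addr);
    // //     unsigned long long int old = *a, assumed;
    // //     do {
    // //       assumed = old;
    // //       if (__longlong_as_double(assumed) >= value) break;
    // //       old = atomicCAS(a, assumed,
    // //                       (unsigned long long int)__double_as_longlong(value));
    // //     } while (assumed != old);
    // //   }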
+ // atomicMax((long long int *)addr, __double_as_longlong(value)); + // } + + + /** + * LU with row pivoting, with a single NTxNT thread block. The + * matrix size n must be less than NT. + * + * This is a naive implementation. The goal here is to reduce + * kernel launch overhead by batching many small LU + * factorizations. + * + * Use thrust::complex instead of std::complex. + */ + template __device__ void + LLT_block_kernel(int n, T* F, int* info) { + using cuda_primitive_t = typename primitive_type::value_type; + using real_t = typename real_type::value_type; + __shared__ cuda_primitive_t M_[NT*NT]; + T* M = reinterpret_cast(M_); + int j = threadIdx.x, i = threadIdx.y; + *info = 0; + + // copy F from global device storage into shared memory + if (i < n && j < n) + M[i+j*NT] = F[i+j*n]; + __syncthreads(); + + for (int k=0; k= k && i < n) + M[i+k*NT] /= diagonal; + __syncthreads(); + // Schur update + if (j > k && i > k && j < n && i < n) + M[i+j*NT] -= M[i+k*NT] * M[j+k*NT]; + __syncthreads(); + } + // write back from shared to global device memory + if (i < n && j < n) + F[i+j*n] = M[i+j*NT]; + } + + template __global__ void + LLT_block_kernel_batched(FrontData* dat, bool replace, + real_t thresh, int* dinfo) { + FrontData& A = dat[blockIdx.x]; + LLT_block_kernel(A.n1, A.F11, &dinfo[blockIdx.x]); + if (replace) { + int i = threadIdx.x, j = threadIdx.y; + if (i == j && i < A.n1) { + std::size_t k = i + i*A.n1; + if (absolute_value(A.F11[k]) < thresh) + A.F11[k] = (real_part(A.F11[k]) < 0) ? -thresh : thresh; + } + } + } + + template __global__ void + replace_pivots_kernel(int n, T* A, real_t thresh) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + std::size_t k = i + i*n; + if (absolute_value(A[k]) < thresh) + A[k] = (real_part(A[k]) < 0) ? -thresh : thresh; + } + } + + template + void replace_pivots(int n, T* A, real_t thresh, gpu::Stream* s) { + if (!n) return; + using T_ = typename cuda_type::value_type; + int NT = 128; + if (s) + replace_pivots_kernel<<<(n+NT-1)/NT, NT, 0, get_cuda_stream(*s)>>> + (n, (T_*)(A), thresh); + else + replace_pivots_kernel<<<(n+NT-1)/NT, NT>>> + (n, (T_*)(A), thresh); + gpu_check(cudaPeekAtLastError()); + } + + template __global__ void + replace_pivots_vbatch_kernel(int* dn, T** dA, int* lddA, real_t thresh, + unsigned int batchCount) { + int i = blockIdx.x * blockDim.x + threadIdx.x, + f = blockIdx.y * blockDim.y + threadIdx.y; + if (f >= batchCount) return; + if (i >= dn[f]) return; + auto A = dA[f]; + auto ldA = lddA[f]; + std::size_t ii = i + i*ldA; + if (absolute_value(A[ii]) < thresh) + A[ii] = (real_part(A[ii]) < 0) ? -thresh : thresh; + } + + /** + * LU solve with matrix F factor in LU, with pivot vector piv. F + * is n x n, and n <= NT. X is the right hand side, and is n x + * m. Both F and X have leading dimension n. + * + * NTxNT is the dimension of the thread block. + * + * This doesn't work for T = std::complex, use + * T=thrust::complex instead. 
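+   *
+   * (The symmetric variant below works on the Cholesky factor, so no
+   * pivot vector is applied during the solve.)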
+ */ + template __device__ void + solve_symmetric_block_kernel(int n, int m, T* F, T* X) { + using primitive_t = typename primitive_type::value_type; + __shared__ primitive_t A_[NT*NT], B_[NT*NT]; + T *B = reinterpret_cast(B_), *A = reinterpret_cast(A_); + int i = threadIdx.x, j = threadIdx.y; + // put matrix F in shared memory + if (i < n && j < n) + A[i+j*NT] = F[i+j*n]; + __syncthreads(); + + // loop over blocks of NT columns of X + for (int b=0; b k && i < n && c < m) + B[i + j * NT] -= A[i + k * NT] * B[k + j * NT]; + __syncthreads(); + } + + // write from shared back to global device memory + if (i < n && c < m) + X[c+i*m] = B[i+j*NT]; + } + } + + template __global__ void + solve_symmetric_block_kernel_batched(FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + solve_symmetric_block_kernel(A.n1, A.n2, A.F11, A.F21); + } + + + /** + * Compute F -= F21 * F12, where F is d2 x d2 and F12 is d1 x d2. + * d1 is <= NT. This should be called with a single NT x NT thread + * block. + */ + template __device__ void + Schur_symmetric_block_kernel(int d1, int d2, T* F21, T* F22) { + using cuda_primitive_t = typename primitive_type::value_type; + __shared__ cuda_primitive_t B_[NT*NT], A_[NT*NT]; + T *B = reinterpret_cast(B_), *A = reinterpret_cast(A_); + int j = threadIdx.x, i = threadIdx.y; + A[j+i*NT] = B[j+i*NT] = 0.; + for (int cb=0; cb __global__ void + Schur_symmetric_block_kernel_batched(FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + Schur_symmetric_block_kernel(A.n1, A.n2, A.F21, A.F22); + } + + + template + void factor_symmetric_block_batch(unsigned int count, FrontData* dat, + bool replace, real_t thresh, int* dinfo) { + if (!count) return; + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(dat); + dim3 block(NT, NT); //, grid(count, 1, 1); + LLT_block_kernel_batched<<>> + (dat_, replace, thresh, dinfo); + gpu_check(cudaPeekAtLastError()); + solve_symmetric_block_kernel_batched<<>>(dat_); + gpu_check(cudaPeekAtLastError()); + Schur_symmetric_block_kernel_batched<<>>(dat_); + gpu_check(cudaPeekAtLastError()); + } + + + template __global__ void + solve_block_kernel_batched(int nrhs, FrontData* dat) { + FrontData& A = dat[blockIdx.x]; + solve_symmetric_block_kernel(A.n1, nrhs, A.F11, A.F12, A.piv); + } + + /** + * Single extend-add operation along the column dimension, for the + * solve. d1 is the size of F11, d2 is the size of F22. 
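+   * Each row r of the contribution block CB is scattered into the
+   * separator part b or the update part bupd of the parent right-hand
+   * side, according to the index I[r].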
+ */ + template __device__ void + ea_rhs_kernel(int r, int N, int nrhs, + int dsep, int dupd, int dCB, + T* b, T* bupd, T* CB, std::size_t* I) { + if (r >= dCB) return; + auto Ir = I[r]; + for (int c=0; c __global__ void + extend_add_rhs_kernel_left + (int N, int nrhs, unsigned int nf, AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB1) + ea_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB1, + f.F11, f.F21, f.CB1, f.I1); + } + template __global__ void + extend_add_rhs_kernel_right + (int N, int nrhs, unsigned int nf, AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB2) + ea_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB2, + f.F11, f.F21, f.CB2, f.I2); + } + + template void + extend_add_rhs(int N, int nrhs, unsigned int nf, + AssembleData* dat, AssembleData* ddat) { + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (du + nt - 1) / nt, nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int f=0; f>> + (N, nrhs, nf-f*ops, dat_+f*ops); + extend_add_rhs_kernel_right<<>> + (N, nrhs, nf-f*ops, dat_+f*ops); + } + gpu_check(cudaPeekAtLastError()); + } + + + /** + * Single extend-add operation along the column dimension, for the + * solve. d1 is the size of F11, d2 is the size of F22. + */ + template __device__ void + extract_rhs_kernel(int r, int N, int nrhs, + int dsep, int dupd, int dCB, + T* b, T* bupd, T* CB, std::size_t* I) { + if (r >= dCB) return; + auto Ir = I[r]; + for (int c=0; c __global__ void + extract_rhs_kernel(int N, int nrhs, unsigned int nf, + AssembleData* dat) { + int r = blockIdx.x * blockDim.x + threadIdx.x, + i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nf) return; + auto& f = dat[i]; + if (f.CB1) + extract_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB1, + f.F11, f.F21, f.CB1, f.I1); + if (f.CB2) + extract_rhs_kernel(r, N, nrhs, f.d1, f.d2, f.dCB2, + f.F11, f.F21, f.CB2, f.I2); + } + + template void + extract_rhs(int N, int nrhs, unsigned int nf, AssembleData* dat, + AssembleData* ddat) { + int du = 0; + for (unsigned int f=0; f du && ops < 64) { + nt /= 2; + ops *= 2; + } + ops = std::min(ops, nf); + unsigned int nb = (du + nt - 1) / nt, nbf = (nf + ops - 1) / ops; + dim3 block(nt, ops); + using T_ = typename cuda_type::value_type; + auto dat_ = reinterpret_cast*>(ddat); + for (unsigned int f=0; f>> + (N, nrhs, nf-f*ops, dat_+f*ops); + } + } + + + + // explicit template instantiations + template void assemble(unsigned int, AssembleData*, AssembleData*); + template void assemble(unsigned int, AssembleData*, AssembleData*); + template void assemble(unsigned int, AssembleData>*, AssembleData>*); + template void assemble(unsigned int, AssembleData>*, AssembleData>*); + + template void assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template void assemble_symmetric(unsigned int, AssembleData*, AssembleData*); + template void assemble_symmetric(unsigned int, AssembleData>*, AssembleData>*); + template void assemble_symmetric(unsigned int, AssembleData>*, AssembleData>*); + + template void extend_add_rhs(int, int, unsigned int, AssembleData*, AssembleData*); + template void extend_add_rhs(int, int, unsigned int, AssembleData*, AssembleData*); + template void 
extend_add_rhs(int, int, unsigned int, AssembleData<std::complex<float>>*, AssembleData<std::complex<float>>*);
+  template void extend_add_rhs(int, int, unsigned int, AssembleData<std::complex<double>>*, AssembleData<std::complex<double>>*);
+
+  template void extract_rhs(int, int, unsigned int, AssembleData<float>*, AssembleData<float>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<double>*, AssembleData<double>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<std::complex<float>>*, AssembleData<std::complex<float>>*);
+  template void extract_rhs(int, int, unsigned int, AssembleData<std::complex<double>>*, AssembleData<std::complex<double>>*);
+
+
+  template void factor_symmetric_block_batch<float,8,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,8,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,8,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,8,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,16,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,16,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,16,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,16,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,24,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,24,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,24,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,24,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void factor_symmetric_block_batch<float,32,float>(unsigned int, FrontData<float>*, bool, float, int*);
+  template void factor_symmetric_block_batch<double,32,double>(unsigned int, FrontData<double>*, bool, double, int*);
+  template void factor_symmetric_block_batch<std::complex<float>,32,float>(unsigned int, FrontData<std::complex<float>>*, bool, float, int*);
+  template void factor_symmetric_block_batch<std::complex<double>,32,double>(unsigned int, FrontData<std::complex<double>>*, bool, double, int*);
+
+  template void replace_pivots(int, float*, float, gpu::Stream*);
+  template void replace_pivots(int, double*, double, gpu::Stream*);
+  template void replace_pivots(int, std::complex<float>*, float, gpu::Stream*);
+  template void replace_pivots(int, std::complex<double>*, double, gpu::Stream*);
+
+  } // end namespace gpu
+} // end namespace strumpack
diff --git a/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp
new file mode 100644
index 00000000..3fbf3b86
--- /dev/null
+++ b/src/sparse/fronts/FrontalMatrixGPUSymmetricPositiveDefinite.hpp
@@ -0,0 +1,111 @@
+//
+// Created by tingxuan on 23-6-19.
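+//
+// Declares FrontalMatrixGPUSymmetricPositiveDefinite, a GPU frontal
+// matrix type for symmetric positive definite problems, meant to use
+// the batched Cholesky (LL^T) kernels added in this patch rather than
+// LU with pivoting.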
+//
+
+#pragma once
+
+#include "FrontalMatrixDense.hpp"
+
+#if defined(STRUMPACK_USE_CUDA)
+#include "dense/CUDAWrapper.hpp"
+#endif
+#if defined(STRUMPACK_USE_HIP)
+#include "dense/HIPWrapper.hpp"
+#endif
+#if defined(STRUMPACK_USE_SYCL)
+#include "dense/DPCPPWrapper.hpp"
+#endif
+
+namespace strumpack {
+
+  template<typename scalar_t, typename integer_t> class LevelInfoUnified;
+
+  namespace gpu {
+    template<typename T> struct FrontData;
+    // template<typename T> struct FwdSolveData;
+  }
+
+
+  template<typename scalar_t, typename integer_t> class FrontalMatrixGPUSymmetricPositiveDefinite
+    : public FrontalMatrix<scalar_t,integer_t> {
+    using F_t = FrontalMatrix<scalar_t,integer_t>;
+    using FG_t = FrontalMatrixGPUSymmetricPositiveDefinite<scalar_t,integer_t>;
+    using DenseM_t = DenseMatrix<scalar_t>;
+    using DenseMW_t = DenseMatrixWrapper<scalar_t>;
+    using SpMat_t = CompressedSparseMatrix<scalar_t,integer_t>;
+    using LInfo_t = LevelInfoUnified<scalar_t,integer_t>;
+
+  public:
+    FrontalMatrixGPUSymmetricPositiveDefinite(integer_t sep, integer_t sep_begin, integer_t sep_end,
+                                              std::vector<integer_t>& upd);
+    ~FrontalMatrixGPUSymmetricPositiveDefinite();
+
+    long long dense_node_factor_nonzeros() const override;
+
+    void release_work_memory() override;
+
+    void extend_add_to_dense(DenseM_t& paF11, DenseM_t& paF21, DenseM_t& paF22,
+                             const F_t* p, int task_depth) override;
+
+    ReturnCode multifrontal_factorization(const SpMat_t& A,
+                                          const SPOptions<scalar_t>& opts,
+                                          int etree_level=0,
+                                          int task_depth=0) override;
+
+    ReturnCode multifrontal_factorization_symmetric(const SpMat_t& A,
+                                                    const SPOptions<scalar_t>& opts,
+                                                    int etree_level=0,
+                                                    int task_depth=0);
+
+    void extract_CB_sub_matrix(const std::vector<std::size_t>& I,
+                               const std::vector<std::size_t>& J,
+                               DenseM_t& B, int task_depth) const override {}
+
+    std::string type() const override { return "FrontalMatrixGPU"; }
+    bool isGPU() const override { return true; }
+
+#if defined(STRUMPACK_USE_MPI)
+    void
+    extend_add_copy_to_buffers(std::vector<std::vector<scalar_t>>& sbuf,
+                               const FrontalMatrixMPI<scalar_t,integer_t>* pa)
+      const override;
+#endif
+
+  private:
+    std::unique_ptr<scalar_t[]> host_factors_, host_Schur_;
+    std::unique_ptr<scalar_t[], std::default_delete<scalar_t[]>> host_factors_diagonal_{nullptr, std::default_delete<scalar_t[]>{}}, host_factors_off_diagonal_{nullptr, std::default_delete<scalar_t[]>{}};
+    DenseMW_t F11_, F12_, F21_, F22_;
+    std::vector<int> pivot_mem_;
+    int* piv_ = nullptr;
+
+    FrontalMatrixGPUSymmetricPositiveDefinite(const FrontalMatrixGPUSymmetricPositiveDefinite&) = delete;
+    FrontalMatrixGPUSymmetricPositiveDefinite& operator=(FrontalMatrixGPUSymmetricPositiveDefinite const&) = delete;
+
+    void front_assembly(const SpMat_t& A, LInfo_t& L,
+                        char* hea_mem, char* dea_mem);
+
+    void factor_small_fronts(LInfo_t& L, gpu::FrontData<scalar_t>* fdata,
+                             int* dinfo, const SPOptions<scalar_t>& opts);
+
+    ReturnCode split_smaller(const SpMat_t& A, const SPOptions<scalar_t>& opts,
+                             int etree_level=0, int task_depth=0);
+
+    void fwd_solve_phase2(DenseM_t& b, DenseM_t& bupd,
+                          int etree_level, int task_depth) const override;
+    void bwd_solve_phase1(DenseM_t& y, DenseM_t& yupd,
+                          int etree_level, int task_depth) const override;
+
+    ReturnCode node_inertia(integer_t& neg,
+                            integer_t& zero,
+                            integer_t& pos) const override;
+
+    using F_t::lchild_;
+    using F_t::rchild_;
+    using F_t::dim_sep;
+    using F_t::dim_upd;
+
+    template<typename T, typename I> friend class LevelInfoUnified;
+  };
+
+} // end namespace strumpack
+
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 676a4075..4fedf1e7 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,17 +2,25 @@ add_executable(test_HSS_seq test_HSS_seq.cpp)
 add_executable(test_sparse_seq test_sparse_seq.cpp)
 add_executable(test_BLR_seq test_BLR_seq.cpp)
 add_executable(test_matrix_IO test_matrix_IO.cpp)
+add_executable(test_SPD_seq test_SPD_seq.cpp)
+add_executable(test_SPD_mixedPrecision
test_SPD_mixedPrecision.cpp) target_link_libraries(test_HSS_seq strumpack) target_link_libraries(test_sparse_seq strumpack) target_link_libraries(test_BLR_seq strumpack) target_link_libraries(test_matrix_IO strumpack) +target_link_libraries(test_SPD_seq strumpack) +target_link_libraries(test_SPD_mixedPrecision strumpack) add_test("user_test_HSS_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_HSS_seq T 100) add_test("user_test_sparse_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_sparse_seq ${PROJECT_SOURCE_DIR}/examples/sparse/data/pde900.mtx) add_test("user_matrix_IO" ${CMAKE_CURRENT_BINARY_DIR}/test_matrix_IO T 1000) add_test("user_test_BLR_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_BLR_seq 300) +add_test("user_test_SPD_seq" ${CMAKE_CURRENT_BINARY_DIR}/test_SPD_seq + ${PROJECT_SOURCE_DIR}/examples/sparse/data/bcsstm08.mtx) +add_test("user_test_SPD_mixedPrecision" ${CMAKE_CURRENT_BINARY_DIR}/test_SPD_mixedPrecision + ${PROJECT_SOURCE_DIR}/examples/sparse/data/bcsstm08.mtx) if(STRUMPACK_USE_MPI) add_executable(test_HSS_mpi test_HSS_mpi.cpp) diff --git a/test/test_SPD_mixedPrecision.cpp b/test/test_SPD_mixedPrecision.cpp new file mode 100644 index 00000000..b94add1b --- /dev/null +++ b/test/test_SPD_mixedPrecision.cpp @@ -0,0 +1,200 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). + * + */ +#include +#include +using namespace std; + +#include "StrumpackSparseSolver.hpp" +#include "StrumpackSparseSolverMixedPrecision.hpp" +#include "misc/RandomWrapper.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + +#define ERROR_TOLERANCE 1e2 +#define SOLVE_TOLERANCE 1e-12 + +template +void test(CSRMatrix &A, DenseMatrix &b, + DenseMatrix &x_exact, int argc, char *argv[]) { + int m = b.cols(); // number of right-hand sides + auto N = A.size(); + DenseMatrix x(N, m); + + std::cout << std::endl; + std::cout << "###############################################" << std::endl; + std::cout << "### Working precision: " + << (std::is_same::value ? 
"single" : "double") + << " #################" << std::endl; + std::cout << "###############################################" << std::endl; + + { + std::cout << std::endl; + std::cout << "### MIXED Precision Solver ####################" << std::endl; + + SparseSolverMixedPrecision spss; + /** options for the outer solver */ + spss.options().set_Krylov_solver(KrylovSolver::REFINE); + // spss.options().set_Krylov_solver(KrylovSolver::PREC_BICGSTAB); + // spss.options().set_Krylov_solver(KrylovSolver::PREC_GMRES); + spss.options().set_rel_tol(1e-14); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_from_command_line(argc, argv); + + /** options for the inner solver */ + spss.solver().options().set_Krylov_solver(KrylovSolver::DIRECT); + spss.solver().options().set_from_command_line(argc, argv); + spss.solver().options().set_matching(strumpack::MatchingJob::NONE); + spss.solver().options().enable_symmetric(); + spss.solver().options().enable_positive_definite(); + + spss.set_lower_triangle_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) + << std::endl; + } + + { + std::cout << std::endl; + std::cout << "### STANDARD solver ###########################" << std::endl; + + SparseSolver spss; + + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_from_command_line(argc, argv); + + spss.set_lower_triangle_matrix(A); + spss.reorder(); + spss.factor(); + spss.solve(b, x); + + std::cout << "# COMPONENTWISE SCALED RESIDUAL = " + << A.max_scaled_residual(x.data(), b.data()) << std::endl; + strumpack::blas::axpy(N, -1., x_exact.data(), 1, x.data(), 1); + auto nrm_error = strumpack::blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = strumpack::blas::nrm2(N, x_exact.data(), 1); + std::cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) + << std::endl; + } + + std::cout << std::endl << std::endl; +} + +template +int test_sparse_solver(int argc, char *argv[], + CSRMatrix &A_d) { + // set the exact solution, see: + // http://www.netlib.org/lapack/lawnspdf/lawn165.pdf + // page 20 + int N = A_d.size(); + int m = 1; // nr of RHSs + DenseMatrix b_d(N, m), x_true_d(N, m); + auto A_f = cast_matrix(A_d); + + std::default_random_engine gen; + std::uniform_real_distribution dist(0., std::sqrt(24.)); + for (int j = 0; j < m; j++) { + // step 4, use a different tau for each RHS + double tau = std::pow(dist(gen), 2.); + for (int i = 0; i < N; i++) + // step 4c + x_true_d(i, j) = std::pow(tau, -double(i) / (N - 1)); + } + + // step 6, but in double, not double-double + A_d.spmv(x_true_d, b_d); + { + // step 7, but in double, not double-double + SparseSolver spss; + // SparseSolverMixedPrecision spss; +// spss.set_lower_triangle_matrix(A_d); + spss.set_matrix(A_d); + spss.solve(b_d, x_true_d); + } + + // cast RHS and true solution to single precision + DenseMatrix b_f(N, m), x_true_f(N, m); + copy(x_true_d, x_true_f); + copy(b_d, b_f); + + test(A_d, b_d, x_true_d, argc, argv); + test(A_f, b_f, x_true_f, argc, argv); 
+ + return 0; +} + +template +int read_matrix_and_run_tests(int argc, char *argv[]) { + string f(argv[1]); + CSRMatrix A; + if (A.read_matrix_market(f) == 0) + return test_sparse_solver(argc, argv, A); + else { + CSRMatrix, integer_t> Acomplex; + if (Acomplex.read_matrix_market(f)) { + std::cerr << "Could not read matrix from file." << std::endl; + return 1; + } + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + cout + << "Solve a linear system with a matrix given in matrix market format\n" + << "using the sequential/multithreaded C++ STRUMPACK interface.\n\n" + << "Usage: \n\t./testMMdouble pde900.mtx" << endl; + return 1; + } + cout << "# Running with:\n# "; +#if defined(_OPENMP) + cout << "OMP_NUM_THREADS=" << omp_get_max_threads() << " "; +#endif + for (int i = 0; i < argc; i++) + cout << argv[i] << " "; + cout << endl; + + int ierr = 0; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + return ierr; +} diff --git a/test/test_SPD_seq.cpp b/test/test_SPD_seq.cpp new file mode 100644 index 00000000..1b86d4d6 --- /dev/null +++ b/test/test_SPD_seq.cpp @@ -0,0 +1,130 @@ +/* + * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The + * Regents of the University of California, through Lawrence Berkeley + * National Laboratory (subject to receipt of any required approvals + * from the U.S. Dept. of Energy). All rights reserved. + * + * If you have questions about your rights to use or distribute this + * software, please contact Berkeley Lab's Technology Transfer + * Department at TTD@lbl.gov. + * + * NOTICE. This software is owned by the U.S. Department of Energy. As + * such, the U.S. Government has been granted for itself and others + * acting on its behalf a paid-up, nonexclusive, irrevocable, + * worldwide license in the Software to reproduce, prepare derivative + * works, and perform publicly and display publicly. Beginning five + * (5) years after the date permission to assert copyright is obtained + * from the U.S. Department of Energy, and subject to any subsequent + * five (5) year renewals, the U.S. Government is granted for itself + * and others acting on its behalf a paid-up, nonexclusive, + * irrevocable, worldwide license in the Software to reproduce, + * prepare derivative works, distribute copies to the public, perform + * publicly and display publicly, and to permit others to do so. + * + * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li. + * (Lawrence Berkeley National Lab, Computational Research + * Division). 
+ * + */ +#include +#include +using namespace std; + +#include "StrumpackSparseSolver.hpp" +#include "misc/RandomWrapper.hpp" +#include "sparse/CSRMatrix.hpp" + +using namespace strumpack; + +#define ERROR_TOLERANCE 1e2 +#define SOLVE_TOLERANCE 1e-12 + +template +int test_SPD_solver(int argc, const char *const argv[], + CSRMatrix &A) { + using real_t = typename RealType::value_type; + StrumpackSparseSolver spss; + spss.options().set_from_command_line(argc, argv); + + int N = A.size(); + vector b(N), x(N), x_exact(N); + { + auto rgen = random::make_default_random_generator(); + for (auto &xi : x_exact) + xi = rgen->get(); + } + A.spmv(x_exact.data(), b.data()); + + spss.set_lower_triangle_matrix(A); + spss.options().enable_symmetric(); + spss.options().enable_positive_definite(); + spss.options().set_matching(strumpack::MatchingJob::NONE); + spss.options().set_Krylov_solver(KrylovSolver::DIRECT); + if (spss.reorder() != ReturnCode::SUCCESS) { + cout << "problem with reordering of the matrix." << endl; + return 1; + } + if (spss.factor() != ReturnCode::SUCCESS) { + cout << "problem during factorization of the matrix." << endl; + return 1; + } + spss.solve(b.data(), x.data()); + + auto comp_scal_res = A.max_scaled_residual(x.data(), b.data()); + cout << "# COMPONENTWISE SCALED RESIDUAL = " << comp_scal_res << endl; + + blas::axpy(N, scalar_t(-1.), x_exact.data(), 1, x.data(), 1); + auto nrm_error = blas::nrm2(N, x.data(), 1); + auto nrm_x_exact = blas::nrm2(N, x_exact.data(), 1); + cout << "# RELATIVE ERROR = " << (nrm_error / nrm_x_exact) << endl; + + if (comp_scal_res > ERROR_TOLERANCE * spss.options().rel_tol()) { + cout << "RESIDUAL TOO LARGE!" << endl; + return 1; + } + return 0; +} + +template +int read_matrix_and_run_tests(int argc, const char *const argv[]) { + string f(argv[1]); + CSRMatrix A; + if (A.read_matrix_market(f) == 0) + return test_SPD_solver(argc, argv, A); + else { + CSRMatrix, integer_t> Acomplex; + if (Acomplex.read_matrix_market(f)) { + std::cerr << "Could not read matrix from file." << std::endl; + return 1; + } + return test_SPD_solver(argc, argv, Acomplex); + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + cout + << "Solve a linear system with a matrix given in matrix market format\n" + << "using the sequential/multithreaded C++ STRUMPACK interface.\n\n" + << "Usage: \n\t./testMMdouble bcsstm08.mtx" << endl; + return 1; + } + cout << "# Running with:\n# "; +#if defined(_OPENMP) + cout << "OMP_NUM_THREADS=" << omp_get_max_threads() << " "; +#endif + for (int i = 0; i < argc; i++) + cout << argv[i] << " "; + cout << endl; + + int ierr = 0; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + if (ierr) + return ierr; + // ierr = read_matrix_and_run_tests(argc, argv); + // if (ierr) return ierr; + ierr = read_matrix_and_run_tests(argc, argv); + return ierr; +}
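+
+// Example invocation (the CTest entry added in test/CMakeLists.txt passes the
+// bcsstm08.mtx matrix from examples/sparse/data):
+//   ./test_SPD_seq bcsstm08.mtx
+// Any additional solver options given on the command line are forwarded to
+// the solver through set_from_command_line().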