Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add triangular operators for AMD and NVIDIA GPUs #41

Merged
merged 5 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 97 additions & 11 deletions ext/AMDGPU/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,19 @@ for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat),
@eval begin
function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
m,n = size(A)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
if nrhs == 1
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
descX = rocSPARSE.ROCDenseVectorDescriptor(T, n)
descY = rocSPARSE.ROCDenseVectorDescriptor(T, m)
algo = rocSPARSE.rocSPARSE.rocsparse_spmv_alg_default
buffer_size = Ref{Csize_t}()
rocSPARSE.rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size, C_NULL)
rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX,
beta, descY, T, algo, buffer_size, C_NULL)
buffer = ROCVector{UInt8}(undef, buffer_size[])
return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
else
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs)
descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs)
algo = rocSPARSE.rocsparse_spmm_alg_default
Expand Down Expand Up @@ -62,8 +60,10 @@ function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVec
descY = rocSPARSE.ROCDenseVectorDescriptor(y)
descX = rocSPARSE.ROCDenseVectorDescriptor(x)
algo = rocSPARSE.rocsparse_spmv_alg_default
rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX,
Ref{T}(zero(T)), descY, T, algo, A.buffer_size, A.buffer)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, alpha, A.descA, descX,
beta, descY, T, algo, A.buffer_size, A.buffer)
end

function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat
Expand All @@ -75,6 +75,92 @@ function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMat
descY = rocSPARSE.ROCDenseMatrixDescriptor(Y)
descX = rocSPARSE.ROCDenseMatrixDescriptor(X)
algo = rocSPARSE.rocsparse_spmm_alg_default
rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX,
Ref{T}(zero(T)), descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute, A.buffer_size, A.buffer)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', alpha, A.descA, descX,
beta, descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute, A.buffer_size, A.buffer)
end

# GPU-resident triangular operator for AMD devices.  Wraps a rocSPARSE matrix
# descriptor together with the work buffer sized/filled during the SpSV/SpSM
# buffer-size and preprocess stages, so repeated `ldiv!` calls only run the
# compute stage.  Mutable so `update!` can swap descriptor pointers in place.
mutable struct AMD_TriangularOperator{T} <: AbstractTriangularOperator{T}
type::Type{T}  # scalar element type (same as the T parameter)
m::Int         # number of rows of A
n::Int         # number of columns of A
nrhs::Int      # number of right-hand sides the analysis was performed for
transa::Char   # operation applied to A ('N' by default at construction)
descA::rocSPARSE.ROCSparseMatrixDescriptor  # rocSPARSE descriptor of A
buffer_size::Ref{Csize_t}                   # size of `buffer`, as reported by rocSPARSE
buffer::ROCVector{UInt8}                    # device work buffer reused by every solve
end

# Array-interface accessors for the operator.
# NOTE(review): written without a `Base.` qualifier — assumes `Base.eltype` and
# `Base.size` are imported into this module elsewhere in the file; otherwise
# these define module-local shadows. TODO confirm against the file's imports.
eltype(A::AMD_TriangularOperator{T}) where T = T
size(A::AMD_TriangularOperator) = (A.m, A.n)

for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat),
                                     (:(ROCSparseMatrixCOO{T}), :BlasFloat))
    @eval begin
        # Build a triangular operator backed by rocSPARSE SpSV (nrhs == 1) or
        # SpSM (nrhs > 1).  `uplo` selects the lower ('L') or upper ('U')
        # triangle and `diag` marks the diagonal as unit ('U') or not ('N').
        # The buffer-size and preprocess stages run once here so that every
        # subsequent `ldiv!` only executes the compute stage.
        function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType
            m,n = size(A)
            alpha = Ref{T}(one(T))
            descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
            # Record fill mode and diagonal type on the matrix descriptor.
            rocsparse_uplo = Ref{rocSPARSE.rocsparse_fill_mode}(uplo)
            rocsparse_diag = Ref{rocSPARSE.rocsparse_diag_type}(diag)
            rocSPARSE.rocsparse_spmat_set_attribute(descA, rocSPARSE.rocsparse_spmat_fill_mode, rocsparse_uplo, Csize_t(sizeof(rocsparse_uplo)))
            rocSPARSE.rocsparse_spmat_set_attribute(descA, rocSPARSE.rocsparse_spmat_diag_type, rocsparse_diag, Csize_t(sizeof(rocsparse_diag)))
            if nrhs == 1
                # Single right-hand side: sparse triangular solve with a vector.
                descX = rocSPARSE.ROCDenseVectorDescriptor(T, n)
                descY = rocSPARSE.ROCDenseVectorDescriptor(T, m)
                algo = rocSPARSE.rocsparse_spsv_alg_default
                buffer_size = Ref{Csize_t}()
                rocSPARSE.rocsparse_spsv(rocSPARSE.handle(), transa, alpha, descA, descX, descY, T, algo,
                                         rocSPARSE.rocsparse_spsv_stage_buffer_size, buffer_size, C_NULL)
                buffer = ROCVector{UInt8}(undef, buffer_size[])
                rocSPARSE.rocsparse_spsv(rocSPARSE.handle(), transa, alpha, descA, descX, descY, T, algo,
                                         rocSPARSE.rocsparse_spsv_stage_preprocess, buffer_size, buffer)
                return AMD_TriangularOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
            else
                # Multiple right-hand sides: sparse triangular solve with a matrix.
                descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs)
                descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs)
                algo = rocSPARSE.rocsparse_spsm_alg_default
                buffer_size = Ref{Csize_t}()
                rocSPARSE.rocsparse_spsm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, descY, T, algo,
                                         rocSPARSE.rocsparse_spsm_stage_buffer_size, buffer_size, C_NULL)
                buffer = ROCVector{UInt8}(undef, buffer_size[])
                rocSPARSE.rocsparse_spsm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, descY, T, algo,
                                         rocSPARSE.rocsparse_spsm_stage_preprocess, buffer_size, buffer)
                return AMD_TriangularOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
            end
        end

        # Point the stored descriptor at B's arrays so the operator reuses the
        # analysis done at construction with new values (same sparsity pattern
        # assumed — TODO confirm rocSPARSE requires an identical pattern here).
        # Fixed: interpolate the loop variable `$BlasType` (was `$BlasFloat`,
        # which only worked because the global `BlasFloat` binding coincides
        # with the interpolated symbol's value) — consistent with the
        # constructor above.
        function KP.update!(A::AMD_TriangularOperator{T}, B::$SparseMatrixType) where T <: $BlasType
            (B isa ROCSparseMatrixCOO) && rocSPARSE.rocsparse_coo_set_pointers(A.descA, B.rowInd, B.colInd, B.nzVal)
            (B isa ROCSparseMatrixCSR) && rocSPARSE.rocsparse_csr_set_pointers(A.descA, B.rowPtr, B.colVal, B.nzVal)
            return A
        end
    end
end

# Triangular solve y = op(A) \ x on an AMD GPU (single right-hand side).
# Only the SpSV compute stage runs here; buffer sizing and preprocessing were
# performed when the operator was constructed.
function LinearAlgebra.ldiv!(y::ROCVector{T}, A::AMD_TriangularOperator{T}, x::ROCVector{T}) where T <: BlasFloat
    length(y) == A.m || throw(DimensionMismatch("length(y) != A.m"))
    length(x) == A.n || throw(DimensionMismatch("length(x) != A.n"))
    A.nrhs == 1 || throw(DimensionMismatch("A.nrhs != 1"))
    one_T = Ref{T}(one(T))
    y_desc = rocSPARSE.ROCDenseVectorDescriptor(y)
    x_desc = rocSPARSE.ROCDenseVectorDescriptor(x)
    spsv_algo = rocSPARSE.rocsparse_spsv_alg_default
    rocSPARSE.rocsparse_spsv(rocSPARSE.handle(), A.transa, one_T, A.descA, x_desc, y_desc, T,
                             spsv_algo, rocSPARSE.rocsparse_spsv_stage_compute, A.buffer_size, A.buffer)
end

# Triangular solve Y = op(A) \ X on an AMD GPU (multiple right-hand sides).
# Runs only the SpSM compute stage; the analysis happened at construction.
function LinearAlgebra.ldiv!(Y::ROCMatrix{T}, A::AMD_TriangularOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat
    (rowsY, colsY) = size(Y)
    (rowsX, colsX) = size(X)
    rowsY == A.m || throw(DimensionMismatch("mY != A.m"))
    rowsX == A.n || throw(DimensionMismatch("mX != A.n"))
    (colsY == colsX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
    one_T = Ref{T}(one(T))
    Y_desc = rocSPARSE.ROCDenseMatrixDescriptor(Y)
    X_desc = rocSPARSE.ROCDenseMatrixDescriptor(X)
    spsm_algo = rocSPARSE.rocsparse_spsm_alg_default
    rocSPARSE.rocsparse_spsm(rocSPARSE.handle(), A.transa, 'N', one_T, A.descA, X_desc, Y_desc, T,
                             spsm_algo, rocSPARSE.rocsparse_spsm_stage_compute, A.buffer_size, A.buffer)
end
108 changes: 97 additions & 11 deletions ext/CUDA/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat),
@eval begin
function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
m,n = size(A)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
if nrhs == 1
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
descX = CUSPARSE.CuDenseVectorDescriptor(T, n)
descY = CUSPARSE.CuDenseVectorDescriptor(T, m)
algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
Expand All @@ -29,18 +29,14 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat),
buffer = CuVector{UInt8}(undef, buffer_size[])
return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
else
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs)
descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs)
algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
buffer_size = Ref{Csize_t}()
transb = 'N'
CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size)
CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, algo, buffer_size)
buffer = CuVector{UInt8}(undef, buffer_size[])
if !(A isa CuSparseMatrixCOO)
CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer)
CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, algo, buffer)
end
return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
end
Expand All @@ -61,7 +57,9 @@ function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVe
descY = CUSPARSE.CuDenseVectorDescriptor(y)
descX = CUSPARSE.CuDenseVectorDescriptor(x)
algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, alpha, A.descA, descX, beta, descY, T, algo, A.buffer)
end

function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat
Expand All @@ -73,5 +71,93 @@ function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMa
descY = CUSPARSE.CuDenseMatrixDescriptor(Y)
descX = CUSPARSE.CuDenseMatrixDescriptor(X)
algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer)
alpha = Ref{T}(one(T))
beta = Ref{T}(zero(T))
CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', alpha, A.descA, descX, beta, descY, T, algo, A.buffer)
end

# GPU-resident triangular operator for NVIDIA devices.  The second type
# parameter S is the cuSPARSE solve descriptor type: CuSparseSpSVDescriptor
# when the operator was built with nrhs == 1, CuSparseSpSMDescriptor
# otherwise.  Mutable so `update!` can replace `descA`.
mutable struct NVIDIA_TriangularOperator{T,S} <: AbstractTriangularOperator{T}
type::Type{T}  # scalar element type (same as the T parameter)
m::Int         # number of rows of A
n::Int         # number of columns of A
nrhs::Int      # number of right-hand sides the analysis was performed for
transa::Char   # operation applied to A ('N' by default at construction)
descA::CUSPARSE.CuSparseMatrixDescriptor  # cuSPARSE descriptor of A
descT::S                                  # SpSV/SpSM descriptor holding the analysis
buffer::CuVector{UInt8}                   # device work buffer reused by every solve
end

# Array-interface accessors for the operator.
# NOTE(review): written without a `Base.` qualifier — assumes `Base.eltype` and
# `Base.size` are imported into this module elsewhere in the file; otherwise
# these define module-local shadows. TODO confirm against the file's imports.
eltype(A::NVIDIA_TriangularOperator{T}) where T = T
size(A::NVIDIA_TriangularOperator) = (A.m, A.n)

for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat),
                                     (:(CuSparseMatrixCOO{T}), :BlasFloat))
    @eval begin
        # Build a triangular operator backed by cuSPARSE SpSV (nrhs == 1) or
        # SpSM (nrhs > 1).  The buffer-size and analysis stages run once here
        # so that every subsequent `ldiv!` only executes the solve stage.
        function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType
            m,n = size(A)
            alpha = Ref{T}(one(T))
            descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
            # Record fill mode ('F') and diagonal type ('D') on the descriptor.
            cusparse_uplo = Ref{CUSPARSE.cusparseFillMode_t}(uplo)
            cusparse_diag = Ref{CUSPARSE.cusparseDiagType_t}(diag)
            CUSPARSE.cusparseSpMatSetAttribute(descA, 'F', cusparse_uplo, Csize_t(sizeof(cusparse_uplo)))
            CUSPARSE.cusparseSpMatSetAttribute(descA, 'D', cusparse_diag, Csize_t(sizeof(cusparse_diag)))
            if nrhs == 1
                # Single right-hand side: sparse triangular solve with a vector.
                descT = CUSPARSE.CuSparseSpSVDescriptor()
                descX = CUSPARSE.CuDenseVectorDescriptor(T, n)
                descY = CUSPARSE.CuDenseVectorDescriptor(T, m)
                algo = CUSPARSE.CUSPARSE_SPSV_ALG_DEFAULT
                buffer_size = Ref{Csize_t}()
                CUSPARSE.cusparseSpSV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, descY, T, algo, descT, buffer_size)
                buffer = CuVector{UInt8}(undef, buffer_size[])
                CUSPARSE.cusparseSpSV_analysis(CUSPARSE.handle(), transa, alpha, descA, descX, descY, T, algo, descT, buffer)
                return NVIDIA_TriangularOperator{T,CUSPARSE.CuSparseSpSVDescriptor}(T, m, n, nrhs, transa, descA, descT, buffer)
            else
                # Multiple right-hand sides: sparse triangular solve with a matrix.
                descT = CUSPARSE.CuSparseSpSMDescriptor()
                descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs)
                descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs)
                algo = CUSPARSE.CUSPARSE_SPSM_ALG_DEFAULT
                buffer_size = Ref{Csize_t}()
                CUSPARSE.cusparseSpSM_bufferSize(CUSPARSE.handle(), transa, 'N', alpha, descA, descX, descY, T, algo, descT, buffer_size)
                buffer = CuVector{UInt8}(undef, buffer_size[])
                CUSPARSE.cusparseSpSM_analysis(CUSPARSE.handle(), transa, 'N', alpha, descA, descX, descY, T, algo, descT, buffer)
                return NVIDIA_TriangularOperator{T,CUSPARSE.CuSparseSpSMDescriptor}(T, m, n, nrhs, transa, descA, descT, buffer)
            end
        end

        # Refresh a SpSV operator's values without redoing the analysis.
        # The guard checks the cuSPARSE *library* version; v12.2 presumably
        # corresponds to the CUDA v12.3 toolkit mentioned in the message —
        # TODO confirm the library/toolkit version mapping.
        # Fixed: grammar in the user-facing error message, and `$BlasType`
        # (was `$BlasFloat`) for consistency with the constructor above.
        function KP.update!(A::NVIDIA_TriangularOperator{T,CUSPARSE.CuSparseSpSVDescriptor}, B::$SparseMatrixType) where T <: $BlasType
            CUSPARSE.version() ≥ v"12.2" || error("This operation is only supported by CUDA ≥ v12.3")
            descB = CUSPARSE.CuSparseMatrixDescriptor(B, 'O')
            A.descA = descB
            CUSPARSE.cusparseSpSV_updateMatrix(CUSPARSE.handle(), A.descT, B.nzVal, 'G')
            return A
        end

        # SpSM operators (nrhs > 1) cannot be updated in place yet.
        function KP.update!(A::NVIDIA_TriangularOperator{T,CUSPARSE.CuSparseSpSMDescriptor}, B::$SparseMatrixType) where T <: $BlasType
            return error("This operation will be supported in CUDA v12.4")
        end
    end
end

# Triangular solve y = op(A) \ x on an NVIDIA GPU (single right-hand side).
# Only the SpSV solve stage runs here; sizing and analysis were performed
# when the operator was constructed.
function LinearAlgebra.ldiv!(y::CuVector{T}, A::NVIDIA_TriangularOperator{T}, x::CuVector{T}) where T <: BlasFloat
    length(y) == A.m || throw(DimensionMismatch("length(y) != A.m"))
    length(x) == A.n || throw(DimensionMismatch("length(x) != A.n"))
    A.nrhs == 1 || throw(DimensionMismatch("A.nrhs != 1"))
    one_T = Ref{T}(one(T))
    y_desc = CUSPARSE.CuDenseVectorDescriptor(y)
    x_desc = CUSPARSE.CuDenseVectorDescriptor(x)
    spsv_algo = CUSPARSE.CUSPARSE_SPSV_ALG_DEFAULT
    CUSPARSE.cusparseSpSV_solve(CUSPARSE.handle(), A.transa, one_T, A.descA, x_desc, y_desc, T, spsv_algo, A.descT)
end

# Triangular solve Y = op(A) \ X on an NVIDIA GPU (multiple right-hand sides).
# Runs only the SpSM solve stage; the analysis happened at construction.
function LinearAlgebra.ldiv!(Y::CuMatrix{T}, A::NVIDIA_TriangularOperator{T}, X::CuMatrix{T}) where T <: BlasFloat
    (rowsY, colsY) = size(Y)
    (rowsX, colsX) = size(X)
    rowsY == A.m || throw(DimensionMismatch("mY != A.m"))
    rowsX == A.n || throw(DimensionMismatch("mX != A.n"))
    (colsY == colsX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
    one_T = Ref{T}(one(T))
    Y_desc = CUSPARSE.CuDenseMatrixDescriptor(Y)
    X_desc = CUSPARSE.CuDenseMatrixDescriptor(X)
    spsm_algo = CUSPARSE.CUSPARSE_SPSM_ALG_DEFAULT
    CUSPARSE.cusparseSpSM_solve(CUSPARSE.handle(), A.transa, 'N', one_T, A.descA, X_desc, Y_desc, T, spsm_algo, A.descT)
end
6 changes: 6 additions & 0 deletions src/KrylovPreconditioners.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ export AbstractKrylovPreconditioner
abstract type AbstractKrylovOperator{T} end
export AbstractKrylovOperator

# Root type for triangular operators; concrete subtypes live in the GPU
# extensions (AMD / NVIDIA) elsewhere in this package.
abstract type AbstractTriangularOperator{T} end
export AbstractTriangularOperator

# Fallback methods: concrete preconditioner/operator types are expected to
# provide their own `update!`; reaching these is a usage error.
update!(p::AbstractKrylovPreconditioner, A::SparseMatrixCSC) = error("update!() for $(typeof(p)) is not implemented.")
update!(p::AbstractKrylovPreconditioner, A) = error("update!() for $(typeof(p)) is not implemented.")
update!(p::AbstractKrylovOperator, A::SparseMatrixCSC) = error("update!() for $(typeof(p)) is not implemented.")
Expand All @@ -33,6 +36,9 @@ end
function KrylovOperator end
export KrylovOperator

"""
    TriangularOperator(A, uplo, diag; nrhs=1, transa='N')

Create a triangular operator from a sparse matrix `A`, for use with `ldiv!`.
`uplo` selects the lower (`'L'`) or upper (`'U'`) triangle and `diag` marks
the diagonal as unit (`'U'`) or not (`'N'`).  Methods are provided by the
GPU package extensions (AMD and NVIDIA).
"""
function TriangularOperator end
export TriangularOperator

# Preconditioners
include("ic0.jl")
include("ilu0.jl")
Expand Down
9 changes: 9 additions & 0 deletions test/gpu/amd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ include("gpu.jl")
end
end

# Triangular solves on AMD GPUs, for both supported sparse storage formats
# and both real and complex double precision.
@testset "TriangularOperator" begin
@testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
test_triangular(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCOO{FC})
end
@testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
test_triangular(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSR{FC})
end
end

@testset "Block Jacobi preconditioner" begin
test_block_jacobi(ROCBackend(), ROCArray, ROCSparseMatrixCSR)
end
Expand Down
65 changes: 65 additions & 0 deletions test/gpu/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,71 @@ function test_operator(FC, V, DM, SM)
end
end

# Exercise TriangularOperator against the CPU reference `ldiv!` for all four
# uplo/diag combinations.
#   FC : scalar type; V : GPU vector type; DM : GPU dense matrix type;
#   SM : GPU sparse matrix type.
function test_triangular(FC, V, DM, SM)
n = 100
for (uplo, diag, triangle) in [('L', 'U', UnitLowerTriangular),
('L', 'N', LowerTriangular ),
('U', 'U', UnitUpperTriangular),
('U', 'N', UpperTriangular )]
# Build a random triangle; force a unit diagonal when diag == 'U'; store sparse.
A_cpu = rand(FC, n, n)
A_cpu = uplo == 'L' ? tril(A_cpu) : triu(A_cpu)
A_cpu = diag == 'U' ? A_cpu - Diagonal(A_cpu) + I : A_cpu
A_cpu = sparse(A_cpu)
b_cpu = rand(FC, n)

A_gpu = SM(A_cpu)
b_gpu = V(b_cpu)
# Single right-hand side: GPU solve must match the CPU triangular solve.
opA_gpu = TriangularOperator(A_gpu, uplo, diag)
for i = 1:5
y_cpu = rand(FC, n)
x_cpu = rand(FC, n)
ldiv!(y_cpu, triangle(A_cpu), x_cpu)
y_gpu = V(y_cpu)
x_gpu = V(x_cpu)
ldiv!(y_gpu, opA_gpu, x_gpu)
@test collect(y_gpu) ≈ y_cpu
end
# Rescale the off-diagonal entries (same sparsity pattern) and check that
# update! refreshes the operator's values in place.
for j = 1:5
y_cpu = rand(FC, n)
x_cpu = rand(FC, n)
A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
ldiv!(y_cpu, triangle(A_cpu2), x_cpu)
y_gpu = V(y_cpu)
x_gpu = V(x_cpu)
A_gpu2 = SM(A_cpu2)
update!(opA_gpu, A_gpu2)
ldiv!(y_gpu, opA_gpu, x_gpu)
@test collect(y_gpu) ≈ y_cpu
end

# Multiple right-hand sides (matrix solve path).
nrhs = 3
opA_gpu = TriangularOperator(A_gpu, uplo, diag; nrhs)
for i = 1:5
Y_cpu = rand(FC, n, nrhs)
X_cpu = rand(FC, n, nrhs)
ldiv!(Y_cpu, triangle(A_cpu), X_cpu)
Y_gpu = DM(Y_cpu)
X_gpu = DM(X_cpu)
ldiv!(Y_gpu, opA_gpu, X_gpu)
@test collect(Y_gpu) ≈ Y_cpu
end
# Skip the multi-RHS update! test for CUDA: the NVIDIA extension's SpSM
# update! errors ("will be supported in CUDA v12.4").
# NOTE(review): `V.body.name.name` digs into UnionAll internals to detect
# CuArray — fragile across CUDA.jl/Julia versions; a dispatch- or
# string-based check would be sturdier. TODO confirm.
if V.body.name.name != :CuArray
for j = 1:5
Y_cpu = rand(FC, n, nrhs)
X_cpu = rand(FC, n, nrhs)
A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
ldiv!(Y_cpu, triangle(A_cpu2), X_cpu)
Y_gpu = DM(Y_cpu)
X_gpu = DM(X_cpu)
A_gpu2 = SM(A_cpu2)
update!(opA_gpu, A_gpu2)
ldiv!(Y_gpu, opA_gpu, X_gpu)
@test collect(Y_gpu) ≈ Y_cpu
end
end
end
end

# Dense vector type paired with a CPU SparseMatrixCSC in these tests
# (always Float64 storage for the CPU reference path).
_get_type(J::SparseMatrixCSC) = Vector{Float64}

function generate_random_system(n::Int, m::Int)
Expand Down
9 changes: 9 additions & 0 deletions test/gpu/nvidia.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ include("gpu.jl")
end
end

# Triangular solves on NVIDIA GPUs, for both supported sparse storage formats
# and both real and complex double precision.
@testset "TriangularOperator" begin
@testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
test_triangular(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCOO{FC})
end
@testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
test_triangular(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSR{FC})
end
end

@testset "Block Jacobi preconditioner" begin
test_block_jacobi(CUDABackend(), CuArray, CuSparseMatrixCSR)
end
Expand Down