Update IC(0) and ILU(0) for CUDA GPUs (#30)

JuliaSmoothOptimizers · Nov 30, 2023 · 80c493b · 80c493b
1 parent 9745f5f
commit 80c493b
Show file tree

Hide file tree

Showing 8 changed files with 164 additions and 38 deletions.
diff --git a/ext/AMDGPU/ic0.jl b/ext/AMDGPU/ic0.jl
@@ -8,7 +8,11 @@ for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T,Cint}), :BlasFloat)
   @eval begin
     function KP.kp_ic0(A::$SparseMatrixType) where T <: $BlasType
       P = rocSPARSE.ic0(A, 'O')
-      return AMD_IC0(P,0.0)
+      return AMD_IC0(P, 0.0)
+    end
+
+    function KP.update!(p::AMD_IC0{$SparseMatrixType}, A::$SparseMatrixType) where T <: $BlasType
+      p.P = rocSPARSE.ic0(A, 'O')
     end
   end
 end
@@ -44,7 +48,3 @@ for ArrayType in (:(ROCVector{T}), :(ROCMatrix{T}))
     end
   end
 end
-
-function KP.update!(p::AMD_IC0, A)
-    p.P = rocSPARSE.ic0(A, 'O')
-end
diff --git a/ext/AMDGPU/ilu0.jl b/ext/AMDGPU/ilu0.jl
@@ -8,7 +8,11 @@ for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T,Cint}), :BlasFloat)
   @eval begin
     function KP.kp_ilu0(A::$SparseMatrixType) where T <: $BlasType
       P = rocSPARSE.ilu0(A, 'O')
-      return AMD_ILU0(P,0.0)
+      return AMD_ILU0(P, 0.0)
+    end
+
+    function KP.update!(p::AMD_ILU0{$SparseMatrixType}, A::$SparseMatrixType) where T <: $BlasType
+      p.P = rocSPARSE.ilu0(A, 'O')
     end
   end
 end
@@ -44,7 +48,3 @@ for ArrayType in (:(ROCVector{T}), :(ROCMatrix{T}))
     end
   end
 end
-
-function KP.update!(p::AMD_ILU0, A)
-    p.P = rocSPARSE.ilu0(A, 'O')
-end
diff --git a/ext/CUDA/ic0.jl b/ext/CUDA/ic0.jl
@@ -1,15 +1,73 @@
+mutable struct IC0Info
+    info::CUSPARSE.csric02Info_t
+
+    function IC0Info()
+        info_ref = Ref{CUSPARSE.csric02Info_t}()
+        CUSPARSE.cusparseCreateCsric02Info(info_ref)
+        obj = new(info_ref[])
+        finalizer(CUSPARSE.cusparseDestroyCsric02Info, obj)
+        obj
+    end
+end
+
+unsafe_convert(::Type{CUSPARSE.csric02Info_t}, info::IC0Info) = info.info
+
 mutable struct NVIDIA_IC0{SM} <: AbstractKrylovPreconditioner
-  P::SM
+  n::Int
+  desc::CUSPARSE.CuMatrixDescriptor
+  buffer::CuVector{UInt8}
+  info::IC0Info
   timer_update::Float64
+  P::SM
 end
 
-for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat),
-                                     (:(CuSparseMatrixCSC{T,Cint}), :BlasReal))
+for (bname, aname, sname, T) in ((:cusparseScsric02_bufferSize, :cusparseScsric02_analysis, :cusparseScsric02, :Float32),
+                                 (:cusparseDcsric02_bufferSize, :cusparseDcsric02_analysis, :cusparseDcsric02, :Float64),
+                                 (:cusparseCcsric02_bufferSize, :cusparseCcsric02_analysis, :cusparseCcsric02, :ComplexF32),
+                                 (:cusparseZcsric02_bufferSize, :cusparseZcsric02_analysis, :cusparseZcsric02, :ComplexF64))
   @eval begin
-    function KP.kp_ic0(A::$SparseMatrixType) where T <: $BlasType
-      P = CUSPARSE.ic02(A)
-      n = checksquare(A)
-      return NVIDIA_IC0(P,0.0)
+    function KP.kp_ic0(A::CuSparseMatrixCSR{$T,Cint})
+      P = copy(A)
+      n = checksquare(P)
+      desc = CUSPARSE.CuMatrixDescriptor('G', 'L', 'N', 'O')
+      info = IC0Info()
+      buffer_size = Ref{Cint}()
+      CUSPARSE.$bname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, buffer_size)
+      buffer = CuVector{UInt8}(undef, buffer_size[])
+      CUSPARSE.$aname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      posit = Ref{Cint}(1)
+      CUSPARSE.cusparseXcsric02_zeroPivot(CUSPARSE.handle(), info, posit)
+      (posit[] ≥ 0) && error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+      CUSPARSE.$sname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      return NVIDIA_IC0(n, desc, buffer, info, 0.0, P)
+    end
+
+    function KP.update!(p::NVIDIA_IC0{CuSparseMatrixCSR{$T,Cint}}, A::CuSparseMatrixCSR{$T,Cint})
+      copyto!(p.P.nzVal, A.nzVal)
+      CUSPARSE.$sname(CUSPARSE.handle(), p.n, nnz(p.P), p.desc, p.P.nzVal, p.P.rowPtr, p.P.colVal, p.info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, p.buffer)
+      return p
+    end
+
+    function KP.kp_ic0(A::CuSparseMatrixCSC{$T,Cint})
+      P = copy(A)
+      n = checksquare(P)
+      desc = CUSPARSE.CuMatrixDescriptor('G', 'L', 'N', 'O')
+      info = IC0Info()
+      buffer_size = Ref{Cint}()
+      CUSPARSE.$bname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, buffer_size)
+      buffer = CuVector{UInt8}(undef, buffer_size[])
+      CUSPARSE.$aname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      posit = Ref{Cint}(1)
+      CUSPARSE.cusparseXcsric02_zeroPivot(CUSPARSE.handle(), info, posit)
+      (posit[] ≥ 0) && error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+      CUSPARSE.$sname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      return NVIDIA_IC0(n, desc, buffer, info, 0.0, P)
+    end
+
+    function KP.update!(p::NVIDIA_IC0{CuSparseMatrixCSC{$T,Cint}}, A::CuSparseMatrixCSC{$T,Cint})
+      copyto!(p.P.nzVal, A.nzVal)
+      CUSPARSE.$sname(CUSPARSE.handle(), p.n, nnz(p.P), p.desc, p.P.nzVal, p.P.colPtr, p.P.rowVal, p.info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, p.buffer)
+      return p
     end
   end
 end
@@ -45,8 +103,3 @@ for ArrayType in (:(CuVector{T}), :(CuMatrix{T}))
     end
   end
 end
-
-function KP.update!(p::NVIDIA_IC0, A)
-    p.P = CUSPARSE.ic02(A)
-end
-
diff --git a/ext/CUDA/ilu0.jl b/ext/CUDA/ilu0.jl
@@ -1,15 +1,73 @@
+mutable struct ILU0Info
+    info::CUSPARSE.csrilu02Info_t
+
+    function ILU0Info()
+        info_ref = Ref{CUSPARSE.csrilu02Info_t}()
+        CUSPARSE.cusparseCreateCsrilu02Info(info_ref)
+        obj = new(info_ref[])
+        finalizer(CUSPARSE.cusparseDestroyCsrilu02Info, obj)
+        obj
+    end
+end
+
+unsafe_convert(::Type{CUSPARSE.csrilu02Info_t}, info::ILU0Info) = info.info
+
 mutable struct NVIDIA_ILU0{SM} <: AbstractKrylovPreconditioner
-  P::SM
+  n::Int
+  desc::CUSPARSE.CuMatrixDescriptor
+  buffer::CuVector{UInt8}
+  info::ILU0Info
   timer_update::Float64
+  P::SM
 end
 
-for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat),
-                                     (:(CuSparseMatrixCSC{T,Cint}), :BlasReal))
+for (bname, aname, sname, T) in ((:cusparseScsrilu02_bufferSize, :cusparseScsrilu02_analysis, :cusparseScsrilu02, :Float32),
+                                 (:cusparseDcsrilu02_bufferSize, :cusparseDcsrilu02_analysis, :cusparseDcsrilu02, :Float64),
+                                 (:cusparseCcsrilu02_bufferSize, :cusparseCcsrilu02_analysis, :cusparseCcsrilu02, :ComplexF32),
+                                 (:cusparseZcsrilu02_bufferSize, :cusparseZcsrilu02_analysis, :cusparseZcsrilu02, :ComplexF64))
   @eval begin
-    function KP.kp_ilu0(A::$SparseMatrixType) where T <: $BlasType
-      P = CUSPARSE.ilu02(A)
-      n = checksquare(A)
-      return NVIDIA_ILU0(P,0.0)
+    function KP.kp_ilu0(A::CuSparseMatrixCSR{$T,Cint})
+      P = copy(A)
+      n = checksquare(P)
+      desc = CUSPARSE.CuMatrixDescriptor('G', 'L', 'N', 'O')
+      info = ILU0Info()
+      buffer_size = Ref{Cint}()
+      CUSPARSE.$bname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, buffer_size)
+      buffer = CuVector{UInt8}(undef, buffer_size[])
+      CUSPARSE.$aname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      posit = Ref{Cint}(1)
+      CUSPARSE.cusparseXcsrilu02_zeroPivot(CUSPARSE.handle(), info, posit)
+      (posit[] ≥ 0) && error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+      CUSPARSE.$sname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.rowPtr, P.colVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      return NVIDIA_ILU0(n, desc, buffer, info, 0.0, P)
+    end
+
+    function KP.update!(p::NVIDIA_ILU0{CuSparseMatrixCSR{$T,Cint}}, A::CuSparseMatrixCSR{$T,Cint})
+      copyto!(p.P.nzVal, A.nzVal)
+      CUSPARSE.$sname(CUSPARSE.handle(), p.n, nnz(p.P), p.desc, p.P.nzVal, p.P.rowPtr, p.P.colVal, p.info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, p.buffer)
+      return p
+    end
+
+    function KP.kp_ilu0(A::CuSparseMatrixCSC{$T,Cint})
+      P = copy(A)
+      n = checksquare(P)
+      desc = CUSPARSE.CuMatrixDescriptor('G', 'L', 'N', 'O')
+      info = ILU0Info()
+      buffer_size = Ref{Cint}()
+      CUSPARSE.$bname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, buffer_size)
+      buffer = CuVector{UInt8}(undef, buffer_size[])
+      CUSPARSE.$aname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      posit = Ref{Cint}(1)
+      CUSPARSE.cusparseXcsrilu02_zeroPivot(CUSPARSE.handle(), info, posit)
+      (posit[] ≥ 0) && error("Structural/numerical zero in A at ($(posit[]),$(posit[])))")
+      CUSPARSE.$sname(CUSPARSE.handle(), n, nnz(P), desc, P.nzVal, P.colPtr, P.rowVal, info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer)
+      return NVIDIA_ILU0(n, desc, buffer, info, 0.0, P)
+    end
+
+    function KP.update!(p::NVIDIA_ILU0{CuSparseMatrixCSC{$T,Cint}}, A::CuSparseMatrixCSC{$T,Cint})
+      copyto!(p.P.nzVal, A.nzVal)
+      CUSPARSE.$sname(CUSPARSE.handle(), p.n, nnz(p.P), p.desc, p.P.nzVal, p.P.colPtr, p.P.rowVal, p.info, CUSPARSE.CUSPARSE_SOLVE_POLICY_USE_LEVEL, p.buffer)
+      return p
     end
   end
 end
@@ -45,7 +103,3 @@ for ArrayType in (:(CuVector{T}), :(CuMatrix{T}))
     end
   end
 end
-
-function KP.update!(p::NVIDIA_ILU0, A)
-    p.P = CUSPARSE.ilu02(A)
-end
diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl
@@ -5,7 +5,7 @@ using AMDGPU
 using AMDGPU.rocSPARSE
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
 import LinearAlgebra: ldiv!, mul!
-import Base: size, eltype
+import Base: size, eltype, unsafe_convert
 
 using KrylovPreconditioners
 const KP = KrylovPreconditioners

diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl
@@ -5,7 +5,7 @@ using CUDA
 using CUDA.CUSPARSE
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
 import LinearAlgebra: ldiv!, mul!
-import Base: size, eltype
+import Base: size, eltype, unsafe_convert
 
 using KrylovPreconditioners
 const KP = KrylovPreconditioners

diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl
@@ -11,7 +11,10 @@ import LinearAlgebra: ldiv!
 
 abstract type AbstractKrylovPreconditioner end
 export AbstractKrylovPreconditioner
-update!(p::AbstractKrylovPreconditioner, A) = error("update!() for $(typeof(p)) not implemented")
+
+update!(p::AbstractKrylovPreconditioner, A::SparseMatrixCSC) = error("update!() for $(typeof(p)) is not implemented.")
+update!(p::AbstractKrylovPreconditioner, A) = error("update!() for $(typeof(p)) is not implemented.")
+
 export update!, get_timer, reset_timer!
 
 function get_timer(p::AbstractKrylovPreconditioner)

diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl
@@ -14,7 +14,6 @@ function test_ic0(FC, V, M)
   A_gpu = M(A_cpu)
   b_gpu = V(b_cpu)
   P = kp_ic0(A_gpu)
-  update!(P, A_gpu)
 
   x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true)
   r_gpu = b_gpu - A_gpu * x_gpu
@@ -24,6 +23,17 @@ function test_ic0(FC, V, M)
   else
     @test norm(r_gpu) ≤ 1e-8
   end
+
+  A_gpu = M(A_cpu + 200*I)
+  update!(P, A_gpu)
+  x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true)
+  r_gpu = b_gpu - A_gpu * x_gpu
+  @test stats.niter ≤ 5
+  if (FC <: ComplexF64) && V.body.name.name == :ROCArray
+    @test_broken norm(r_gpu) ≤ 1e-6
+  else
+    @test norm(r_gpu) ≤ 1e-8
+  end
 end
 
 function test_ilu0(FC, V, M)
@@ -36,12 +46,18 @@ function test_ilu0(FC, V, M)
   A_gpu = M(A_cpu)
   b_gpu = V(b_cpu)
   P = kp_ilu0(A_gpu)
-  update!(P, A_gpu)
 
   x_gpu, stats = gmres(A_gpu, b_gpu, N=P, ldiv=true)
   r_gpu = b_gpu - A_gpu * x_gpu
   @test stats.niter ≤ 5
   @test norm(r_gpu) ≤ 1e-8
+
+  A_gpu = M(A_cpu + 200*I)
+  update!(P, A_gpu)
+  x_gpu, stats = gmres(A_gpu, b_gpu, N=P, ldiv=true)
+  r_gpu = b_gpu - A_gpu * x_gpu
+  @test stats.niter ≤ 5
+  @test norm(r_gpu) ≤ 1e-8
 end
 
 function test_operator(FC, V, DM, SM)