From 3d467785e7aeb98ae834fa20685f29ef1541cf8b Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 22:13:59 -0600 Subject: [PATCH 01/27] Add a KrylovOperator for sparse matrices on NVIDIA GPUs --- ext/CUDA/operators.jl | 58 +++++++++++++++++++++++++++++ ext/KrylovPreconditionersCUDAExt.jl | 4 +- src/KrylovPreconditioners.jl | 3 ++ test/gpu/gpu.jl | 17 +++++++++ test/gpu/nvidia.jl | 12 ++++++ 5 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 ext/CUDA/operators.jl diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl new file mode 100644 index 0000000..b56ab53 --- /dev/null +++ b/ext/CUDA/operators.jl @@ -0,0 +1,58 @@ +mutable struct CUDA_KrylovOperator{T} + m::Int + n::Int + nrhs::Int + type::T + transa::Char + descA::CUSPARSE.CuSparseMatrixDescriptor + buffer::CuVector{UInt8} +end + +eltype(A::CUDA_KrylovOperator{T}) where T = T +size(A::CUDA_KrylovOperator) = (m, n) + +for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat), + (:(CuSparseMatrixCSC{T,Cint}), :BlasFloat) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseVectorDescriptor(T, n) + descY = CUSPARSE.CuDenseVectorDescriptor(T, m) + algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + CUSPARSE.cusparseSpMV_bufferSize(CUDA.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size) + return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + else + alpha = Ref{T}(one(T)) + beta = Ref{T}(zeto(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs) + descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs) + algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + transb = 'N' + CUSPARSE.cusparseSpMM_bufferSize(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size) + CUSPARSE.cusparseSpMM_preprocess(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + end + end + + function LinearAlgebra.mul!(y::CuVector{T}, A::KrylovOperator{T}, x::CuVector{T}) where T<: BlasFloat + descY = CUSPARSE.CuDenseVectorDescriptor(y) + descX = CUSPARSE.CuDenseVectorDescriptor(x) + CUSPARSE.cusparseSpMV(CUDA.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + end + + function LinearAlgebra.mul!(Y::CuMatrix{T}, A::KrylovOperator{T}, X::CuMatrix{T}) where T<: BlasFloat + descY = CUSPARSE.CuDenseVectorDescriptor(y) + descX = CUSPARSE.CuDenseVectorDescriptor(x) + CUSPARSE.cusparseSpMM(CUDA.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + end + end +end diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl index c0e0f83..f6b941d 100644 --- a/ext/KrylovPreconditionersCUDAExt.jl +++ b/ext/KrylovPreconditionersCUDAExt.jl @@ -3,8 +3,7 @@ using LinearAlgebra using CUDA using CUDA.CUSPARSE using LinearAlgebra: checksquare, BlasReal, BlasFloat -import LinearAlgebra: ldiv! -using SparseArrays +import LinearAlgebra: ldiv!, mul! using KrylovPreconditioners const KP = KrylovPreconditioners @@ -14,5 +13,6 @@ const KA = KernelAbstractions include("CUDA/ic0.jl") include("CUDA/ilu0.jl") include("CUDA/blockjacobi.jl") +include("CUDA/operators.jl") end diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl index 8c43404..1eb3da3 100644 --- a/src/KrylovPreconditioners.jl +++ b/src/KrylovPreconditioners.jl @@ -12,6 +12,9 @@ import LinearAlgebra: ldiv! abstract type AbstractKrylovPreconditioner end export AbstractKrylovPreconditioner +function KrylovOperator end +export KrylovOperator + # Preconditioners include("ic0.jl") include("ilu0.jl") diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 107ad94..9dbd1a8 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -42,6 +42,23 @@ function test_ilu0(FC, V, M) @test norm(r_gpu) ≤ 1e-8 end +function test_operator(FC, V, M) + n = 100 + R = real(FC) + A_cpu = rand(FC, m, n) + A_cpu = sparse(A_cpu) + b_cpu = rand(FC, m) + + A_gpu = M(A_cpu) + b_gpu = V(b_cpu) + opA_gpu = KrylovOperator(A_gpu) + + x_gpu, stats = lsmr(opA_gpu, b_gpu) + r_gpu = b_gpu - A_gpu * x_gpu + @test stats.solved + @test norm(A' * r_gpu) ≤ 1e-8 +end + _get_type(J::SparseMatrixCSC) = Vector{Float64} function generate_random_system(n::Int, m::Int) diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index cb155da..dbf212f 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -26,6 +26,18 @@ include("gpu.jl") end end + @testset "KrylovOperator" begin + @testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuSparseMatrixCOO{FC}) + end + @testset "CuSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuSparseMatrixCSC{FC}) + end + @testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuSparseMatrixCSR{FC}) + end + end + @testset "Block Jacobi preconditioner" begin if CUDA.functional() test_block_jacobi(CUDABackend(), CuArray, CuSparseMatrixCSR) From 44112fd70b8c5e3a3f0df6f8feff98d7074bda2a Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 22:23:16 -0600 Subject: [PATCH 02/27] Fix a typo --- ext/CUDA/operators.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index b56ab53..b8ea8d2 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -12,7 +12,7 @@ eltype(A::CUDA_KrylovOperator{T}) where T = T size(A::CUDA_KrylovOperator) = (m, n) for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat), - (:(CuSparseMatrixCSC{T,Cint}), :BlasFloat) + (:(CuSparseMatrixCSC{T,Cint}), :BlasFloat)) @eval begin function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType m,n = size(A) From 3f6d0907938c4671a7c8289326589ba578dc402c Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 22:43:24 -0600 Subject: [PATCH 03/27] Fix a typo --- ext/CUDA/operators.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index b8ea8d2..ada9b1f 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -43,13 +43,13 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat), end end - function LinearAlgebra.mul!(y::CuVector{T}, A::KrylovOperator{T}, x::CuVector{T}) where T<: BlasFloat + function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVector{T}) where T<: BlasFloat descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) CUSPARSE.cusparseSpMV(CUDA.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) end - function LinearAlgebra.mul!(Y::CuMatrix{T}, A::KrylovOperator{T}, X::CuMatrix{T}) where T<: BlasFloat + function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T<: BlasFloat descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) CUSPARSE.cusparseSpMM(CUDA.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) From 213eb68bee9489eb5099a8c37912045dc99f6edb Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 22:49:56 -0600 Subject: [PATCH 04/27] Fix a typo --- ext/CUDA/operators.jl | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index ada9b1f..de3ef23 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -42,17 +42,24 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat), return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) end end +end - function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVector{T}) where T<: BlasFloat - descY = CUSPARSE.CuDenseVectorDescriptor(y) - descX = CUSPARSE.CuDenseVectorDescriptor(x) - CUSPARSE.cusparseSpMV(CUDA.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) - end +function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("")) + (length(x) != A.n) && throw(DimensionMismatch("")) + (A.nrhs != 1) && throw(DimensionMismatch("")) + descY = CUSPARSE.CuDenseVectorDescriptor(y) + descX = CUSPARSE.CuDenseVectorDescriptor(x) + CUSPARSE.cusparseSpMV(CUDA.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) +end - function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T<: BlasFloat - descY = CUSPARSE.CuDenseVectorDescriptor(y) - descX = CUSPARSE.CuDenseVectorDescriptor(x) - CUSPARSE.cusparseSpMM(CUDA.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) - end - end +function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("")) + (mX != A.n) && throw(DimensionMismatch("")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("")) + descY = CUSPARSE.CuDenseVectorDescriptor(Y) + descX = CUSPARSE.CuDenseVectorDescriptor(X) + CUSPARSE.cusparseSpMM(CUDA.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) end From c8b74e4f4846ee8f5920542a77edafe1976e5171 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 23:08:18 -0600 Subject: [PATCH 05/27] More fixes --- ext/CUDA/operators.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index de3ef23..f9e486c 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -9,10 +9,10 @@ mutable struct CUDA_KrylovOperator{T} end eltype(A::CUDA_KrylovOperator{T}) where T = T -size(A::CUDA_KrylovOperator) = (m, n) +size(A::CUDA_KrylovOperator) = (A.m, A.n) -for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T,Cint}), :BlasFloat), - (:(CuSparseMatrixCSC{T,Cint}), :BlasFloat)) +for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), + (:(CuSparseMatrixCSC{T}), :BlasFloat)) @eval begin function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType m,n = size(A) From 38e66581ab42f92431dcf42b0b6dfc527d3db949 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 23:16:19 -0600 Subject: [PATCH 06/27] More fixes --- ext/CUDA/operators.jl | 55 ++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index f9e486c..646d907 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -13,33 +13,34 @@ size(A::CUDA_KrylovOperator) = (A.m, A.n) for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), (:(CuSparseMatrixCSC{T}), :BlasFloat)) - @eval begin - function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType - m,n = size(A) - if nrhs == 1 - alpha = Ref{T}(one(T)) - beta = Ref{T}(zero(T)) - descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') - descX = CUSPARSE.CuDenseVectorDescriptor(T, n) - descY = CUSPARSE.CuDenseVectorDescriptor(T, m) - algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT - buffer_size = Ref{Csize_t}() - CUSPARSE.cusparseSpMV_bufferSize(CUDA.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) - buffer = CuVector{UInt8}(undef, buffer_size) - return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) - else - alpha = Ref{T}(one(T)) - beta = Ref{T}(zeto(T)) - descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') - descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs) - descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs) - algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT - buffer_size = Ref{Csize_t}() - transb = 'N' - CUSPARSE.cusparseSpMM_bufferSize(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) - buffer = CuVector{UInt8}(undef, buffer_size) - CUSPARSE.cusparseSpMM_preprocess(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) - return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseVectorDescriptor(T, n) + descY = CUSPARSE.CuDenseVectorDescriptor(T, m) + algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + CUSPARSE.cusparseSpMV_bufferSize(CUDA.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size) + return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + else + alpha = Ref{T}(one(T)) + beta = Ref{T}(zeto(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs) + descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs) + algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + transb = 'N' + CUSPARSE.cusparseSpMM_bufferSize(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size) + CUSPARSE.cusparseSpMM_preprocess(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + end end end end From 23acdcb6a3dc9a6e0217cb792106f68922812596 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 23:36:42 -0600 Subject: [PATCH 07/27] More fixes --- ext/CUDA/operators.jl | 4 ++-- test/gpu/gpu.jl | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 646d907..af3824a 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -26,7 +26,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), buffer_size = Ref{Csize_t}() CUSPARSE.cusparseSpMV_bufferSize(CUDA.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size) - return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) else alpha = Ref{T}(one(T)) beta = Ref{T}(zeto(T)) @@ -39,7 +39,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), CUSPARSE.cusparseSpMM_bufferSize(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size) CUSPARSE.cusparseSpMM_preprocess(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) - return CUDA_KrylovOperator{T}(m, n, rhs, transa, descA, buffer) + return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) end end end diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 9dbd1a8..54d128f 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -43,7 +43,8 @@ function test_ilu0(FC, V, M) end function test_operator(FC, V, M) - n = 100 + m = 100 + n = 50 R = real(FC) A_cpu = rand(FC, m, n) A_cpu = sparse(A_cpu) From dee3385b2d9e8b44400895538bec673f4328c057 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 27 Nov 2023 23:57:42 -0600 Subject: [PATCH 08/27] More fixes --- ext/CUDA/operators.jl | 3 ++- ext/KrylovPreconditionersCUDAExt.jl | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index af3824a..7fb8258 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -12,7 +12,8 @@ eltype(A::CUDA_KrylovOperator{T}) where T = T size(A::CUDA_KrylovOperator) = (A.m, A.n) for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), - (:(CuSparseMatrixCSC{T}), :BlasFloat)) + (:(CuSparseMatrixCSC{T}), :BlasFloat), + (:(CuSparseMatrixCOO{T}), :BlasFloat))) @eval begin function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType m,n = size(A) diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl index f6b941d..a74f941 100644 --- a/ext/KrylovPreconditionersCUDAExt.jl +++ b/ext/KrylovPreconditionersCUDAExt.jl @@ -4,6 +4,7 @@ using CUDA using CUDA.CUSPARSE using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! +import Base: size, eltype using KrylovPreconditioners const KP = KrylovPreconditioners From ab32b18172fe06c9dad4cd9b2cec9d17de8fe7c0 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:01:05 -0600 Subject: [PATCH 09/27] More fixes --- ext/CUDA/operators.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 7fb8258..f5fee4a 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -13,7 +13,7 @@ size(A::CUDA_KrylovOperator) = (A.m, A.n) for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), (:(CuSparseMatrixCSC{T}), :BlasFloat), - (:(CuSparseMatrixCOO{T}), :BlasFloat))) + (:(CuSparseMatrixCOO{T}), :BlasFloat)) @eval begin function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType m,n = size(A) From 42e42ba7c76cf4fb0201fbaf8b077d71adaa3fda Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:06:15 -0600 Subject: [PATCH 10/27] More fixes --- ext/CUDA/operators.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index f5fee4a..890cad2 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -25,7 +25,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), descY = CUSPARSE.CuDenseVectorDescriptor(T, m) algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT buffer_size = Ref{Csize_t}() - CUSPARSE.cusparseSpMV_bufferSize(CUDA.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) + CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size) return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) else @@ -37,9 +37,9 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT buffer_size = Ref{Csize_t}() transb = 'N' - CUSPARSE.cusparseSpMM_bufferSize(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) + CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size) - CUSPARSE.cusparseSpMM_preprocess(CUDA.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) end end @@ -52,7 +52,7 @@ function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVect (A.nrhs != 1) && throw(DimensionMismatch("")) descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) - CUSPARSE.cusparseSpMV(CUDA.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) end function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat @@ -63,5 +63,5 @@ function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatr (nY == nX == A.nrhs) || throw(DimensionMismatch("")) descY = CUSPARSE.CuDenseVectorDescriptor(Y) descX = CUSPARSE.CuDenseVectorDescriptor(X) - CUSPARSE.cusparseSpMM(CUDA.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) end From 4d77d658e6648d85f672745d09014402e238c078 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:09:45 -0600 Subject: [PATCH 11/27] More fixes --- ext/CUDA/operators.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 890cad2..296c830 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -26,7 +26,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT buffer_size = Ref{Csize_t}() CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) - buffer = CuVector{UInt8}(undef, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size[]) return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) else alpha = Ref{T}(one(T)) @@ -38,7 +38,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), buffer_size = Ref{Csize_t}() transb = 'N' CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) - buffer = CuVector{UInt8}(undef, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size[]) CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) end @@ -61,7 +61,7 @@ function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatr (mY != A.m) && throw(DimensionMismatch("")) (mX != A.n) && throw(DimensionMismatch("")) (nY == nX == A.nrhs) || throw(DimensionMismatch("")) - descY = CUSPARSE.CuDenseVectorDescriptor(Y) - descX = CUSPARSE.CuDenseVectorDescriptor(X) + descY = CUSPARSE.CuDenseMatrixDescriptor(Y) + descX = CUSPARSE.CuDenseMatrixDescriptor(X) CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) end From 3575997ed3685543044fe30f2af6c23317cfa0e3 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:14:26 -0600 Subject: [PATCH 12/27] More fixes --- ext/CUDA/operators.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 296c830..fda7042 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -1,8 +1,8 @@ mutable struct CUDA_KrylovOperator{T} + type::Type{T} m::Int n::Int nrhs::Int - type::T transa::Char descA::CUSPARSE.CuSparseMatrixDescriptor buffer::CuVector{UInt8} @@ -27,7 +27,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), buffer_size = Ref{Csize_t}() CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size[]) - return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) + return CUDA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) else alpha = Ref{T}(one(T)) beta = Ref{T}(zeto(T)) @@ -40,7 +40,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size[]) CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) - return CUDA_KrylovOperator{T}(m, n, nrhs, transa, descA, buffer) + return CUDA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) end end end From 44514a3c76dd5e3cde1cfc6341dc324efdbb75b1 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:18:30 -0600 Subject: [PATCH 13/27] More fixes --- test/gpu/gpu.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 54d128f..65453d5 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -43,21 +43,20 @@ function test_ilu0(FC, V, M) end function test_operator(FC, V, M) - m = 100 - n = 50 + n = 100 R = real(FC) - A_cpu = rand(FC, m, n) + A_cpu = rand(FC, n, n) A_cpu = sparse(A_cpu) - b_cpu = rand(FC, m) + b_cpu = rand(FC, n) A_gpu = M(A_cpu) b_gpu = V(b_cpu) opA_gpu = KrylovOperator(A_gpu) - x_gpu, stats = lsmr(opA_gpu, b_gpu) + x_gpu, stats = gmres(opA_gpu, b_gpu) r_gpu = b_gpu - A_gpu * x_gpu @test stats.solved - @test norm(A' * r_gpu) ≤ 1e-8 + @test norm(r_gpu) ≤ 1e-8 end _get_type(J::SparseMatrixCSC) = Vector{Float64} From faf75e9fcd343c3c0faaa08dc58704ebfee7b0ec Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:23:01 -0600 Subject: [PATCH 14/27] More fixes --- ext/CUDA/operators.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index fda7042..59bac7b 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -52,7 +52,8 @@ function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVect (A.nrhs != 1) && throw(DimensionMismatch("")) descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) - CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT + CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, algo, A.buffer) end function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat @@ -63,5 +64,6 @@ function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatr (nY == nX == A.nrhs) || throw(DimensionMismatch("")) descY = CUSPARSE.CuDenseMatrixDescriptor(Y) descX = CUSPARSE.CuDenseMatrixDescriptor(X) - CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, A.algo, A.buffer) + algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT + CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, algo, A.buffer) end From c4ab48b5acb79f742acb4fdffa5ae4a360d9cb17 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 00:27:48 -0600 Subject: [PATCH 15/27] More fixes --- ext/CUDA/operators.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 59bac7b..f4b65da 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -53,7 +53,7 @@ function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVect descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT - CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, one(T), A.descA, descX, zero(T), descY, T, algo, A.buffer) + CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer) end function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat @@ -65,5 +65,5 @@ function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatr descY = CUSPARSE.CuDenseMatrixDescriptor(Y) descX = CUSPARSE.CuDenseMatrixDescriptor(X) algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT - CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', one(T), A.descA, descX, zero(T), descY, T, algo, A.buffer) + CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer) end From a79a64839989178298051f157fb3a06d9e420994 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 01:14:30 -0600 Subject: [PATCH 16/27] [AMDGPU.jl] Add a KrylovOperator for sparse matrices --- ext/AMDGPU/operators.jl | 74 +++++++++++++++++++++++++++ ext/CUDA/operators.jl | 14 ++--- ext/KrylovPreconditionersAMDGPUExt.jl | 5 +- test/gpu/amd.jl | 12 +++++ 4 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 ext/AMDGPU/operators.jl diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl new file mode 100644 index 0000000..d6fdf12 --- /dev/null +++ b/ext/AMDGPU/operators.jl @@ -0,0 +1,74 @@ +mutable struct AMD_KrylovOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + transa::Char + descA::rocSPARSE.ROCSparseMatrixDescriptor + buffer_size::Ref{Csize_t} + buffer::ROCVector{UInt8} +end + +eltype(A::AMD_KrylovOperator{T}) where T = T +size(A::AMD_KrylovOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat), + (:(ROCSparseMatrixCSC{T}), :BlasFloat), + (:(ROCSparseMatrixCOO{T}), :BlasFloat)) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O') + descX = rocSPARSE.ROCDenseVectorDescriptor(T, n) + descY = rocSPARSE.ROCDenseVectorDescriptor(T, m) + algo = rocSPARSE.rocSPARSE.rocsparse_spmv_alg_default + buffer_size = Ref{Csize_t}() + rocSPARSE.rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size, C_NULL) + buffer = ROCVector{UInt8}(undef, buffer_size[]) + return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer) + else + alpha = Ref{T}(one(T)) + beta = Ref{T}(zeto(T)) + descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O') + descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs) + descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs) + algo = rocSPARSE.rocsparse_spmm_alg_default + buffer_size = Ref{Csize_t}() + transb = 'N' + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, + algo, rocSPARSE.rocsparse_spmm_stage_buffer_size, buffer_size, C_NULL) + buffer = ROCVector{UInt8}(undef, buffer_size[]) + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, + algo, rocSPARSE.rocsparse_spmm_stage_preprocess, buffer_size, buffer) + return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer) + end + end + end +end + +function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("")) + (length(x) != A.n) && throw(DimensionMismatch("")) + (A.nrhs != 1) && throw(DimensionMismatch("")) + descY = rocSPARSE.ROCDenseVectorDescriptor(y) + descX = rocSPARSE.ROCDenseVectorDescriptor(x) + algo = rocSPARSE.rocsparse_spmv_alg_default + rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, + Ref{T}(zero(T)), descY, T, algo, A.buffer_size, A.buffer) +end + +function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("")) + (mX != A.n) && throw(DimensionMismatch("")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("")) + descY = rocSPARSE.ROCDenseMatrixDescriptor(Y) + descX = rocSPARSE.ROCDenseMatrixDescriptor(X) + algo = rocSPARSE.rocsparse_spmm_alg_default + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, + Ref{T}(zero(T)), descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute, A.buffer_size, A.buffer) +end diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index f4b65da..81d3e35 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -1,4 +1,4 @@ -mutable struct CUDA_KrylovOperator{T} +mutable struct NVIDIA_KrylovOperator{T} type::Type{T} m::Int n::Int @@ -8,8 +8,8 @@ mutable struct CUDA_KrylovOperator{T} buffer::CuVector{UInt8} end -eltype(A::CUDA_KrylovOperator{T}) where T = T -size(A::CUDA_KrylovOperator) = (A.m, A.n) +eltype(A::NVIDIA_KrylovOperator{T}) where T = T +size(A::NVIDIA_KrylovOperator) = (A.m, A.n) for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), (:(CuSparseMatrixCSC{T}), :BlasFloat), @@ -27,7 +27,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), buffer_size = Ref{Csize_t}() CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size[]) - return CUDA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) + return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) else alpha = Ref{T}(one(T)) beta = Ref{T}(zeto(T)) @@ -40,13 +40,13 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size[]) CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) - return CUDA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) + return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) end end end end -function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat +function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat (length(y) != A.m) && throw(DimensionMismatch("")) (length(x) != A.n) && throw(DimensionMismatch("")) (A.nrhs != 1) && throw(DimensionMismatch("")) @@ -56,7 +56,7 @@ function LinearAlgebra.mul!(y::CuVector{T}, A::CUDA_KrylovOperator{T}, x::CuVect CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer) end -function LinearAlgebra.mul!(Y::CuMatrix{T}, A::CUDA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat +function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat mY, nY = size(Y) mX, nX = size(X) (mY != A.m) && throw(DimensionMismatch("")) diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl index b75c28c..54e8e6e 100644 --- a/ext/KrylovPreconditionersAMDGPUExt.jl +++ b/ext/KrylovPreconditionersAMDGPUExt.jl @@ -3,8 +3,8 @@ using LinearAlgebra using AMDGPU using AMDGPU.rocSPARSE using LinearAlgebra: checksquare, BlasReal, BlasFloat -import LinearAlgebra: ldiv! -using SparseArrays +import LinearAlgebra: ldiv!, mul! +import Base: size, eltype using KrylovPreconditioners const KP = KrylovPreconditioners @@ -14,5 +14,6 @@ const KA = KernelAbstractions include("AMDGPU/ic0.jl") include("AMDGPU/ilu0.jl") include("AMDGPU/blockjacobi.jl") +# include("AMDGPU/operators.jl") end diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index 366e2cd..0948908 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -26,6 +26,18 @@ include("gpu.jl") end end + # @testset "KrylovOperator" begin + # @testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) + # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCOO{FC}) + # end + # @testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) + # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCSC{FC}) + # end + # @testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCSR{FC}) + # end + # end + @testset "Block Jacobi preconditioner" begin if AMDGPU.functional() test_block_jacobi(ROCBackend(), ROCArray, ROCSparseMatrixCSR) From c86714986919a66d3901f4377420f90b4ff30c09 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 11:57:37 -0600 Subject: [PATCH 17/27] Add more tests with KrylovOperators --- test/gpu/gpu.jl | 27 ++++++++++++++++++++++++--- test/gpu/nvidia.jl | 6 +++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 65453d5..4f345d4 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -42,14 +42,14 @@ function test_ilu0(FC, V, M) @test norm(r_gpu) ≤ 1e-8 end -function test_operator(FC, V, M) +function test_operator(FC, V, DM, SM) + m = 200 n = 100 - R = real(FC) A_cpu = rand(FC, n, n) A_cpu = sparse(A_cpu) b_cpu = rand(FC, n) - A_gpu = M(A_cpu) + A_gpu = SM(A_cpu) b_gpu = V(b_cpu) opA_gpu = KrylovOperator(A_gpu) @@ -57,6 +57,27 @@ function test_operator(FC, V, M) r_gpu = b_gpu - A_gpu * x_gpu @test stats.solved @test norm(r_gpu) ≤ 1e-8 + + A_cpu = rand(FC, m, n) + A_cpu = sparse(A_cpu) + opA_gpu = KrylovOperator(A_gpu) + for i = 1:5 + y_cpu = rand(FC, m) + y_gpu = V(y_cpu) + x_cpu = rand(FC, n) + x_gpu = V(x_cpu) + mul!(y_gpu, opA_gpu, x_gpu) + end + + nrhs = 3 + opA_gpu = KrylovOperator(A_gpu; nrhs) + for i = 1:5 + Y_cpu = rand(FC, m, nrhs) + Y_gpu = DM(Y_cpu) + X_cpu = rand(FC, n, nrhs) + X_gpu = DM(X_cpu) + mul!(Y_gpu, opA_gpu, X_gpu) + end end _get_type(J::SparseMatrixCSC) = Vector{Float64} diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index dbf212f..655e1df 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -28,13 +28,13 @@ include("gpu.jl") @testset "KrylovOperator" begin @testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) - test_operator(FC, CuVector{FC}, CuSparseMatrixCOO{FC}) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCOO{FC}) end @testset "CuSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) - test_operator(FC, CuVector{FC}, CuSparseMatrixCSC{FC}) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSC{FC}) end @testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) - test_operator(FC, CuVector{FC}, CuSparseMatrixCSR{FC}) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSR{FC}) end end From 68403ce1a959126e50562554ab3275d067d4bd3c Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 13:16:06 -0600 Subject: [PATCH 18/27] debug --- ext/CUDA/operators.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 81d3e35..13ac454 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -47,9 +47,9 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), end function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat - (length(y) != A.m) && throw(DimensionMismatch("")) - (length(x) != A.n) && throw(DimensionMismatch("")) - (A.nrhs != 1) && throw(DimensionMismatch("")) + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) descY = CUSPARSE.CuDenseVectorDescriptor(y) descX = CUSPARSE.CuDenseVectorDescriptor(x) algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT @@ -59,9 +59,9 @@ end function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat mY, nY = size(Y) mX, nX = size(X) - (mY != A.m) && throw(DimensionMismatch("")) - (mX != A.n) && throw(DimensionMismatch("")) - (nY == nX == A.nrhs) || throw(DimensionMismatch("")) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) descY = CUSPARSE.CuDenseMatrixDescriptor(Y) descX = CUSPARSE.CuDenseMatrixDescriptor(X) algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT From 0bdfa60eeaeede1edff92383dab40c268297f588 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 13:32:11 -0600 Subject: [PATCH 19/27] debug --- ext/KrylovPreconditionersAMDGPUExt.jl | 2 +- test/gpu/amd.jl | 22 +++++++++++----------- test/gpu/gpu.jl | 8 ++++++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl index 54e8e6e..0355a2a 100644 --- a/ext/KrylovPreconditionersAMDGPUExt.jl +++ b/ext/KrylovPreconditionersAMDGPUExt.jl @@ -14,6 +14,6 @@ const KA = KernelAbstractions include("AMDGPU/ic0.jl") include("AMDGPU/ilu0.jl") include("AMDGPU/blockjacobi.jl") -# include("AMDGPU/operators.jl") +include("AMDGPU/operators.jl") end diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index 0948908..fbd7a07 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -26,17 +26,17 @@ include("gpu.jl") end end - # @testset "KrylovOperator" begin - # @testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) - # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCOO{FC}) - # end - # @testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) - # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCSC{FC}) - # end - # @testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) - # test_operator(FC, ROCVector{FC}, ROCSparseMatrixCSR{FC}) - # end - # end + @testset "KrylovOperator" begin + @testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCOO{FC}) + end + @testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSC{FC}) + end + @testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSR{FC}) + end + end @testset "Block Jacobi preconditioner" begin if AMDGPU.functional() diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 4f345d4..3abdc2c 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -63,20 +63,24 @@ function test_operator(FC, V, DM, SM) opA_gpu = KrylovOperator(A_gpu) for i = 1:5 y_cpu = rand(FC, m) - y_gpu = V(y_cpu) x_cpu = rand(FC, n) + mul!(y_cpu, A_cpu, x_cpu) + y_gpu = V(y_cpu) x_gpu = V(x_cpu) mul!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu end nrhs = 3 opA_gpu = KrylovOperator(A_gpu; nrhs) for i = 1:5 Y_cpu = rand(FC, m, nrhs) - Y_gpu = DM(Y_cpu) X_cpu = rand(FC, n, nrhs) + mul!(Y_cpu, A_cpu, X_cpu) + Y_gpu = DM(Y_cpu) X_gpu = DM(X_cpu) mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu end end From 5b91db338ea606fe88da9ef34976711b1f1b3bbb Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 13:46:46 -0600 Subject: [PATCH 20/27] debug --- .buildkite/pipeline.yml | 3 ++- ext/AMDGPU/operators.jl | 12 ++++++------ test/gpu/gpu.jl | 2 ++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 56d7207..2942169 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -33,7 +33,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - Pkg.add("AMDGPU") + # Pkg.add("AMDGPU") + Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/amd.jl")' diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl index d6fdf12..407c284 100644 --- a/ext/AMDGPU/operators.jl +++ b/ext/AMDGPU/operators.jl @@ -50,9 +50,9 @@ for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat), end function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVector{T}) where T <: BlasFloat - (length(y) != A.m) && throw(DimensionMismatch("")) - (length(x) != A.n) && throw(DimensionMismatch("")) - (A.nrhs != 1) && throw(DimensionMismatch("")) + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) descY = rocSPARSE.ROCDenseVectorDescriptor(y) descX = rocSPARSE.ROCDenseVectorDescriptor(x) algo = rocSPARSE.rocsparse_spmv_alg_default @@ -63,9 +63,9 @@ end function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat mY, nY = size(Y) mX, nX = size(X) - (mY != A.m) && throw(DimensionMismatch("")) - (mX != A.n) && throw(DimensionMismatch("")) - (nY == nX == A.nrhs) || throw(DimensionMismatch("")) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) descY = rocSPARSE.ROCDenseMatrixDescriptor(Y) descX = rocSPARSE.ROCDenseMatrixDescriptor(X) algo = rocSPARSE.rocsparse_spmm_alg_default diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 3abdc2c..23ea318 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -60,6 +60,8 @@ function test_operator(FC, V, DM, SM) A_cpu = rand(FC, m, n) A_cpu = sparse(A_cpu) + A_gpu = SM(A_cpu) + opA_gpu = KrylovOperator(A_gpu) for i = 1:5 y_cpu = rand(FC, m) From 76bd1bdbf393551f09869ad7bd30ccbbcdfa3f50 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 13:58:50 -0600 Subject: [PATCH 21/27] debug --- ext/AMDGPU/operators.jl | 2 +- ext/CUDA/operators.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl index 407c284..f77504d 100644 --- a/ext/AMDGPU/operators.jl +++ b/ext/AMDGPU/operators.jl @@ -31,7 +31,7 @@ for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat), return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer) else alpha = Ref{T}(one(T)) - beta = Ref{T}(zeto(T)) + beta = Ref{T}(zero(T)) descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O') descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs) descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 13ac454..4671c21 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -30,7 +30,7 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) else alpha = Ref{T}(one(T)) - beta = Ref{T}(zeto(T)) + beta = Ref{T}(zero(T)) descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs) descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs) From 7db244f99940f7d1e41518e2efa429a8071545b0 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 14:30:18 -0600 Subject: [PATCH 22/27] debug --- ext/CUDA/operators.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 4671c21..9733020 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -39,7 +39,9 @@ for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), transb = 'N' CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) buffer = CuVector{UInt8}(undef, buffer_size[]) - CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + if !(A isa CuSparseMatrixCOO) + CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + end return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) end end From ac5623984cf0778c03f1f2a37636caad2a5ab1f0 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 14:39:18 -0600 Subject: [PATCH 23/27] debug --- ext/AMDGPU/operators.jl | 2 +- ext/CUDA/operators.jl | 2 +- src/KrylovPreconditioners.jl | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl index f77504d..7413d46 100644 --- a/ext/AMDGPU/operators.jl +++ b/ext/AMDGPU/operators.jl @@ -1,4 +1,4 @@ -mutable struct AMD_KrylovOperator{T} +mutable struct AMD_KrylovOperator{T} <: AbstractKrylovOperator{T} type::Type{T} m::Int n::Int diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl index 9733020..fb0ec88 100644 --- a/ext/CUDA/operators.jl +++ b/ext/CUDA/operators.jl @@ -1,4 +1,4 @@ -mutable struct NVIDIA_KrylovOperator{T} +mutable struct NVIDIA_KrylovOperator{T} <: AbstractKrylovOperator{T} type::Type{T} m::Int n::Int diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl index 1eb3da3..4b51c9d 100644 --- a/src/KrylovPreconditioners.jl +++ b/src/KrylovPreconditioners.jl @@ -12,6 +12,9 @@ import LinearAlgebra: ldiv! abstract type AbstractKrylovPreconditioner end export AbstractKrylovPreconditioner +abstract type AbstractKrylovOperator{T} end +export AbstractKrylovOperator end + function KrylovOperator end export KrylovOperator From fe6570744aab08a641f33b7d9180008ea1ead9b4 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Tue, 28 Nov 2023 14:40:21 -0600 Subject: [PATCH 24/27] debug --- src/KrylovPreconditioners.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl index 4b51c9d..97e0e22 100644 --- a/src/KrylovPreconditioners.jl +++ b/src/KrylovPreconditioners.jl @@ -13,7 +13,7 @@ abstract type AbstractKrylovPreconditioner end export AbstractKrylovPreconditioner abstract type AbstractKrylovOperator{T} end -export AbstractKrylovOperator end +export AbstractKrylovOperator function KrylovOperator end export KrylovOperator From 3805f13f5319d89e4a0d4ce01c93fe9c8460a177 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Wed, 29 Nov 2023 15:48:17 -0500 Subject: [PATCH 25/27] Fix broken IC0 test --- test/gpu/gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 23ea318..bff3d72 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -18,7 +18,7 @@ function test_ic0(FC, V, M) x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true) r_gpu = b_gpu - A_gpu * x_gpu @test stats.niter ≤ 5 - if (FC <: ComplexF64) && typeof(V).name.name == :ROCArray + if (FC <: ComplexF64) && V <: ROCArray{ComplexF64, 1} @test_broken norm(r_gpu) ≤ 1e-6 else @test norm(r_gpu) ≤ 1e-8 From 082d49b8799ae034d42274603ae925a79b7a902e Mon Sep 17 00:00:00 2001 From: Alexis Montoison <35051714+amontoison@users.noreply.github.com> Date: Wed, 29 Nov 2023 15:12:15 -0600 Subject: [PATCH 26/27] Update test/gpu/gpu.jl --- test/gpu/gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index bff3d72..f42fe4b 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -18,7 +18,7 @@ function test_ic0(FC, V, M) x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true) r_gpu = b_gpu - A_gpu * x_gpu @test stats.niter ≤ 5 - if (FC <: ComplexF64) && V <: ROCArray{ComplexF64, 1} + if (FC <: ComplexF64) && V.name.name == :ROCArray @test_broken norm(r_gpu) ≤ 1e-6 else @test norm(r_gpu) ≤ 1e-8 From 7377271d48c5f69a8d847f3e10da5074506adeeb Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Wed, 29 Nov 2023 23:12:56 -0600 Subject: [PATCH 27/27] Update gpu.jl --- test/gpu/gpu.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index f42fe4b..c852ad7 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -18,7 +18,7 @@ function test_ic0(FC, V, M) x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true) r_gpu = b_gpu - A_gpu * x_gpu @test stats.niter ≤ 5 - if (FC <: ComplexF64) && V.name.name == :ROCArray + if (FC <: ComplexF64) && V.body.name.name == :ROCArray @test_broken norm(r_gpu) ≤ 1e-6 else @test norm(r_gpu) ≤ 1e-8