Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a KrylovOperator for sparse matrices on GPUs #25

Merged
merged 27 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ steps:
julia --color=yes --project=test/gpu -e '
using Pkg
Pkg.develop(path=".")
Pkg.add("AMDGPU")
# Pkg.add("AMDGPU")
Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
Pkg.add("Krylov")
Pkg.instantiate()
include("test/gpu/amd.jl")'
Expand Down
74 changes: 74 additions & 0 deletions ext/AMDGPU/operators.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Wrapper around a sparse matrix stored on an AMD GPU, pre-configured for
# repeated sparse products through rocSPARSE: matrix-vector when nrhs == 1,
# matrix-matrix when nrhs > 1. The rocSPARSE work buffer is allocated once at
# construction and reused by every subsequent `mul!` call.
mutable struct AMD_KrylovOperator{T} <: AbstractKrylovOperator{T}
type::Type{T}  # element type of the operator (duplicates the type parameter T)
m::Int  # number of rows of A
n::Int  # number of columns of A
nrhs::Int  # number of right-hand sides the buffer was sized for
transa::Char  # operation applied to A ('N', 'T' or 'C')
descA::rocSPARSE.ROCSparseMatrixDescriptor  # rocSPARSE descriptor of A
buffer_size::Ref{Csize_t}  # size in bytes of the work buffer below
buffer::ROCVector{UInt8}  # device work buffer reused across products
end

# Element type of the wrapped operator.
eltype(::AMD_KrylovOperator{T}) where {T} = T

# Dimensions (m, n) of the wrapped sparse matrix.
function size(op::AMD_KrylovOperator)
    return (op.m, op.n)
end

for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat),
                                     (:(ROCSparseMatrixCSC{T}), :BlasFloat),
                                     (:(ROCSparseMatrixCOO{T}), :BlasFloat))
    @eval begin
        # Build an AMD_KrylovOperator for the sparse GPU matrix `A`.
        #
        # Keywords:
        # - `nrhs`: number of right-hand sides the operator will be applied to
        #   (1 selects the SpMV path, > 1 the SpMM path);
        # - `transa`: operation applied to `A` ('N', 'T' or 'C').
        #
        # The rocSPARSE work buffer is sized (and, for SpMM, preprocessed) here
        # so that each subsequent `mul!` only runs the compute stage.
        function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
            m,n = size(A)
            # Dummy scalars and the matrix descriptor are needed by both paths.
            alpha = Ref{T}(one(T))
            beta = Ref{T}(zero(T))
            descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
            if nrhs == 1
                descX = rocSPARSE.ROCDenseVectorDescriptor(T, n)
                descY = rocSPARSE.ROCDenseVectorDescriptor(T, m)
                algo = rocSPARSE.rocsparse_spmv_alg_default
                buffer_size = Ref{Csize_t}()
                # Passing C_NULL as the buffer turns this call into a pure
                # buffer-size query.
                rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size, C_NULL)
                buffer = ROCVector{UInt8}(undef, buffer_size[])
                return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
            else
                descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs)
                descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs)
                algo = rocSPARSE.rocsparse_spmm_alg_default
                buffer_size = Ref{Csize_t}()
                transb = 'N'
                # Stage 1: query the buffer size.
                rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T,
                                         algo, rocSPARSE.rocsparse_spmm_stage_buffer_size, buffer_size, C_NULL)
                buffer = ROCVector{UInt8}(undef, buffer_size[])
                # Stage 2: one-time preprocessing into the allocated buffer.
                rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T,
                                         algo, rocSPARSE.rocsparse_spmm_stage_preprocess, buffer_size, buffer)
                return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
            end
        end
    end
end

# In-place product y = op(A) * x via rocSPARSE SpMV (compute stage only; the
# work buffer was sized when the operator was built).
function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVector{T}) where T <: BlasFloat
    # Validate shapes and configuration before touching the GPU.
    length(y) == A.m || throw(DimensionMismatch("length(y) != A.m"))
    length(x) == A.n || throw(DimensionMismatch("length(x) != A.n"))
    A.nrhs == 1 || throw(DimensionMismatch("A.nrhs != 1"))
    descX = rocSPARSE.ROCDenseVectorDescriptor(x)
    descY = rocSPARSE.ROCDenseVectorDescriptor(y)
    alpha = Ref{T}(one(T))
    beta = Ref{T}(zero(T))
    algo = rocSPARSE.rocsparse_spmv_alg_default
    rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, alpha, A.descA, descX,
                             beta, descY, T, algo, A.buffer_size, A.buffer)
end

# In-place product Y = op(A) * X via rocSPARSE SpMM. Only the compute stage
# runs here; buffer sizing and preprocessing happened at construction.
function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat
    nrow_Y, ncol_Y = size(Y)
    nrow_X, ncol_X = size(X)
    # Shapes must match the configuration the buffer was sized for.
    nrow_Y == A.m || throw(DimensionMismatch("mY != A.m"))
    nrow_X == A.n || throw(DimensionMismatch("mX != A.n"))
    (ncol_Y == ncol_X == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
    descX = rocSPARSE.ROCDenseMatrixDescriptor(X)
    descY = rocSPARSE.ROCDenseMatrixDescriptor(Y)
    alpha = Ref{T}(one(T))
    beta = Ref{T}(zero(T))
    algo = rocSPARSE.rocsparse_spmm_alg_default
    rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', alpha, A.descA, descX,
                             beta, descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute,
                             A.buffer_size, A.buffer)
end
71 changes: 71 additions & 0 deletions ext/CUDA/operators.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Wrapper around a sparse matrix stored on an NVIDIA GPU, pre-configured for
# repeated sparse products through CUSPARSE: matrix-vector when nrhs == 1,
# matrix-matrix when nrhs > 1. The CUSPARSE work buffer is allocated once at
# construction and reused by every subsequent `mul!` call. Unlike the AMD
# variant, the buffer size is not stored because the CUSPARSE execution calls
# do not take it as an argument.
mutable struct NVIDIA_KrylovOperator{T} <: AbstractKrylovOperator{T}
type::Type{T}  # element type of the operator (duplicates the type parameter T)
m::Int  # number of rows of A
n::Int  # number of columns of A
nrhs::Int  # number of right-hand sides the buffer was sized for
transa::Char  # operation applied to A ('N', 'T' or 'C')
descA::CUSPARSE.CuSparseMatrixDescriptor  # CUSPARSE descriptor of A
buffer::CuVector{UInt8}  # device work buffer reused across products
end

# Element type of the wrapped operator.
eltype(::NVIDIA_KrylovOperator{T}) where {T} = T

# Dimensions (m, n) of the wrapped sparse matrix.
function size(op::NVIDIA_KrylovOperator)
    return (op.m, op.n)
end

for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat),
                                     (:(CuSparseMatrixCSC{T}), :BlasFloat),
                                     (:(CuSparseMatrixCOO{T}), :BlasFloat))
    @eval begin
        # Build an NVIDIA_KrylovOperator for the sparse GPU matrix `A`.
        #
        # Keywords:
        # - `nrhs`: number of right-hand sides the operator will be applied to
        #   (1 selects the SpMV path, > 1 the SpMM path);
        # - `transa`: operation applied to `A` ('N', 'T' or 'C').
        #
        # The CUSPARSE work buffer is sized (and, for SpMM, preprocessed) here
        # so that each subsequent `mul!` only runs the compute call.
        function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
            m,n = size(A)
            # Dummy scalars and the matrix descriptor are needed by both paths.
            alpha = Ref{T}(one(T))
            beta = Ref{T}(zero(T))
            descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
            if nrhs == 1
                descX = CUSPARSE.CuDenseVectorDescriptor(T, n)
                descY = CUSPARSE.CuDenseVectorDescriptor(T, m)
                algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
                buffer_size = Ref{Csize_t}()
                CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size)
                buffer = CuVector{UInt8}(undef, buffer_size[])
                return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
            else
                descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs)
                descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs)
                algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
                buffer_size = Ref{Csize_t}()
                transb = 'N'
                CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size)
                buffer = CuVector{UInt8}(undef, buffer_size[])
                # cusparseSpMM_preprocess is not implemented for COO matrices,
                # so the one-time preprocessing is skipped for them.
                if !(A isa CuSparseMatrixCOO)
                    CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer)
                end
                return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
            end
        end
    end
end

# In-place product y = op(A) * x via cusparseSpMV, reusing the work buffer
# sized when the operator was built.
function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat
    # Validate shapes and configuration before touching the GPU.
    length(y) == A.m || throw(DimensionMismatch("length(y) != A.m"))
    length(x) == A.n || throw(DimensionMismatch("length(x) != A.n"))
    A.nrhs == 1 || throw(DimensionMismatch("A.nrhs != 1"))
    descX = CUSPARSE.CuDenseVectorDescriptor(x)
    descY = CUSPARSE.CuDenseVectorDescriptor(y)
    alpha = Ref{T}(one(T))
    beta = Ref{T}(zero(T))
    algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
    CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, alpha, A.descA, descX, beta, descY, T, algo, A.buffer)
end

# In-place product Y = op(A) * X via cusparseSpMM, reusing the work buffer
# sized (and preprocessed, where supported) when the operator was built.
function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat
    nrow_Y, ncol_Y = size(Y)
    nrow_X, ncol_X = size(X)
    # Shapes must match the configuration the buffer was sized for.
    nrow_Y == A.m || throw(DimensionMismatch("mY != A.m"))
    nrow_X == A.n || throw(DimensionMismatch("mX != A.n"))
    (ncol_Y == ncol_X == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
    descX = CUSPARSE.CuDenseMatrixDescriptor(X)
    descY = CUSPARSE.CuDenseMatrixDescriptor(Y)
    alpha = Ref{T}(one(T))
    beta = Ref{T}(zero(T))
    algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
    CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', alpha, A.descA, descX, beta, descY, T, algo, A.buffer)
end
5 changes: 3 additions & 2 deletions ext/KrylovPreconditionersAMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ using LinearAlgebra
using AMDGPU
using AMDGPU.rocSPARSE
using LinearAlgebra: checksquare, BlasReal, BlasFloat
import LinearAlgebra: ldiv!
using SparseArrays
import LinearAlgebra: ldiv!, mul!
import Base: size, eltype

using KrylovPreconditioners
const KP = KrylovPreconditioners
Expand All @@ -14,5 +14,6 @@ const KA = KernelAbstractions
include("AMDGPU/ic0.jl")
include("AMDGPU/ilu0.jl")
include("AMDGPU/blockjacobi.jl")
include("AMDGPU/operators.jl")

end
5 changes: 3 additions & 2 deletions ext/KrylovPreconditionersCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ using LinearAlgebra
using CUDA
using CUDA.CUSPARSE
using LinearAlgebra: checksquare, BlasReal, BlasFloat
import LinearAlgebra: ldiv!
using SparseArrays
import LinearAlgebra: ldiv!, mul!
import Base: size, eltype

using KrylovPreconditioners
const KP = KrylovPreconditioners
Expand All @@ -14,5 +14,6 @@ const KA = KernelAbstractions
include("CUDA/ic0.jl")
include("CUDA/ilu0.jl")
include("CUDA/blockjacobi.jl")
include("CUDA/operators.jl")

end
6 changes: 6 additions & 0 deletions src/KrylovPreconditioners.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ import LinearAlgebra: ldiv!
abstract type AbstractKrylovPreconditioner end
export AbstractKrylovPreconditioner

abstract type AbstractKrylovOperator{T} end
export AbstractKrylovOperator

function KrylovOperator end
export KrylovOperator

# Preconditioners
include("ic0.jl")
include("ilu0.jl")
Expand Down
12 changes: 12 additions & 0 deletions test/gpu/amd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ include("gpu.jl")
end
end

# Exercise the KrylovOperator wrapper for every rocSPARSE storage format
# (COO, CSC, CSR), in both real and complex double precision.
@testset "KrylovOperator" begin
@testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCOO{FC})
end
@testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSC{FC})
end
@testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSR{FC})
end
end

@testset "Block Jacobi preconditioner" begin
if AMDGPU.functional()
test_block_jacobi(ROCBackend(), ROCArray, ROCSparseMatrixCSR)
Expand Down
46 changes: 45 additions & 1 deletion test/gpu/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ function test_ic0(FC, V, M)
x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true)
r_gpu = b_gpu - A_gpu * x_gpu
@test stats.niter ≤ 5
if (FC <: ComplexF64) && typeof(V).name.name == :ROCArray
if (FC <: ComplexF64) && V.body.name.name == :ROCArray
@test_broken norm(r_gpu) ≤ 1e-6
else
@test norm(r_gpu) ≤ 1e-8
Expand All @@ -42,6 +42,50 @@ function test_ilu0(FC, V, M)
@test norm(r_gpu) ≤ 1e-8
end

# Check a GPU KrylovOperator end to end:
# 1. a square system solved with GMRES through the operator;
# 2. rectangular operator-vector products against a CPU reference;
# 3. rectangular operator-matrix products (nrhs > 1) against a CPU reference.
# Arguments: FC element type, V dense-vector type, DM dense-matrix type,
# SM sparse-matrix type of the target GPU backend.
function test_operator(FC, V, DM, SM)
    m = 200
    n = 100

    # Square system: solve with GMRES through the operator wrapper.
    A_cpu = sparse(rand(FC, n, n))
    b_cpu = rand(FC, n)

    A_gpu = SM(A_cpu)
    b_gpu = V(b_cpu)
    opA_gpu = KrylovOperator(A_gpu)

    x_gpu, stats = gmres(opA_gpu, b_gpu)
    r_gpu = b_gpu - A_gpu * x_gpu
    @test stats.solved
    @test norm(r_gpu) ≤ 1e-8

    # Rectangular operator: GPU products must match the CPU reference.
    A_cpu = sparse(rand(FC, m, n))
    A_gpu = SM(A_cpu)

    opA_gpu = KrylovOperator(A_gpu)
    for _ = 1:5
        y_cpu = rand(FC, m)
        x_cpu = rand(FC, n)
        mul!(y_cpu, A_cpu, x_cpu)
        y_gpu = V(y_cpu)
        x_gpu = V(x_cpu)
        mul!(y_gpu, opA_gpu, x_gpu)
        @test collect(y_gpu) ≈ y_cpu
    end

    # Same rectangular operator applied to multiple right-hand sides.
    nrhs = 3
    opA_gpu = KrylovOperator(A_gpu; nrhs)
    for _ = 1:5
        Y_cpu = rand(FC, m, nrhs)
        X_cpu = rand(FC, n, nrhs)
        mul!(Y_cpu, A_cpu, X_cpu)
        Y_gpu = DM(Y_cpu)
        X_gpu = DM(X_cpu)
        mul!(Y_gpu, opA_gpu, X_gpu)
        @test collect(Y_gpu) ≈ Y_cpu
    end
end

_get_type(J::SparseMatrixCSC) = Vector{Float64}

function generate_random_system(n::Int, m::Int)
Expand Down
12 changes: 12 additions & 0 deletions test/gpu/nvidia.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ include("gpu.jl")
end
end

# Exercise the KrylovOperator wrapper for every CUSPARSE storage format
# (COO, CSC, CSR), in both real and complex double precision.
@testset "KrylovOperator" begin
@testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCOO{FC})
end
@testset "CuSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSC{FC})
end
@testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSR{FC})
end
end

@testset "Block Jacobi preconditioner" begin
if CUDA.functional()
test_block_jacobi(CUDABackend(), CuArray, CuSparseMatrixCSR)
Expand Down
Loading