diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 56d7207..2942169 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -33,7 +33,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - Pkg.add("AMDGPU") + # Pkg.add("AMDGPU") + Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/amd.jl")' diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl new file mode 100644 index 0000000..7413d46 --- /dev/null +++ b/ext/AMDGPU/operators.jl @@ -0,0 +1,74 @@ +mutable struct AMD_KrylovOperator{T} <: AbstractKrylovOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + transa::Char + descA::rocSPARSE.ROCSparseMatrixDescriptor + buffer_size::Ref{Csize_t} + buffer::ROCVector{UInt8} +end + +eltype(A::AMD_KrylovOperator{T}) where T = T +size(A::AMD_KrylovOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat), + (:(ROCSparseMatrixCSC{T}), :BlasFloat), + (:(ROCSparseMatrixCOO{T}), :BlasFloat)) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O') + descX = rocSPARSE.ROCDenseVectorDescriptor(T, n) + descY = rocSPARSE.ROCDenseVectorDescriptor(T, m) + algo = rocSPARSE.rocSPARSE.rocsparse_spmv_alg_default + buffer_size = Ref{Csize_t}() + rocSPARSE.rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size, C_NULL) + buffer = ROCVector{UInt8}(undef, buffer_size[]) + return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer) + else + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O') + descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs) + descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs) + algo = rocSPARSE.rocsparse_spmm_alg_default + buffer_size = Ref{Csize_t}() + transb = 'N' + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, + algo, rocSPARSE.rocsparse_spmm_stage_buffer_size, buffer_size, C_NULL) + buffer = ROCVector{UInt8}(undef, buffer_size[]) + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T, + algo, rocSPARSE.rocsparse_spmm_stage_preprocess, buffer_size, buffer) + return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer) + end + end + end +end + +function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + descY = rocSPARSE.ROCDenseVectorDescriptor(y) + descX = rocSPARSE.ROCDenseVectorDescriptor(x) + algo = rocSPARSE.rocsparse_spmv_alg_default + rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, + Ref{T}(zero(T)), descY, T, algo, A.buffer_size, A.buffer) +end + +function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + descY = rocSPARSE.ROCDenseMatrixDescriptor(Y) + descX = rocSPARSE.ROCDenseMatrixDescriptor(X) + algo = rocSPARSE.rocsparse_spmm_alg_default + rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, + Ref{T}(zero(T)), descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute, A.buffer_size, A.buffer) +end diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl new file mode 100644 index 0000000..fb0ec88 --- /dev/null +++ b/ext/CUDA/operators.jl @@ -0,0 +1,71 @@ +mutable struct NVIDIA_KrylovOperator{T} <: AbstractKrylovOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + transa::Char + descA::CUSPARSE.CuSparseMatrixDescriptor + buffer::CuVector{UInt8} +end + +eltype(A::NVIDIA_KrylovOperator{T}) where T = T +size(A::NVIDIA_KrylovOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat), + (:(CuSparseMatrixCSC{T}), :BlasFloat), + (:(CuSparseMatrixCOO{T}), :BlasFloat)) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseVectorDescriptor(T, n) + descY = CUSPARSE.CuDenseVectorDescriptor(T, m) + algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size[]) + return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) + else + alpha = Ref{T}(one(T)) + beta = Ref{T}(zero(T)) + descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O') + descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs) + descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs) + algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT + buffer_size = Ref{Csize_t}() + transb = 'N' + CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size) + buffer = CuVector{UInt8}(undef, buffer_size[]) + if !(A isa CuSparseMatrixCOO) + CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer) + end + return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer) + end + end + end +end + +function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + descY = CUSPARSE.CuDenseVectorDescriptor(y) + descX = CUSPARSE.CuDenseVectorDescriptor(x) + algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT + CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer) +end + +function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + descY = CUSPARSE.CuDenseMatrixDescriptor(Y) + descX = CUSPARSE.CuDenseMatrixDescriptor(X) + algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT + CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer) +end diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl index b75c28c..0355a2a 100644 --- a/ext/KrylovPreconditionersAMDGPUExt.jl +++ b/ext/KrylovPreconditionersAMDGPUExt.jl @@ -3,8 +3,8 @@ using LinearAlgebra using AMDGPU using AMDGPU.rocSPARSE using LinearAlgebra: checksquare, BlasReal, BlasFloat -import LinearAlgebra: ldiv! -using SparseArrays +import LinearAlgebra: ldiv!, mul! +import Base: size, eltype using KrylovPreconditioners const KP = KrylovPreconditioners @@ -14,5 +14,6 @@ const KA = KernelAbstractions include("AMDGPU/ic0.jl") include("AMDGPU/ilu0.jl") include("AMDGPU/blockjacobi.jl") +include("AMDGPU/operators.jl") end diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl index c0e0f83..a74f941 100644 --- a/ext/KrylovPreconditionersCUDAExt.jl +++ b/ext/KrylovPreconditionersCUDAExt.jl @@ -3,8 +3,8 @@ using LinearAlgebra using CUDA using CUDA.CUSPARSE using LinearAlgebra: checksquare, BlasReal, BlasFloat -import LinearAlgebra: ldiv! -using SparseArrays +import LinearAlgebra: ldiv!, mul! +import Base: size, eltype using KrylovPreconditioners const KP = KrylovPreconditioners @@ -14,5 +14,6 @@ const KA = KernelAbstractions include("CUDA/ic0.jl") include("CUDA/ilu0.jl") include("CUDA/blockjacobi.jl") +include("CUDA/operators.jl") end diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl index 8c43404..97e0e22 100644 --- a/src/KrylovPreconditioners.jl +++ b/src/KrylovPreconditioners.jl @@ -12,6 +12,12 @@ import LinearAlgebra: ldiv! abstract type AbstractKrylovPreconditioner end export AbstractKrylovPreconditioner +abstract type AbstractKrylovOperator{T} end +export AbstractKrylovOperator + +function KrylovOperator end +export KrylovOperator + # Preconditioners include("ic0.jl") include("ilu0.jl") diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl index 366e2cd..fbd7a07 100644 --- a/test/gpu/amd.jl +++ b/test/gpu/amd.jl @@ -26,6 +26,18 @@ include("gpu.jl") end end + @testset "KrylovOperator" begin + @testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCOO{FC}) + end + @testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSC{FC}) + end + @testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSR{FC}) + end + end + @testset "Block Jacobi preconditioner" begin if AMDGPU.functional() test_block_jacobi(ROCBackend(), ROCArray, ROCSparseMatrixCSR) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 107ad94..c852ad7 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -18,7 +18,7 @@ function test_ic0(FC, V, M) x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true) r_gpu = b_gpu - A_gpu * x_gpu @test stats.niter ≤ 5 - if (FC <: ComplexF64) && typeof(V).name.name == :ROCArray + if (FC <: ComplexF64) && V.body.name.name == :ROCArray @test_broken norm(r_gpu) ≤ 1e-6 else @test norm(r_gpu) ≤ 1e-8 @@ -42,6 +42,50 @@ function test_ilu0(FC, V, M) @test norm(r_gpu) ≤ 1e-8 end +function test_operator(FC, V, DM, SM) + m = 200 + n = 100 + A_cpu = rand(FC, n, n) + A_cpu = sparse(A_cpu) + b_cpu = rand(FC, n) + + A_gpu = SM(A_cpu) + b_gpu = V(b_cpu) + opA_gpu = KrylovOperator(A_gpu) + + x_gpu, stats = gmres(opA_gpu, b_gpu) + r_gpu = b_gpu - A_gpu * x_gpu + @test stats.solved + @test norm(r_gpu) ≤ 1e-8 + + A_cpu = rand(FC, m, n) + A_cpu = sparse(A_cpu) + A_gpu = SM(A_cpu) + + opA_gpu = KrylovOperator(A_gpu) + for i = 1:5 + y_cpu = rand(FC, m) + x_cpu = rand(FC, n) + mul!(y_cpu, A_cpu, x_cpu) + y_gpu = V(y_cpu) + x_gpu = V(x_cpu) + mul!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu + end + + nrhs = 3 + opA_gpu = KrylovOperator(A_gpu; nrhs) + for i = 1:5 + Y_cpu = rand(FC, m, nrhs) + X_cpu = rand(FC, n, nrhs) + mul!(Y_cpu, A_cpu, X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end +end + _get_type(J::SparseMatrixCSC) = Vector{Float64} function generate_random_system(n::Int, m::Int) diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index cb155da..655e1df 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -26,6 +26,18 @@ include("gpu.jl") end end + @testset "KrylovOperator" begin + @testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCOO{FC}) + end + @testset "CuSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSC{FC}) + end + @testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSR{FC}) + end + end + @testset "Block Jacobi preconditioner" begin if CUDA.functional() test_block_jacobi(CUDABackend(), CuArray, CuSparseMatrixCSR)