diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d91d5c1..878e0a3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -34,8 +34,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - # Pkg.add("AMDGPU") - Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") + Pkg.add("AMDGPU") + # Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/amd.jl")' @@ -53,6 +53,7 @@ steps: using Pkg Pkg.develop(path=".") Pkg.add("oneAPI") + # Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/intel.jl")' diff --git a/Project.toml b/Project.toml index c46016e..065c820 100644 --- a/Project.toml +++ b/Project.toml @@ -14,15 +14,18 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [extensions] KrylovPreconditionersAMDGPUExt = "AMDGPU" KrylovPreconditionersCUDAExt = "CUDA" +KrylovPreconditionersoneAPIExt = "oneAPI" [compat] -AMDGPU = "0.8.3, 0.9" +AMDGPU = "0.9" Adapt = "3, 4" CUDA = "5.3.0" +oneAPI = "1.5.0" KernelAbstractions = "0.9" Krylov = "0.9.4" LightGraphs = "1" diff --git a/ext/AMDGPU/blockjacobi.jl b/ext/AMDGPU/blockjacobi.jl index dd02070..1dab800 100644 --- a/ext/AMDGPU/blockjacobi.jl +++ b/ext/AMDGPU/blockjacobi.jl @@ -1,7 +1,7 @@ KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) 
function KP.create_blocklist(cublocks::ROCArray, npart) - blocklist = Array{ROCArray{Float64,2}}(undef, npart) + blocklist = Array{ROCMatrix{Float64}}(undef, npart) for b in 1:npart blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) end @@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend) for b in 1:nblocks p.blocklist[b] .= p.cublocks[:,:,b] end - AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(p.blocklist) - AMDGPU.@sync pivot, info, p.blocklist = AMDGPU.rocSOLVER.getri_batched!(p.blocklist, pivot) + AMDGPU.@sync pivot, info = rocSOLVER.getrf_batched!(p.blocklist) + AMDGPU.@sync pivot, info, p.blocklist = rocSOLVER.getri_batched!(p.blocklist, pivot) for b in 1:nblocks p.cublocks[:,:,b] .= p.blocklist[b] end diff --git a/ext/CUDA/blockjacobi.jl b/ext/CUDA/blockjacobi.jl index 1e28d8a..5c33865 100644 --- a/ext/CUDA/blockjacobi.jl +++ b/ext/CUDA/blockjacobi.jl @@ -1,7 +1,7 @@ KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) 
function KP.create_blocklist(cublocks::CuArray, npart) - blocklist = Array{CuArray{Float64,2}}(undef, npart) + blocklist = Array{CuMatrix{Float64}}(undef, npart) for b in 1:npart blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) end @@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend) for b in 1:nblocks p.blocklist[b] .= p.cublocks[:,:,b] end - CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(p.blocklist, true) - CUDA.@sync pivot, info, p.blocklist = CUDA.CUBLAS.getri_batched(p.blocklist, pivot) + CUDA.@sync pivot, info = CUBLAS.getrf_batched!(p.blocklist, true) + CUDA.@sync pivot, info, p.blocklist = CUBLAS.getri_batched(p.blocklist, pivot) for b in 1:nblocks p.cublocks[:,:,b] .= p.blocklist[b] end diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl index 1c9cc18..9f9b0e0 100644 --- a/ext/KrylovPreconditionersAMDGPUExt.jl +++ b/ext/KrylovPreconditionersAMDGPUExt.jl @@ -2,7 +2,7 @@ module KrylovPreconditionersAMDGPUExt using LinearAlgebra using SparseArrays using AMDGPU -using AMDGPU.rocSPARSE +using AMDGPU.rocSPARSE, AMDGPU.rocSOLVER using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! import Base: size, eltype, unsafe_convert diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl index b074141..47a6899 100644 --- a/ext/KrylovPreconditionersCUDAExt.jl +++ b/ext/KrylovPreconditionersCUDAExt.jl @@ -2,7 +2,7 @@ module KrylovPreconditionersCUDAExt using LinearAlgebra using SparseArrays using CUDA -using CUDA.CUSPARSE +using CUDA.CUSPARSE, CUDA.CUBLAS using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! 
import Base: size, eltype, unsafe_convert diff --git a/ext/KrylovPreconditionersoneAPIExt.jl b/ext/KrylovPreconditionersoneAPIExt.jl new file mode 100644 index 0000000..36ff42a --- /dev/null +++ b/ext/KrylovPreconditionersoneAPIExt.jl @@ -0,0 +1,19 @@ +module KrylovPreconditionersoneAPIExt +using LinearAlgebra +using SparseArrays +using oneAPI +using oneAPI: global_queue, sycl_queue, context, device +using oneAPI.oneMKL +using LinearAlgebra: checksquare, BlasReal, BlasFloat +import LinearAlgebra: ldiv!, mul! +import Base: size, eltype, unsafe_convert + +using KrylovPreconditioners +const KP = KrylovPreconditioners +using KernelAbstractions +const KA = KernelAbstractions + +include("oneAPI/blockjacobi.jl") +include("oneAPI/operators.jl") + +end diff --git a/ext/oneAPI/blockjacobi.jl b/ext/oneAPI/blockjacobi.jl new file mode 100644 index 0000000..8f195a4 --- /dev/null +++ b/ext/oneAPI/blockjacobi.jl @@ -0,0 +1,48 @@ +KP.BlockJacobiPreconditioner(J::oneMKL.oneSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) + +function KP.create_blocklist(cublocks::oneArray, npart) + blocklist = Array{oneMatrix{Float64}}(undef, npart) + for b in 1:npart + blocklist[b] = oneMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) + end + return blocklist +end + +function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::oneAPIBackend) + nblocks = p.nblocks + blocksize = p.blocksize + fillblock_gpu_kernel! 
= KP._fillblock_gpu!(device) + # Fill Block Jacobi + fillblock_gpu_kernel!( + p.cublocks, size(p.id,1), + p.cupartitions, p.cumap, + j_rowptr, j_colval, j_nzval, + p.cupart, p.culpartitions, p.id, + ndrange=(nblocks, blocksize), + ) + KA.synchronize(device) + # Invert blocks + for b in 1:nblocks + p.blocklist[b] .= p.cublocks[:,:,b] + end + oneAPI.@sync pivot, p.blocklist = oneMKL.getrf_batched!(p.blocklist) + oneAPI.@sync pivot, p.blocklist = oneMKL.getri_batched!(p.blocklist, pivot) + for b in 1:nblocks + p.cublocks[:,:,b] .= p.blocklist[b] + end + return +end + +""" + function update!(p, J::oneSparseMatrixCSR) + +Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for oneAPI + +1) The dense blocks `cuJs` are filled from the sparse Jacobian `J` +2) Do a batch inversion of the dense blocks using oneMKL +3) Extract the preconditioner matrix `p.P` from the dense blocks `cuJs` + +""" +function KP.update!(p::BlockJacobiPreconditioner, J::oneMKL.oneSparseMatrixCSR) + _update_gpu(p, J.rowPtr, J.colVal, J.nzVal, p.device) +end diff --git a/ext/oneAPI/operators.jl b/ext/oneAPI/operators.jl new file mode 100644 index 0000000..f69bab8 --- /dev/null +++ b/ext/oneAPI/operators.jl @@ -0,0 +1,96 @@ +mutable struct INTEL_KrylovOperator{T} <: AbstractKrylovOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + transa::Char + matrix::oneSparseMatrixCSR{T} +end + +eltype(A::INTEL_KrylovOperator{T}) where T = T +size(A::INTEL_KrylovOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + oneMKL.sparse_optimize_gemv!(transa, A) + end + # sparse_optimize_gemm!
is only available with oneAPI > v2024.1.0 + return INTEL_KrylovOperator{T}(T, m, n, nrhs, transa, A) + end + + function KP.update!(A::INTEL_KrylovOperator{T}, B::$SparseMatrixType) where T <: $BlasFloat + error("The update of an INTEL_KrylovOperator is not supported.") + end + end +end + +function LinearAlgebra.mul!(y::oneVector{T}, A::INTEL_KrylovOperator{T}, x::oneVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + alpha = one(T) + beta = zero(T) + oneMKL.sparse_gemv!(A.transa, alpha, A.matrix, x, beta, y) +end + +function LinearAlgebra.mul!(Y::oneMatrix{T}, A::INTEL_KrylovOperator{T}, X::oneMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + alpha = one(T) + beta = zero(T) + oneMKL.sparse_gemm!(A.transa, 'N', alpha, A.matrix, X, beta, Y) +end + +mutable struct INTEL_TriangularOperator{T} <: AbstractTriangularOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + uplo::Char + diag::Char + transa::Char + matrix::oneSparseMatrixCSR{T} +end + +eltype(A::INTEL_TriangularOperator{T}) where T = T +size(A::INTEL_TriangularOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) + @eval begin + function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + oneMKL.sparse_optimize_trsv!(uplo, transa, diag, A) + else + oneMKL.sparse_optimize_trsm!(uplo, transa, diag, nrhs, A) + end + return INTEL_TriangularOperator{T}(T, m, n, nrhs, uplo, diag, transa, A) + end + + function KP.update!(A::INTEL_TriangularOperator{T}, B::$SparseMatrixType) 
where T <: $BlasFloat + return error("The update of an INTEL_TriangularOperator is not supported.") + end + end +end + +function LinearAlgebra.ldiv!(y::oneVector{T}, A::INTEL_TriangularOperator{T}, x::oneVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, one(T), A.matrix, x, y) +end + +function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X::oneMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + oneMKL.sparse_trsm!(A.uplo, A.transa, 'N', A.diag, one(T), A.matrix, X, Y) +end diff --git a/test/Project.toml b/test/Project.toml index f16ff85..b04c043 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,3 +6,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 9f98d6d..20b5911 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -90,17 +90,19 @@ function test_operator(FC, V, DM, SM) mul!(y_gpu, opA_gpu, x_gpu) @test collect(y_gpu) ≈ y_cpu end - for j = 1:5 - y_cpu = rand(FC, m) - x_cpu = rand(FC, n) - A_cpu2 = A_cpu + j*I - mul!(y_cpu, A_cpu2, x_cpu) - y_gpu = V(y_cpu) - x_gpu = V(x_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - mul!(y_gpu, opA_gpu, x_gpu) - @test collect(y_gpu) ≈ y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + y_cpu = rand(FC, m) + x_cpu = rand(FC, n) + A_cpu2 = A_cpu + j*I + mul!(y_cpu, A_cpu2, x_cpu) + y_gpu = 
V(y_cpu) + x_gpu = V(x_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + mul!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu + end end nrhs = 3 @@ -114,17 +116,19 @@ function test_operator(FC, V, DM, SM) mul!(Y_gpu, opA_gpu, X_gpu) @test collect(Y_gpu) ≈ Y_cpu end - for j = 1:5 - Y_cpu = rand(FC, m, nrhs) - X_cpu = rand(FC, n, nrhs) - A_cpu2 = A_cpu + j*I - mul!(Y_cpu, A_cpu2, X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - mul!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + Y_cpu = rand(FC, m, nrhs) + X_cpu = rand(FC, n, nrhs) + A_cpu2 = A_cpu + j*I + mul!(Y_cpu, A_cpu2, X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end end end @@ -152,17 +156,19 @@ function test_triangular(FC, V, DM, SM) ldiv!(y_gpu, opA_gpu, x_gpu) @test collect(y_gpu) ≈ y_cpu end - for j = 1:5 - y_cpu = rand(FC, n) - x_cpu = rand(FC, n) - A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) - ldiv!(y_cpu, triangle(A_cpu2), x_cpu) - y_gpu = V(y_cpu) - x_gpu = V(x_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - ldiv!(y_gpu, opA_gpu, x_gpu) - @test collect(y_gpu) ≈ y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + y_cpu = rand(FC, n) + x_cpu = rand(FC, n) + A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) + ldiv!(y_cpu, triangle(A_cpu2), x_cpu) + y_gpu = V(y_cpu) + x_gpu = V(x_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + ldiv!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu + end end nrhs = 3 @@ -176,17 +182,19 @@ function test_triangular(FC, V, DM, SM) ldiv!(Y_gpu, opA_gpu, X_gpu) @test collect(Y_gpu) ≈ Y_cpu end - for j = 1:5 - Y_cpu = rand(FC, n, nrhs) - X_cpu = rand(FC, n, nrhs) - A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) - ldiv!(Y_cpu, triangle(A_cpu2), X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - A_gpu2 = 
SM(A_cpu2) - update!(opA_gpu, A_gpu2) - ldiv!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + Y_cpu = rand(FC, n, nrhs) + X_cpu = rand(FC, n, nrhs) + A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) + ldiv!(Y_cpu, triangle(A_cpu2), X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + ldiv!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end end end end diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 0ab2c6c..7b06824 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -1,9 +1,27 @@ -using oneAPI +using oneAPI, oneAPI.oneMKL +_get_type(J::oneSparseMatrixCSR) = oneArray{Float64, 1, oneAPI.oneL0.DeviceBuffer} +_is_csr(J::oneSparseMatrixCSR) = true include("gpu.jl") @testset "Intel -- oneAPI.jl" begin @test oneAPI.functional() oneAPI.allowscalar(false) + + @testset "KrylovOperator" begin + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32) + test_operator(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) + end + end + + @testset "TriangularOperator" begin + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32) + test_triangular(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) + end + end + + # @testset "Block Jacobi preconditioner" begin + # test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR) + # end end diff --git a/test/runtests.jl b/test/runtests.jl index 778a6fb..9a18c02 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using AMDGPU using CUDA -# using oneAPI +using oneAPI using Test @testset "KrylovPreconditioners" begin @@ -18,10 +18,10 @@ if CUDA.functional() end end -# if oneAPI.functional() -# @info "Testing oneAPI backend" -# @testset "Testing oneAPI backend" begin -# include("gpu/intel.jl") -# end -# end +if oneAPI.functional() + @info "Testing oneAPI backend" + @testset "Testing oneAPI backend" begin + include("gpu/intel.jl") + end +end end