Add an extension for oneAPI.jl #52

Merged · 9 commits · May 30, 2024
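
Summary (reconstructed from the diff below): this PR adds a package extension for oneAPI.jl. oneAPI becomes a weak dependency backing a new `KrylovPreconditionersoneAPIExt` module, which provides a block-Jacobi preconditioner built on batched `getrf!`/`getri!` from oneMKL, plus `KrylovOperator` and `TriangularOperator` wrappers around `oneSparseMatrixCSR`. Along the way, the AMDGPU compat bound is tightened to 0.9, the AMDGPU and CUDA extensions import `rocSOLVER` and `CUBLAS` explicitly, the Buildkite pipeline and test environment gain oneAPI entries, and the GPU tests skip the `update!` round-trips on `oneArray` types, where operator updates are not supported.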
5 changes: 3 additions & 2 deletions .buildkite/pipeline.yml
@@ -34,8 +34,8 @@ steps:
julia --color=yes --project=test/gpu -e '
using Pkg
Pkg.develop(path=".")
- # Pkg.add("AMDGPU")
- Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
+ Pkg.add("AMDGPU")
+ # Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
Pkg.add("Krylov")
Pkg.instantiate()
include("test/gpu/amd.jl")'
@@ -53,6 +53,7 @@
using Pkg
Pkg.develop(path=".")
Pkg.add("oneAPI")
+ # Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master")
Pkg.add("Krylov")
Pkg.instantiate()
include("test/gpu/intel.jl")'
5 changes: 4 additions & 1 deletion Project.toml
@@ -14,15 +14,18 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
KrylovPreconditionersAMDGPUExt = "AMDGPU"
KrylovPreconditionersCUDAExt = "CUDA"
+ KrylovPreconditionersoneAPIExt = "oneAPI"

[compat]
- AMDGPU = "0.8.3, 0.9"
+ AMDGPU = "0.9"
Adapt = "3, 4"
CUDA = "5.3.0"
+ oneAPI = "1.5.0"
KernelAbstractions = "0.9"
Krylov = "0.9.4"
LightGraphs = "1"
6 changes: 3 additions & 3 deletions ext/AMDGPU/blockjacobi.jl
@@ -1,7 +1,7 @@
KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)

function KP.create_blocklist(cublocks::ROCArray, npart)
- blocklist = Array{ROCArray{Float64,2}}(undef, npart)
+ blocklist = Array{ROCMatrix{Float64}}(undef, npart)
for b in 1:npart
blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
end
@@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend)
for b in 1:nblocks
p.blocklist[b] .= p.cublocks[:,:,b]
end
- AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(p.blocklist)
- AMDGPU.@sync pivot, info, p.blocklist = AMDGPU.rocSOLVER.getri_batched!(p.blocklist, pivot)
+ AMDGPU.@sync pivot, info = rocSOLVER.getrf_batched!(p.blocklist)
+ AMDGPU.@sync pivot, info, p.blocklist = rocSOLVER.getri_batched!(p.blocklist, pivot)
for b in 1:nblocks
p.cublocks[:,:,b] .= p.blocklist[b]
end
6 changes: 3 additions & 3 deletions ext/CUDA/blockjacobi.jl
@@ -1,7 +1,7 @@
KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)

function KP.create_blocklist(cublocks::CuArray, npart)
- blocklist = Array{CuArray{Float64,2}}(undef, npart)
+ blocklist = Array{CuMatrix{Float64}}(undef, npart)
for b in 1:npart
blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
end
@@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend)
for b in 1:nblocks
p.blocklist[b] .= p.cublocks[:,:,b]
end
- CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(p.blocklist, true)
- CUDA.@sync pivot, info, p.blocklist = CUDA.CUBLAS.getri_batched(p.blocklist, pivot)
+ CUDA.@sync pivot, info = CUBLAS.getrf_batched!(p.blocklist, true)
+ CUDA.@sync pivot, info, p.blocklist = CUBLAS.getri_batched(p.blocklist, pivot)
for b in 1:nblocks
p.cublocks[:,:,b] .= p.blocklist[b]
end
2 changes: 1 addition & 1 deletion ext/KrylovPreconditionersAMDGPUExt.jl
@@ -2,7 +2,7 @@ module KrylovPreconditionersAMDGPUExt
using LinearAlgebra
using SparseArrays
using AMDGPU
- using AMDGPU.rocSPARSE
+ using AMDGPU.rocSPARSE, AMDGPU.rocSOLVER
using LinearAlgebra: checksquare, BlasReal, BlasFloat
import LinearAlgebra: ldiv!, mul!
import Base: size, eltype, unsafe_convert
2 changes: 1 addition & 1 deletion ext/KrylovPreconditionersCUDAExt.jl
@@ -2,7 +2,7 @@ module KrylovPreconditionersCUDAExt
using LinearAlgebra
using SparseArrays
using CUDA
- using CUDA.CUSPARSE
+ using CUDA.CUSPARSE, CUDA.CUBLAS
using LinearAlgebra: checksquare, BlasReal, BlasFloat
import LinearAlgebra: ldiv!, mul!
import Base: size, eltype, unsafe_convert
19 changes: 19 additions & 0 deletions ext/KrylovPreconditionersoneAPIExt.jl
@@ -0,0 +1,19 @@
module KrylovPreconditionersoneAPIExt
using LinearAlgebra
using SparseArrays
using oneAPI
using oneAPI: global_queue, sycl_queue, context, device
using oneAPI.oneMKL
using LinearAlgebra: checksquare, BlasReal, BlasFloat
import LinearAlgebra: ldiv!, mul!
import Base: size, eltype, unsafe_convert

using KrylovPreconditioners
const KP = KrylovPreconditioners
using KernelAbstractions
const KA = KernelAbstractions

include("oneAPI/blockjacobi.jl")
include("oneAPI/operators.jl")

end
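
With oneAPI declared under `[weakdeps]` and mapped in `[extensions]` (see the Project.toml diff above), this module is compiled and loaded lazily. A minimal sketch of the standard Julia (≥ 1.9) activation flow — not code from this PR:

using KrylovPreconditioners   # extension is registered but not yet loaded
using oneAPI                  # loading the weak dependency triggers the extension

# Sanity check that the extension module is live:
ext = Base.get_extension(KrylovPreconditioners, :KrylovPreconditionersoneAPIExt)
@assert ext !== nothing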
48 changes: 48 additions & 0 deletions ext/oneAPI/blockjacobi.jl
@@ -0,0 +1,48 @@
KP.BlockJacobiPreconditioner(J::oneMKL.oneSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)

function KP.create_blocklist(cublocks::oneArray, npart)
blocklist = Array{oneMatrix{Float64}}(undef, npart)
for b in 1:npart
blocklist[b] = oneMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
end
return blocklist
end

function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::oneAPIBackend)
nblocks = p.nblocks
blocksize = p.blocksize
fillblock_gpu_kernel! = KP._fillblock_gpu!(device)
# Fill the dense blocks of the block-Jacobi preconditioner
fillblock_gpu_kernel!(
p.cublocks, size(p.id,1),
p.cupartitions, p.cumap,
j_rowptr, j_colval, j_nzval,
p.cupart, p.culpartitions, p.id,
ndrange=(nblocks, blocksize),
)
KA.synchronize(device)
# Invert the dense blocks in a batch
for b in 1:nblocks
p.blocklist[b] .= p.cublocks[:,:,b]
end
oneAPI.@sync pivot, p.blocklist = oneMKL.getrf_batched!(p.blocklist)
oneAPI.@sync pivot, p.blocklist = oneMKL.getri_batched!(p.blocklist, pivot)
for b in 1:nblocks
p.cublocks[:,:,b] .= p.blocklist[b]
end
return
end

"""
    update!(p::BlockJacobiPreconditioner, J::oneMKL.oneSparseMatrixCSR)

Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for oneAPI.

1) The dense blocks `cuJs` are filled from the sparse Jacobian `J`.
2) The dense blocks are inverted in a batch using oneMKL.
3) The preconditioner matrix `p.P` is extracted from the dense blocks `cuJs`.
"""
function KP.update!(p::BlockJacobiPreconditioner, J::oneMKL.oneSparseMatrixCSR)
_update_gpu(p, J.rowPtr, J.colVal, J.nzVal, p.device)
end
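
For orientation, a minimal end-to-end sketch of this code path. The setup is hypothetical: it assumes `oneSparseMatrixCSR` accepts a `SparseMatrixCSC` (mirroring the CUDA and AMDGPU extensions), and the `nblocks` keyword is illustrative:

using SparseArrays, LinearAlgebra
using oneAPI, oneAPI.oneMKL
using KrylovPreconditioners

A_cpu = sprand(1_000, 1_000, 1e-2) + 100I   # diagonally dominant test matrix
A_gpu = oneMKL.oneSparseMatrixCSR(A_cpu)    # assumed CSC-to-CSR conversion

p = BlockJacobiPreconditioner(A_gpu; nblocks=8)   # hypothetical keyword
update!(p, A_gpu)   # fills the dense blocks, then batch-inverts them via oneMKL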
96 changes: 96 additions & 0 deletions ext/oneAPI/operators.jl
@@ -0,0 +1,96 @@
mutable struct INTEL_KrylovOperator{T} <: AbstractKrylovOperator{T}
type::Type{T}
m::Int
n::Int
nrhs::Int
transa::Char
matrix::oneSparseMatrixCSR{T}
end

eltype(A::INTEL_KrylovOperator{T}) where T = T
size(A::INTEL_KrylovOperator) = (A.m, A.n)

for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),)
@eval begin
function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
m,n = size(A)
if nrhs == 1
oneMKL.sparse_optimize_gemv!(transa, A)
end
# sparse_optimize_gemm! is only available with oneAPI > v2024.1.0
return INTEL_KrylovOperator{T}(T, m, n, nrhs, transa, A)
end

function KP.update!(A::INTEL_KrylovOperator{T}, B::$SparseMatrixType) where T <: $BlasType
error("The update of an INTEL_KrylovOperator is not supported.")
end
end
end

function LinearAlgebra.mul!(y::oneVector{T}, A::INTEL_KrylovOperator{T}, x::oneVector{T}) where T <: BlasFloat
(length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
(length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
(A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
alpha = one(T)
beta = zero(T)
oneMKL.sparse_gemv!(A.transa, alpha, A.matrix, x, beta, y)
end

function LinearAlgebra.mul!(Y::oneMatrix{T}, A::INTEL_KrylovOperator{T}, X::oneMatrix{T}) where T <: BlasFloat
mY, nY = size(Y)
mX, nX = size(X)
(mY != A.m) && throw(DimensionMismatch("mY != A.m"))
(mX != A.n) && throw(DimensionMismatch("mX != A.n"))
(nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
alpha = one(T)
beta = zero(T)
oneMKL.sparse_gemm!(A.transa, 'N', alpha, A.matrix, X, beta, Y)
end

mutable struct INTEL_TriangularOperator{T} <: AbstractTriangularOperator{T}
type::Type{T}
m::Int
n::Int
nrhs::Int
uplo::Char
diag::Char
transa::Char
matrix::oneSparseMatrixCSR{T}
end

eltype(A::INTEL_TriangularOperator{T}) where T = T
size(A::INTEL_TriangularOperator) = (A.m, A.n)

for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),)
@eval begin
function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType
m,n = size(A)
if nrhs == 1
oneMKL.sparse_optimize_trsv!(uplo, transa, diag, A)
else
oneMKL.sparse_optimize_trsm!(uplo, transa, diag, nrhs, A)
end
return INTEL_TriangularOperator{T}(T, m, n, nrhs, uplo, diag, transa, A)
end

function KP.update!(A::INTEL_TriangularOperator{T}, B::$SparseMatrixType) where T <: $BlasType
return error("The update of an INTEL_TriangularOperator is not supported.")
end
end
end

function LinearAlgebra.ldiv!(y::oneVector{T}, A::INTEL_TriangularOperator{T}, x::oneVector{T}) where T <: BlasFloat
(length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
(length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
(A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, one(T), A.matrix, x, y)
end

function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X::oneMatrix{T}) where T <: BlasFloat
mY, nY = size(Y)
mX, nX = size(X)
(mY != A.m) && throw(DimensionMismatch("mY != A.m"))
(mX != A.n) && throw(DimensionMismatch("mX != A.n"))
(nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
oneMKL.sparse_trsm!(A.uplo, A.transa, 'N', A.diag, one(T), A.matrix, X, Y)
end
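
A short sketch of how these operators are driven, assuming `A_gpu::oneSparseMatrixCSR{Float64}` is a square matrix already on the device (illustrative setup, not code from this PR):

using LinearAlgebra, oneAPI, KrylovPreconditioners

n = size(A_gpu, 1)
opA = KrylovOperator(A_gpu; nrhs=1, transa='N')   # runs sparse_optimize_gemv! once
x = oneVector(rand(Float64, n))
y = oneVector{Float64}(undef, n)
mul!(y, opA, x)   # y = A * x through oneMKL.sparse_gemv!

opL = TriangularOperator(A_gpu, 'L', 'N')   # lower triangle, non-unit diagonal
ldiv!(y, opL, x)  # y = L \ x through oneMKL.sparse_trsv!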
1 change: 1 addition & 0 deletions test/Project.toml
@@ -6,3 +6,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+ oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
96 changes: 52 additions & 44 deletions test/gpu/gpu.jl
@@ -90,17 +90,19 @@ function test_operator(FC, V, DM, SM)
mul!(y_gpu, opA_gpu, x_gpu)
@test collect(y_gpu) ≈ y_cpu
end
- for j = 1:5
-     y_cpu = rand(FC, m)
-     x_cpu = rand(FC, n)
-     A_cpu2 = A_cpu + j*I
-     mul!(y_cpu, A_cpu2, x_cpu)
-     y_gpu = V(y_cpu)
-     x_gpu = V(x_cpu)
-     A_gpu2 = SM(A_cpu2)
-     update!(opA_gpu, A_gpu2)
-     mul!(y_gpu, opA_gpu, x_gpu)
-     @test collect(y_gpu) ≈ y_cpu
+ if V.body.name.name != :oneArray
+     for j = 1:5
+         y_cpu = rand(FC, m)
+         x_cpu = rand(FC, n)
+         A_cpu2 = A_cpu + j*I
+         mul!(y_cpu, A_cpu2, x_cpu)
+         y_gpu = V(y_cpu)
+         x_gpu = V(x_cpu)
+         A_gpu2 = SM(A_cpu2)
+         update!(opA_gpu, A_gpu2)
+         mul!(y_gpu, opA_gpu, x_gpu)
+         @test collect(y_gpu) ≈ y_cpu
+     end
end

nrhs = 3
@@ -114,17 +116,19 @@ function test_operator(FC, V, DM, SM)
mul!(Y_gpu, opA_gpu, X_gpu)
@test collect(Y_gpu) ≈ Y_cpu
end
- for j = 1:5
-     Y_cpu = rand(FC, m, nrhs)
-     X_cpu = rand(FC, n, nrhs)
-     A_cpu2 = A_cpu + j*I
-     mul!(Y_cpu, A_cpu2, X_cpu)
-     Y_gpu = DM(Y_cpu)
-     X_gpu = DM(X_cpu)
-     A_gpu2 = SM(A_cpu2)
-     update!(opA_gpu, A_gpu2)
-     mul!(Y_gpu, opA_gpu, X_gpu)
-     @test collect(Y_gpu) ≈ Y_cpu
+ if V.body.name.name != :oneArray
+     for j = 1:5
+         Y_cpu = rand(FC, m, nrhs)
+         X_cpu = rand(FC, n, nrhs)
+         A_cpu2 = A_cpu + j*I
+         mul!(Y_cpu, A_cpu2, X_cpu)
+         Y_gpu = DM(Y_cpu)
+         X_gpu = DM(X_cpu)
+         A_gpu2 = SM(A_cpu2)
+         update!(opA_gpu, A_gpu2)
+         mul!(Y_gpu, opA_gpu, X_gpu)
+         @test collect(Y_gpu) ≈ Y_cpu
+     end
end
end

@@ -152,17 +156,19 @@ function test_triangular(FC, V, DM, SM)
ldiv!(y_gpu, opA_gpu, x_gpu)
@test collect(y_gpu) ≈ y_cpu
end
- for j = 1:5
-     y_cpu = rand(FC, n)
-     x_cpu = rand(FC, n)
-     A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
-     ldiv!(y_cpu, triangle(A_cpu2), x_cpu)
-     y_gpu = V(y_cpu)
-     x_gpu = V(x_cpu)
-     A_gpu2 = SM(A_cpu2)
-     update!(opA_gpu, A_gpu2)
-     ldiv!(y_gpu, opA_gpu, x_gpu)
-     @test collect(y_gpu) ≈ y_cpu
+ if V.body.name.name != :oneArray
+     for j = 1:5
+         y_cpu = rand(FC, n)
+         x_cpu = rand(FC, n)
+         A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
+         ldiv!(y_cpu, triangle(A_cpu2), x_cpu)
+         y_gpu = V(y_cpu)
+         x_gpu = V(x_cpu)
+         A_gpu2 = SM(A_cpu2)
+         update!(opA_gpu, A_gpu2)
+         ldiv!(y_gpu, opA_gpu, x_gpu)
+         @test collect(y_gpu) ≈ y_cpu
+     end
end

nrhs = 3
@@ -176,17 +182,19 @@ function test_triangular(FC, V, DM, SM)
ldiv!(Y_gpu, opA_gpu, X_gpu)
@test collect(Y_gpu) ≈ Y_cpu
end
- for j = 1:5
-     Y_cpu = rand(FC, n, nrhs)
-     X_cpu = rand(FC, n, nrhs)
-     A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
-     ldiv!(Y_cpu, triangle(A_cpu2), X_cpu)
-     Y_gpu = DM(Y_cpu)
-     X_gpu = DM(X_cpu)
-     A_gpu2 = SM(A_cpu2)
-     update!(opA_gpu, A_gpu2)
-     ldiv!(Y_gpu, opA_gpu, X_gpu)
-     @test collect(Y_gpu) ≈ Y_cpu
+ if V.body.name.name != :oneArray
+     for j = 1:5
+         Y_cpu = rand(FC, n, nrhs)
+         X_cpu = rand(FC, n, nrhs)
+         A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
+         ldiv!(Y_cpu, triangle(A_cpu2), X_cpu)
+         Y_gpu = DM(Y_cpu)
+         X_gpu = DM(X_cpu)
+         A_gpu2 = SM(A_cpu2)
+         update!(opA_gpu, A_gpu2)
+         ldiv!(Y_gpu, opA_gpu, X_gpu)
+         @test collect(Y_gpu) ≈ Y_cpu
+     end
end
end
end
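
The new `V.body.name.name != :oneArray` guards reach into the `UnionAll` wrapper of the array type to skip the `update!` round-trips on Intel GPUs, where `update!` on the new INTEL operators deliberately throws. A sketch of an equivalent, more readable predicate — not part of this PR:

# nameof unwraps the UnionAll, so nameof(oneVector) == :oneArray
is_oneapi(V) = nameof(V) == :oneArray

if !is_oneapi(V)
    # run the update!-based tests ...
end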