Skip to content

Commit

Permalink
Merge branch 'main' into am/KrylovOperator
Browse files Browse the repository at this point in the history
  • Loading branch information
amontoison authored Nov 29, 2023
2 parents 61e2cd0 + 9b91d5d commit b1a4df6
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 18 deletions.
17 changes: 12 additions & 5 deletions ext/AMDGPU/blockjacobi.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)

function KP.create_blocklist(cublocks::ROCArray, npart)
blocklist = Array{CuArray{Float64,2}}(undef, npart)
for b in 1:npart
blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
end
return blocklist
end

function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend)
nblocks = p.nblocks
fillblock_gpu_kernel! = KP._fillblock_gpu!(device)
Expand All @@ -13,14 +21,13 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend)
)
KA.synchronize(device)
# Invert blocks begin
blocklist = Array{ROCArray{Float64,2}}(undef, nblocks)
for b in 1:nblocks
blocklist[b] = p.cublocks[:,:,b]
p.blocklist[b] .= p.cublocks[:,:,b]
end
AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(blocklist)
AMDGPU.@sync pivot, info, blocklist = AMDGPU.rocSOLVER.getri_batched!(blocklist, pivot)
AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(p.blocklist)
AMDGPU.@sync pivot, info, p.blocklist = AMDGPU.rocSOLVER.getri_batched!(p.blocklist, pivot)
for b in 1:nblocks
p.cublocks[:,:,b] .= blocklist[b]
p.cublocks[:,:,b] .= p.blocklist[b]
end
return
end
Expand Down
17 changes: 12 additions & 5 deletions ext/CUDA/blockjacobi.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)

function KP.create_blocklist(cublocks::CuArray, npart)
blocklist = Array{CuArray{Float64,2}}(undef, npart)
for b in 1:npart
blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
end
return blocklist
end

function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend)
nblocks = p.nblocks
fillblock_gpu_kernel! = KP._fillblock_gpu!(device)
Expand All @@ -13,14 +21,13 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend)
)
KA.synchronize(device)
# Invert blocks begin
blocklist = Array{CuArray{Float64,2}}(undef, nblocks)
for b in 1:nblocks
blocklist[b] = p.cublocks[:,:,b]
p.blocklist[b] .= p.cublocks[:,:,b]
end
CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(blocklist, true)
CUDA.@sync pivot, info, blocklist = CUDA.CUBLAS.getri_batched(blocklist, pivot)
CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(p.blocklist, true)
CUDA.@sync pivot, info, p.blocklist = CUDA.CUBLAS.getri_batched(p.blocklist, pivot)
for b in 1:nblocks
p.cublocks[:,:,b] .= blocklist[b]
p.cublocks[:,:,b] .= p.blocklist[b]
end
return
end
Expand Down
43 changes: 35 additions & 8 deletions src/blockjacobi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Overlapping-Schwarz preconditioner.
* `part`: Partitioning as output by Metis
* `cupart`: `part` transferred to the GPU
"""
struct BlockJacobiPreconditioner{AT,GAT,VI,GVI,GMT,MI,GMI} <: AbstractKrylovPreconditioner
mutable struct BlockJacobiPreconditioner{AT,GAT,VI,GVI,GMT,MI,GMI} <: AbstractKrylovPreconditioner
nblocks::Int64
blocksize::Int64
partitions::MI
Expand All @@ -58,6 +58,16 @@ struct BlockJacobiPreconditioner{AT,GAT,VI,GVI,GMT,MI,GMI} <: AbstractKrylovPrec
part::VI
cupart::GVI
id::GMT
blocklist::Vector{GMT}
timer_update::Float64
end

function create_blocklist(blocks::Array, npart)
blocklist = Array{Array{Float64,2}}(undef, npart)
for b in 1:npart
blocklist[b] = Matrix{Float64}(undef, size(blocks,1), size(blocks,2))
end
return blocklist
end

function BlockJacobiPreconditioner(J, npart::Int64, device=CPU(), olevel=0) where {}
Expand Down Expand Up @@ -117,14 +127,15 @@ function BlockJacobiPreconditioner(J, npart::Int64, device=CPU(), olevel=0) wher
cublocks = adapt(device, blocks)
cumap = adapt(device, map)
cupart = adapt(device, part)
blocklist = create_blocklist(cublocks, npart)
return BlockJacobiPreconditioner(
npart, blocksize, bpartitions,
cubpartitions, lpartitions,
culpartitions, rest_size,
curest_size, blocks,
cublocks, map,
cumap, part,
cupart, id
cupart, id, blocklist, 0.0
)
end

Expand All @@ -147,7 +158,23 @@ Base.eltype(::BlockJacobiPreconditioner) = Float64
# overall 3x slower than this custom kernel : due to the various sizes
# of the blocks, gemm_strided is performing too many unecessary operations,
# impairing its performance.
@kernel function mblock_kernel!(y, b, p_len, rp_len, part, blocks)
@kernel function mblock_b_kernel!(y, b, p_len, rp_len, part, blocks)
i, j = @index(Global, NTuple)
len = p_len[i]
rlen = rp_len[i]

if j <= rlen
accum = 0.0
idxA = @inbounds part[j, i]
for k=1:len
idxB = @inbounds part[k, i]
@inbounds accum = accum + blocks[j, k, i]*b[idxB]
end
@inbounds y[idxA] = accum
end
end

@kernel function mblock_B_kernel!(y, b, p_len, rp_len, part, blocks)
p = size(b, 2)
i, j = @index(Global, NTuple)
len = p_len[i]
Expand All @@ -161,7 +188,7 @@ Base.eltype(::BlockJacobiPreconditioner) = Float64
idxB = @inbounds part[k, i]
@inbounds accum = accum + blocks[j, k, i]*b[idxB,ℓ]
end
y[idxA,ℓ] = accum
@inbounds y[idxA,ℓ] = accum
end
end
end
Expand Down Expand Up @@ -202,12 +229,13 @@ function LinearAlgebra.mul!(y, C::BlockJacobiPreconditioner, b::AbstractVector{T
fill!(y, zero(T))
max_rlen = maximum(C.rest_size)
ndrange = (C.nblocks, max_rlen)
mblock_kernel!(device)(
C.timer_update += @elapsed begin mblock_b_kernel!(device)(
y, b, C.culpartitions, C.curest_size,
C.cupartitions, C.cublocks,
ndrange=ndrange,
)
KA.synchronize(device)
end
end

function LinearAlgebra.mul!(Y, C::BlockJacobiPreconditioner, B::AbstractMatrix{T}) where T
Expand All @@ -216,12 +244,13 @@ function LinearAlgebra.mul!(Y, C::BlockJacobiPreconditioner, B::AbstractMatrix{T
fill!(Y, zero(T))
max_rlen = maximum(C.rest_size)
ndrange = (C.nblocks, max_rlen)
mblock_kernel!(device)(
C.timer_update += @elapsed begin mblock_B_kernel!(device)(
Y, B, C.culpartitions, C.curest_size,
C.cupartitions, C.cublocks,
ndrange=ndrange,
)
KA.synchronize(device)
end
end

"""
Expand Down Expand Up @@ -315,5 +344,3 @@ function Base.show(precond::BlockJacobiPreconditioner)
" Mbytes = ", (nblock*nblock*npartitions*8.0)/(1024.0*1024.0))
println("Block Jacobi block size: $(precond.nJs)")
end

# NVIDIA

0 comments on commit b1a4df6

Please sign in to comment.