From 283923d9a1c3d4934d5ac571970bb51f33d8fb8d Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 1 Apr 2024 16:36:55 -0400 Subject: [PATCH 1/9] Add an extension for oneAPI.jl --- .buildkite/pipeline.yml | 3 +- Project.toml | 2 ++ ext/AMDGPU/blockjacobi.jl | 8 ++--- ext/CUDA/blockjacobi.jl | 8 ++--- ext/KrylovPreconditionersAMDGPUExt.jl | 2 +- ext/KrylovPreconditionersCUDAExt.jl | 2 +- ext/KrylovPreconditionersoneAPIExt.jl | 17 ++++++++++ ext/oneAPI/blockjacobi.jl | 48 +++++++++++++++++++++++++++ test/gpu/intel.jl | 6 +++- 9 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 ext/KrylovPreconditionersoneAPIExt.jl create mode 100644 ext/oneAPI/blockjacobi.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d91d5c1..d10d0c1 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -52,7 +52,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - Pkg.add("oneAPI") + # Pkg.add("oneAPI") + Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/intel.jl")' diff --git a/Project.toml b/Project.toml index c46016e..eef01fa 100644 --- a/Project.toml +++ b/Project.toml @@ -14,10 +14,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [extensions] KrylovPreconditionersAMDGPUExt = "AMDGPU" KrylovPreconditionersCUDAExt = "CUDA" +KrylovPreconditionersoneAPIExt = "oneAPI" [compat] AMDGPU = "0.8.3, 0.9" diff --git a/ext/AMDGPU/blockjacobi.jl b/ext/AMDGPU/blockjacobi.jl index dd02070..e5bd722 100644 --- a/ext/AMDGPU/blockjacobi.jl +++ b/ext/AMDGPU/blockjacobi.jl @@ -1,9 +1,9 @@ KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) 
function KP.create_blocklist(cublocks::ROCArray, npart) - blocklist = Array{ROCArray{Float64,2}}(undef, npart) + blocklist = Array{ROCMatrix{Float64}}(undef, npart) for b in 1:npart - blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) + blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks)...) end return blocklist end @@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend) for b in 1:nblocks p.blocklist[b] .= p.cublocks[:,:,b] end - AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(p.blocklist) - AMDGPU.@sync pivot, info, p.blocklist = AMDGPU.rocSOLVER.getri_batched!(p.blocklist, pivot) + AMDGPU.@sync pivot, info = rocSOLVER.getrf_batched!(p.blocklist) + AMDGPU.@sync pivot, info, p.blocklist = rocSOLVER.getri_batched!(p.blocklist, pivot) for b in 1:nblocks p.cublocks[:,:,b] .= p.blocklist[b] end diff --git a/ext/CUDA/blockjacobi.jl b/ext/CUDA/blockjacobi.jl index 1e28d8a..f8344c6 100644 --- a/ext/CUDA/blockjacobi.jl +++ b/ext/CUDA/blockjacobi.jl @@ -1,9 +1,9 @@ KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) function KP.create_blocklist(cublocks::CuArray, npart) - blocklist = Array{CuArray{Float64,2}}(undef, npart) + blocklist = Array{CuMatrix{Float64}}(undef, npart) for b in 1:npart - blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) + blocklist[b] = CuMatrix{Float64}(undef, size(cublocks)...) 
end return blocklist end @@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend) for b in 1:nblocks p.blocklist[b] .= p.cublocks[:,:,b] end - CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(p.blocklist, true) - CUDA.@sync pivot, info, p.blocklist = CUDA.CUBLAS.getri_batched(p.blocklist, pivot) + CUDA.@sync pivot, info = CUBLAS.getrf_batched!(p.blocklist, true) + CUDA.@sync pivot, info, p.blocklist = CUBLAS.getri_batched(p.blocklist, pivot) for b in 1:nblocks p.cublocks[:,:,b] .= p.blocklist[b] end diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl index 1c9cc18..9f9b0e0 100644 --- a/ext/KrylovPreconditionersAMDGPUExt.jl +++ b/ext/KrylovPreconditionersAMDGPUExt.jl @@ -2,7 +2,7 @@ module KrylovPreconditionersAMDGPUExt using LinearAlgebra using SparseArrays using AMDGPU -using AMDGPU.rocSPARSE +using AMDGPU.rocSPARSE, AMDGPU.rocSOLVER using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! import Base: size, eltype, unsafe_convert diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl index b074141..47a6899 100644 --- a/ext/KrylovPreconditionersCUDAExt.jl +++ b/ext/KrylovPreconditionersCUDAExt.jl @@ -2,7 +2,7 @@ module KrylovPreconditionersCUDAExt using LinearAlgebra using SparseArrays using CUDA -using CUDA.CUSPARSE +using CUDA.CUSPARSE, CUDA.CUBLAS using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! import Base: size, eltype, unsafe_convert diff --git a/ext/KrylovPreconditionersoneAPIExt.jl b/ext/KrylovPreconditionersoneAPIExt.jl new file mode 100644 index 0000000..32c2e67 --- /dev/null +++ b/ext/KrylovPreconditionersoneAPIExt.jl @@ -0,0 +1,17 @@ +module KrylovPreconditionersoneAPIExt +using LinearAlgebra +using SparseArrays +using oneAPI +using oneAPI.oneMKL +using LinearAlgebra: checksquare, BlasReal, BlasFloat +import LinearAlgebra: ldiv!, mul! 
+import Base: size, eltype, unsafe_convert + +using KrylovPreconditioners +const KP = KrylovPreconditioners +using KernelAbstractions +const KA = KernelAbstractions + +include("CUDA/blockjacobi.jl") + +end diff --git a/ext/oneAPI/blockjacobi.jl b/ext/oneAPI/blockjacobi.jl new file mode 100644 index 0000000..d5c768c --- /dev/null +++ b/ext/oneAPI/blockjacobi.jl @@ -0,0 +1,48 @@ +KP.BlockJacobiPreconditioner(J::oneMKL.oneSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...) + +function KP.create_blocklist(cublocks::oneArray, npart) + blocklist = Array{oneMatrix{Float64}}(undef, npart) + for b in 1:npart + blocklist[b] = oneMatrix{Float64}(undef, size(cublocks)...) + end + return blocklist +end + +function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::oneAPIBackend) + nblocks = p.nblocks + blocksize = p.blocksize + fillblock_gpu_kernel! = KP._fillblock_gpu!(device) + # Fill Block Jacobi" begin + fillblock_gpu_kernel!( + p.cublocks, size(p.id,1), + p.cupartitions, p.cumap, + j_rowptr, j_colval, j_nzval, + p.cupart, p.culpartitions, p.id, + ndrange=(nblocks, blocksize), + ) + KA.synchronize(device) + # Invert blocks begin + for b in 1:nblocks + p.blocklist[b] .= p.cublocks[:,:,b] + end + oneAPI.@sync pivot = oneMKL.getrf_batched!(p.blocklist) + oneAPI.@sync pivot, p.blocklist = oneMKL.getri_batched!(p.blocklist, pivot) + for b in 1:nblocks + p.cublocks[:,:,b] .= p.blocklist[b] + end + return +end + +""" + function update!(p, J::oneSparseMatrixCSR) + +Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for oneAPI + +1) The dense blocks `cuJs` are filled from the sparse Jacobian `J` +2) Do a batch inversion of the dense blocks using oneMKL +3) Extract the preconditioner matrix `p.P` from the dense blocks `cuJs` + +""" +function KP.update!(p::BlockJacobiPreconditioner, J::oneMKL.oneSparseMatrixCSR) + _update_gpu(p, J.rowPtr, J.colVal, J.nzVal, p.device) +end diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl 
index 0ab2c6c..1f09257 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -1,4 +1,4 @@ -using oneAPI +using oneAPI, oneAPI.oneMKL include("gpu.jl") @@ -6,4 +6,8 @@ include("gpu.jl") @test oneAPI.functional() oneAPI.allowscalar(false) + + @testset "Block Jacobi preconditioner" begin + test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR) + end end From d8db40a3d68e93ab5620dc290452220c0a8e249a Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 1 Apr 2024 16:54:09 -0400 Subject: [PATCH 2/9] Fix ext files --- ext/AMDGPU/blockjacobi.jl | 2 +- ext/CUDA/blockjacobi.jl | 2 +- ext/oneAPI/blockjacobi.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/AMDGPU/blockjacobi.jl b/ext/AMDGPU/blockjacobi.jl index e5bd722..1dab800 100644 --- a/ext/AMDGPU/blockjacobi.jl +++ b/ext/AMDGPU/blockjacobi.jl @@ -3,7 +3,7 @@ KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = Bloc function KP.create_blocklist(cublocks::ROCArray, npart) blocklist = Array{ROCMatrix{Float64}}(undef, npart) for b in 1:npart - blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks)...) + blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) end return blocklist end diff --git a/ext/CUDA/blockjacobi.jl b/ext/CUDA/blockjacobi.jl index f8344c6..5c33865 100644 --- a/ext/CUDA/blockjacobi.jl +++ b/ext/CUDA/blockjacobi.jl @@ -3,7 +3,7 @@ KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJ function KP.create_blocklist(cublocks::CuArray, npart) blocklist = Array{CuMatrix{Float64}}(undef, npart) for b in 1:npart - blocklist[b] = CuMatrix{Float64}(undef, size(cublocks)...) 
+ blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) end return blocklist end diff --git a/ext/oneAPI/blockjacobi.jl b/ext/oneAPI/blockjacobi.jl index d5c768c..0bb7cea 100644 --- a/ext/oneAPI/blockjacobi.jl +++ b/ext/oneAPI/blockjacobi.jl @@ -3,7 +3,7 @@ KP.BlockJacobiPreconditioner(J::oneMKL.oneSparseMatrixCSR; options...) = BlockJa function KP.create_blocklist(cublocks::oneArray, npart) blocklist = Array{oneMatrix{Float64}}(undef, npart) for b in 1:npart - blocklist[b] = oneMatrix{Float64}(undef, size(cublocks)...) + blocklist[b] = oneMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2)) end return blocklist end From 0462ac0f0993360ae68d97729bad1f800a352697 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 1 Apr 2024 17:10:02 -0400 Subject: [PATCH 3/9] Fix a typo in KrylovPreconditionersoneAPIExt.jl --- ext/KrylovPreconditionersoneAPIExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/KrylovPreconditionersoneAPIExt.jl b/ext/KrylovPreconditionersoneAPIExt.jl index 32c2e67..4435ba0 100644 --- a/ext/KrylovPreconditionersoneAPIExt.jl +++ b/ext/KrylovPreconditionersoneAPIExt.jl @@ -12,6 +12,6 @@ const KP = KrylovPreconditioners using KernelAbstractions const KA = KernelAbstractions -include("CUDA/blockjacobi.jl") +include("oneAPI/blockjacobi.jl") end From abe7cccc7734b9a82b7becc914c66c346c015399 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Mon, 1 Apr 2024 21:16:35 -0400 Subject: [PATCH 4/9] Fix the tests for oneAPI.jl --- Project.toml | 1 + ext/oneAPI/blockjacobi.jl | 2 +- test/gpu/intel.jl | 2 ++ test/runtests.jl | 14 +++++++------- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index eef01fa..27be2e3 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ KrylovPreconditionersoneAPIExt = "oneAPI" AMDGPU = "0.8.3, 0.9" Adapt = "3, 4" CUDA = "5.3.0" +oneAPI = "1.5.0" KernelAbstractions = "0.9" Krylov = "0.9.4" LightGraphs = "1" diff --git 
a/ext/oneAPI/blockjacobi.jl b/ext/oneAPI/blockjacobi.jl index 0bb7cea..8f195a4 100644 --- a/ext/oneAPI/blockjacobi.jl +++ b/ext/oneAPI/blockjacobi.jl @@ -25,7 +25,7 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::oneAPIBackend) for b in 1:nblocks p.blocklist[b] .= p.cublocks[:,:,b] end - oneAPI.@sync pivot = oneMKL.getrf_batched!(p.blocklist) + oneAPI.@sync pivot, p.blocklist = oneMKL.getrf_batched!(p.blocklist) oneAPI.@sync pivot, p.blocklist = oneMKL.getri_batched!(p.blocklist, pivot) for b in 1:nblocks p.cublocks[:,:,b] .= p.blocklist[b] diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 1f09257..199bd08 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -1,5 +1,7 @@ using oneAPI, oneAPI.oneMKL +_get_type(J::oneSparseMatrixCSR) = oneArray{Float64, 1, oneAPI.oneL0.DeviceBuffer} +_is_csr(J::oneSparseMatrixCSR) = true include("gpu.jl") @testset "Intel -- oneAPI.jl" begin diff --git a/test/runtests.jl b/test/runtests.jl index 778a6fb..9a18c02 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using AMDGPU using CUDA -# using oneAPI +using oneAPI using Test @testset "KrylovPreconditioners" begin @@ -18,10 +18,10 @@ if CUDA.functional() end end -# if oneAPI.functional() -# @info "Testing oneAPI backend" -# @testset "Testing oneAPI backend" begin -# include("gpu/intel.jl") -# end -# end +if oneAPI.functional() + @info "Testing oneAPI backend" + @testset "Testing oneAPI backend" begin + include("gpu/intel.jl") + end +end end From 0090e7947960e4f2b980f9b050786680113cf80e Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Thu, 4 Apr 2024 01:16:50 -0400 Subject: [PATCH 5/9] Add KrylovOperator and TriangularOperator for oneAPI.jl --- ext/KrylovPreconditionersoneAPIExt.jl | 2 + ext/oneAPI/operators.jl | 95 +++++++++++++++++++++ test/gpu/gpu.jl | 118 ++++++++++++++------------ test/gpu/intel.jl | 12 +++ 4 files changed, 172 insertions(+), 55 deletions(-) create mode 100644 ext/oneAPI/operators.jl diff --git 
a/ext/KrylovPreconditionersoneAPIExt.jl b/ext/KrylovPreconditionersoneAPIExt.jl index 4435ba0..36ff42a 100644 --- a/ext/KrylovPreconditionersoneAPIExt.jl +++ b/ext/KrylovPreconditionersoneAPIExt.jl @@ -2,6 +2,7 @@ module KrylovPreconditionersoneAPIExt using LinearAlgebra using SparseArrays using oneAPI +using oneAPI: global_queue, sycl_queue, context, device using oneAPI.oneMKL using LinearAlgebra: checksquare, BlasReal, BlasFloat import LinearAlgebra: ldiv!, mul! @@ -13,5 +14,6 @@ using KernelAbstractions const KA = KernelAbstractions include("oneAPI/blockjacobi.jl") +include("oneAPI/operators.jl") end diff --git a/ext/oneAPI/operators.jl b/ext/oneAPI/operators.jl new file mode 100644 index 0000000..b4d8603 --- /dev/null +++ b/ext/oneAPI/operators.jl @@ -0,0 +1,95 @@ +mutable struct INTEL_KrylovOperator{T} <: AbstractKrylovOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + transa::Char + matrix::oneSparseMatrixCSR{T} +end + +eltype(A::INTEL_KrylovOperator{T}) where T = T +size(A::INTEL_KrylovOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) + @eval begin + function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + oneMKL.sparse_optimize_gemv!(transa, A) + end + # sparse_optimize_gemm! 
is only available with oneAPI 2024.1.0 + return INTEL_KrylovOperator{T}(T, m, n, nrhs, transa, A) + end + + function KP.update!(A::INTEL_KrylovOperator{T}, B::$SparseMatrixType) where T <: $BlasFloat + error("The update of an INTEL_KrylovOperator is not supported.") + end + end +end + +function LinearAlgebra.mul!(y::oneVector{T}, A::INTEL_KrylovOperator{T}, x::oneVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + alpha = one(T) + beta = zero(T) + oneMKL.sparse_gemv!(A.transa, alpha, A.matrix, x, beta, y) +end + +function LinearAlgebra.mul!(Y::oneMatrix{T}, A::INTEL_KrylovOperator{T}, X::oneMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + alpha = one(T) + beta = zero(T) + oneMKL.sparse_gemm!(A.transa, 'N', alpha, A.matrix, X, beta, Y) +end + +mutable struct INTEL_TriangularOperator{T} <: AbstractTriangularOperator{T} + type::Type{T} + m::Int + n::Int + nrhs::Int + uplo::Char + diag::Char + transa::Char + matrix::oneSparseMatrixCSR{T} +end + +eltype(A::INTEL_TriangularOperator{T}) where T = T +size(A::INTEL_TriangularOperator) = (A.m, A.n) + +for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) + @eval begin + function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType + m,n = size(A) + if nrhs == 1 + oneMKL.sparse_optimize_trsv!(uplo, transa, diag, A) + end + # sparse_optimize_trsm! 
is only available with oneAPI 2024.1.0 + return INTEL_TriangularOperator{T}(T, m, n, nrhs, uplo, diag, transa, A) + end + + function KP.update!(A::INTEL_TriangularOperator{T}, B::$SparseMatrixType) where T <: $BlasFloat + return error("The update of an INTEL_TriangularOperator is not supported.") + end + end +end + +function LinearAlgebra.ldiv!(y::oneVector{T}, A::INTEL_TriangularOperator{T}, x::oneVector{T}) where T <: BlasFloat + (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) + (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) + (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) + oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, A.matrix, x, y) +end + +function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X::oneMatrix{T}) where T <: BlasFloat + mY, nY = size(Y) + mX, nX = size(X) + (mY != A.m) && throw(DimensionMismatch("mY != A.m")) + (mX != A.n) && throw(DimensionMismatch("mX != A.n")) + (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) + error("The routine sparse_trsm! 
is only available with oneAPI 2024.1.0") +end diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 9f98d6d..87bbd45 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -90,41 +90,45 @@ function test_operator(FC, V, DM, SM) mul!(y_gpu, opA_gpu, x_gpu) @test collect(y_gpu) ≈ y_cpu end - for j = 1:5 - y_cpu = rand(FC, m) - x_cpu = rand(FC, n) - A_cpu2 = A_cpu + j*I - mul!(y_cpu, A_cpu2, x_cpu) - y_gpu = V(y_cpu) - x_gpu = V(x_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - mul!(y_gpu, opA_gpu, x_gpu) - @test collect(y_gpu) ≈ y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + y_cpu = rand(FC, m) + x_cpu = rand(FC, n) + A_cpu2 = A_cpu + j*I + mul!(y_cpu, A_cpu2, x_cpu) + y_gpu = V(y_cpu) + x_gpu = V(x_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + mul!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu + end end - nrhs = 3 - opA_gpu = KrylovOperator(A_gpu; nrhs) - for i = 1:5 - Y_cpu = rand(FC, m, nrhs) - X_cpu = rand(FC, n, nrhs) - mul!(Y_cpu, A_cpu, X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - mul!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu - end - for j = 1:5 - Y_cpu = rand(FC, m, nrhs) - X_cpu = rand(FC, n, nrhs) - A_cpu2 = A_cpu + j*I - mul!(Y_cpu, A_cpu2, X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - mul!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu + if V.body.name.name != :oneArray + nrhs = 3 + opA_gpu = KrylovOperator(A_gpu; nrhs) + for i = 1:5 + Y_cpu = rand(FC, m, nrhs) + X_cpu = rand(FC, n, nrhs) + mul!(Y_cpu, A_cpu, X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end + for j = 1:5 + Y_cpu = rand(FC, m, nrhs) + X_cpu = rand(FC, n, nrhs) + A_cpu2 = A_cpu + j*I + mul!(Y_cpu, A_cpu2, X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end end end @@ -152,17 +156,19 @@ function 
test_triangular(FC, V, DM, SM) ldiv!(y_gpu, opA_gpu, x_gpu) @test collect(y_gpu) ≈ y_cpu end - for j = 1:5 - y_cpu = rand(FC, n) - x_cpu = rand(FC, n) - A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) - ldiv!(y_cpu, triangle(A_cpu2), x_cpu) - y_gpu = V(y_cpu) - x_gpu = V(x_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - ldiv!(y_gpu, opA_gpu, x_gpu) - @test collect(y_gpu) ≈ y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + y_cpu = rand(FC, n) + x_cpu = rand(FC, n) + A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) + ldiv!(y_cpu, triangle(A_cpu2), x_cpu) + y_gpu = V(y_cpu) + x_gpu = V(x_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + ldiv!(y_gpu, opA_gpu, x_gpu) + @test collect(y_gpu) ≈ y_cpu + end end nrhs = 3 @@ -176,17 +182,19 @@ function test_triangular(FC, V, DM, SM) ldiv!(Y_gpu, opA_gpu, X_gpu) @test collect(Y_gpu) ≈ Y_cpu end - for j = 1:5 - Y_cpu = rand(FC, n, nrhs) - X_cpu = rand(FC, n, nrhs) - A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) - ldiv!(Y_cpu, triangle(A_cpu2), X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - A_gpu2 = SM(A_cpu2) - update!(opA_gpu, A_gpu2) - ldiv!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu + if V.body.name.name != :oneArray + for j = 1:5 + Y_cpu = rand(FC, n, nrhs) + X_cpu = rand(FC, n, nrhs) + A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1) + ldiv!(Y_cpu, triangle(A_cpu2), X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + A_gpu2 = SM(A_cpu2) + update!(opA_gpu, A_gpu2) + ldiv!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end end end end diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 199bd08..966780c 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -9,6 +9,18 @@ include("gpu.jl") @test oneAPI.functional() oneAPI.allowscalar(false) + @testset "KrylovOperator" begin + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_operator(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) + end + end + + @testset "TriangularOperator" begin 
+ @testset "oneSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + test_triangular(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) + end + end + @testset "Block Jacobi preconditioner" begin test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR) end From 384c9d5499b44abdbd3086cac2b5465c6c861c24 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Wed, 29 May 2024 22:21:14 -0400 Subject: [PATCH 6/9] Update oneAPI extension --- .buildkite/pipeline.yml | 8 ++++---- ext/oneAPI/operators.jl | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d10d0c1..878e0a3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -34,8 +34,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - # Pkg.add("AMDGPU") - Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") + Pkg.add("AMDGPU") + # Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/amd.jl")' @@ -52,8 +52,8 @@ steps: julia --color=yes --project=test/gpu -e ' using Pkg Pkg.develop(path=".") - # Pkg.add("oneAPI") - Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master") + Pkg.add("oneAPI") + # Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master") Pkg.add("Krylov") Pkg.instantiate() include("test/gpu/intel.jl")' diff --git a/ext/oneAPI/operators.jl b/ext/oneAPI/operators.jl index b4d8603..f69bab8 100644 --- a/ext/oneAPI/operators.jl +++ b/ext/oneAPI/operators.jl @@ -17,7 +17,7 @@ for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) if nrhs == 1 oneMKL.sparse_optimize_gemv!(transa, A) end - # sparse_optimize_gemm! is only available with oneAPI 2024.1.0 + # sparse_optimize_gemm! 
is only available with oneAPI > v2024.1.0 return INTEL_KrylovOperator{T}(T, m, n, nrhs, transa, A) end @@ -67,8 +67,9 @@ for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),) m,n = size(A) if nrhs == 1 oneMKL.sparse_optimize_trsv!(uplo, transa, diag, A) + else + oneMKL.sparse_optimize_trsm!(uplo, transa, diag, nrhs, A) end - # sparse_optimize_trsm! is only available with oneAPI 2024.1.0 return INTEL_TriangularOperator{T}(T, m, n, nrhs, uplo, diag, transa, A) end @@ -82,7 +83,7 @@ function LinearAlgebra.ldiv!(y::oneVector{T}, A::INTEL_TriangularOperator{T}, x: (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m")) (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n")) (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1")) - oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, A.matrix, x, y) + oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, one(T), A.matrix, x, y) end function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X::oneMatrix{T}) where T <: BlasFloat @@ -91,5 +92,5 @@ function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X: (mY != A.m) && throw(DimensionMismatch("mY != A.m")) (mX != A.n) && throw(DimensionMismatch("mX != A.n")) (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs")) - error("The routine sparse_trsm! 
is only available with oneAPI 2024.1.0") + oneMKL.sparse_trsm!(A.uplo, A.transa, 'N', A.diag, one(T), A.matrix, X, Y) end From a6c678578e816ee6d157b598e0fb1bfb4c53a357 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Wed, 29 May 2024 22:27:29 -0400 Subject: [PATCH 7/9] Update oneAPI extension --- test/gpu/gpu.jl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl index 87bbd45..20b5911 100644 --- a/test/gpu/gpu.jl +++ b/test/gpu/gpu.jl @@ -105,18 +105,18 @@ function test_operator(FC, V, DM, SM) end end + nrhs = 3 + opA_gpu = KrylovOperator(A_gpu; nrhs) + for i = 1:5 + Y_cpu = rand(FC, m, nrhs) + X_cpu = rand(FC, n, nrhs) + mul!(Y_cpu, A_cpu, X_cpu) + Y_gpu = DM(Y_cpu) + X_gpu = DM(X_cpu) + mul!(Y_gpu, opA_gpu, X_gpu) + @test collect(Y_gpu) ≈ Y_cpu + end if V.body.name.name != :oneArray - nrhs = 3 - opA_gpu = KrylovOperator(A_gpu; nrhs) - for i = 1:5 - Y_cpu = rand(FC, m, nrhs) - X_cpu = rand(FC, n, nrhs) - mul!(Y_cpu, A_cpu, X_cpu) - Y_gpu = DM(Y_cpu) - X_gpu = DM(X_cpu) - mul!(Y_gpu, opA_gpu, X_gpu) - @test collect(Y_gpu) ≈ Y_cpu - end for j = 1:5 Y_cpu = rand(FC, m, nrhs) X_cpu = rand(FC, n, nrhs) From 48bcd8c7eb0edf724baf2c6dd26fb02d13b5cf37 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Wed, 29 May 2024 22:30:03 -0400 Subject: [PATCH 8/9] Update oneAPI extension --- Project.toml | 2 +- test/Project.toml | 1 + test/gpu/intel.jl | 10 +++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index 27be2e3..065c820 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ KrylovPreconditionersCUDAExt = "CUDA" KrylovPreconditionersoneAPIExt = "oneAPI" [compat] -AMDGPU = "0.8.3, 0.9" +AMDGPU = "0.9" Adapt = "3, 4" CUDA = "5.3.0" oneAPI = "1.5.0" diff --git a/test/Project.toml b/test/Project.toml index f16ff85..b04c043 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,3 +6,4 @@ LinearAlgebra = 
"37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 966780c..67aff43 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -10,18 +10,18 @@ include("gpu.jl") oneAPI.allowscalar(false) @testset "KrylovOperator" begin - @testset "oneSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32, ComplexF32) test_operator(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) end end @testset "TriangularOperator" begin - @testset "oneSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64) + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32, ComplexF32) test_triangular(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) end end - @testset "Block Jacobi preconditioner" begin - test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR) - end + # @testset "Block Jacobi preconditioner" begin + # test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR) + # end end From 73fdbc94063fce6b2716ce138c033fd33c3d8321 Mon Sep 17 00:00:00 2001 From: Alexis Montoison Date: Wed, 29 May 2024 22:33:32 -0400 Subject: [PATCH 9/9] Update oneAPI extension --- test/gpu/intel.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl index 67aff43..7b06824 100644 --- a/test/gpu/intel.jl +++ b/test/gpu/intel.jl @@ -10,13 +10,13 @@ include("gpu.jl") oneAPI.allowscalar(false) @testset "KrylovOperator" begin - @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32, ComplexF32) + @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32) test_operator(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) end end @testset "TriangularOperator" begin - @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32, ComplexF32) + 
@testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32) test_triangular(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR) end end