diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index d91d5c1..878e0a3 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -34,8 +34,8 @@ steps:
       julia --color=yes --project=test/gpu -e '
         using Pkg
         Pkg.develop(path=".")
-        # Pkg.add("AMDGPU")
-        Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
+        Pkg.add("AMDGPU")
+        # Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
         Pkg.add("Krylov")
         Pkg.instantiate()
         include("test/gpu/amd.jl")'
@@ -53,6 +53,7 @@ steps:
         using Pkg
         Pkg.develop(path=".")
         Pkg.add("oneAPI")
+        # Pkg.add(url="https://github.com/JuliaGPU/oneAPI.jl", rev="master")
         Pkg.add("Krylov")
         Pkg.instantiate()
         include("test/gpu/intel.jl")'
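
To run the same oneAPI tests locally, here is a sketch that mirrors the Buildkite step above; it assumes a machine with a functional Intel GPU and driver, and is executed from the repository root (so that `Pkg.develop(path=".")` picks up this package):

julia --color=yes --project=test/gpu -e '
  using Pkg
  Pkg.develop(path=".")
  Pkg.add("oneAPI")
  Pkg.add("Krylov")
  Pkg.instantiate()
  include("test/gpu/intel.jl")'
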
diff --git a/Project.toml b/Project.toml
index c46016e..065c820 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,15 +14,18 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [extensions]
 KrylovPreconditionersAMDGPUExt = "AMDGPU"
 KrylovPreconditionersCUDAExt = "CUDA"
+KrylovPreconditionersoneAPIExt = "oneAPI"
 
 [compat]
-AMDGPU = "0.8.3, 0.9"
+AMDGPU = "0.9"
 Adapt = "3, 4"
 CUDA = "5.3.0"
+oneAPI = "1.5.0"
 KernelAbstractions = "0.9"
 Krylov = "0.9.4"
 LightGraphs = "1"
diff --git a/ext/AMDGPU/blockjacobi.jl b/ext/AMDGPU/blockjacobi.jl
index dd02070..1dab800 100644
--- a/ext/AMDGPU/blockjacobi.jl
+++ b/ext/AMDGPU/blockjacobi.jl
@@ -1,7 +1,7 @@
 KP.BlockJacobiPreconditioner(J::rocSPARSE.ROCSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)
 
 function KP.create_blocklist(cublocks::ROCArray, npart)
-    blocklist = Array{ROCArray{Float64,2}}(undef, npart)
+    blocklist = Array{ROCMatrix{Float64}}(undef, npart)
     for b in 1:npart
         blocklist[b] = ROCMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
     end
@@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::ROCBackend)
     for b in 1:nblocks
         p.blocklist[b] .= p.cublocks[:,:,b]
     end
-    AMDGPU.@sync pivot, info = AMDGPU.rocSOLVER.getrf_batched!(p.blocklist)
-    AMDGPU.@sync pivot, info, p.blocklist = AMDGPU.rocSOLVER.getri_batched!(p.blocklist, pivot)
+    AMDGPU.@sync pivot, info = rocSOLVER.getrf_batched!(p.blocklist)
+    AMDGPU.@sync pivot, info, p.blocklist = rocSOLVER.getri_batched!(p.blocklist, pivot)
     for b in 1:nblocks
         p.cublocks[:,:,b] .= p.blocklist[b]
     end
diff --git a/ext/CUDA/blockjacobi.jl b/ext/CUDA/blockjacobi.jl
index 1e28d8a..5c33865 100644
--- a/ext/CUDA/blockjacobi.jl
+++ b/ext/CUDA/blockjacobi.jl
@@ -1,7 +1,7 @@
 KP.BlockJacobiPreconditioner(J::CUSPARSE.CuSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)
 
 function KP.create_blocklist(cublocks::CuArray, npart)
-    blocklist = Array{CuArray{Float64,2}}(undef, npart)
+    blocklist = Array{CuMatrix{Float64}}(undef, npart)
     for b in 1:npart
         blocklist[b] = CuMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
     end
@@ -25,8 +25,8 @@ function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::CUDABackend)
     for b in 1:nblocks
         p.blocklist[b] .= p.cublocks[:,:,b]
     end
-    CUDA.@sync pivot, info = CUDA.CUBLAS.getrf_batched!(p.blocklist, true)
-    CUDA.@sync pivot, info, p.blocklist = CUDA.CUBLAS.getri_batched(p.blocklist, pivot)
+    CUDA.@sync pivot, info = CUBLAS.getrf_batched!(p.blocklist, true)
+    CUDA.@sync pivot, info, p.blocklist = CUBLAS.getri_batched(p.blocklist, pivot)
     for b in 1:nblocks
         p.cublocks[:,:,b] .= p.blocklist[b]
     end
diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl
index 1c9cc18..9f9b0e0 100644
--- a/ext/KrylovPreconditionersAMDGPUExt.jl
+++ b/ext/KrylovPreconditionersAMDGPUExt.jl
@@ -2,7 +2,7 @@ module KrylovPreconditionersAMDGPUExt
 using LinearAlgebra
 using SparseArrays
 using AMDGPU
-using AMDGPU.rocSPARSE
+using AMDGPU.rocSPARSE, AMDGPU.rocSOLVER
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
 import LinearAlgebra: ldiv!, mul!
 import Base: size, eltype, unsafe_convert
diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl
index b074141..47a6899 100644
--- a/ext/KrylovPreconditionersCUDAExt.jl
+++ b/ext/KrylovPreconditionersCUDAExt.jl
@@ -2,7 +2,7 @@ module KrylovPreconditionersCUDAExt
 using LinearAlgebra
 using SparseArrays
 using CUDA
-using CUDA.CUSPARSE
+using CUDA.CUSPARSE, CUDA.CUBLAS
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
 import LinearAlgebra: ldiv!, mul!
 import Base: size, eltype, unsafe_convert
diff --git a/ext/KrylovPreconditionersoneAPIExt.jl b/ext/KrylovPreconditionersoneAPIExt.jl
new file mode 100644
index 0000000..36ff42a
--- /dev/null
+++ b/ext/KrylovPreconditionersoneAPIExt.jl
@@ -0,0 +1,19 @@
+module KrylovPreconditionersoneAPIExt
+using LinearAlgebra
+using SparseArrays
+using oneAPI
+using oneAPI: global_queue, sycl_queue, context, device
+using oneAPI.oneMKL
+using LinearAlgebra: checksquare, BlasReal, BlasFloat
+import LinearAlgebra: ldiv!, mul!
+import Base: size, eltype, unsafe_convert
+
+using KrylovPreconditioners
+const KP = KrylovPreconditioners
+using KernelAbstractions
+const KA = KernelAbstractions
+
+include("oneAPI/blockjacobi.jl")
+include("oneAPI/operators.jl")
+
+end
diff --git a/ext/oneAPI/blockjacobi.jl b/ext/oneAPI/blockjacobi.jl
new file mode 100644
index 0000000..8f195a4
--- /dev/null
+++ b/ext/oneAPI/blockjacobi.jl
@@ -0,0 +1,48 @@
+KP.BlockJacobiPreconditioner(J::oneMKL.oneSparseMatrixCSR; options...) = BlockJacobiPreconditioner(SparseMatrixCSC(J); options...)
+
+function KP.create_blocklist(cublocks::oneArray, npart)
+    blocklist = Array{oneMatrix{Float64}}(undef, npart)
+    for b in 1:npart
+        blocklist[b] = oneMatrix{Float64}(undef, size(cublocks,1), size(cublocks,2))
+    end
+    return blocklist
+end
+
+function _update_gpu(p, j_rowptr, j_colval, j_nzval, device::oneAPIBackend)
+    nblocks = p.nblocks
+    blocksize = p.blocksize
+    fillblock_gpu_kernel! = KP._fillblock_gpu!(device)
+    # Fill the Block Jacobi dense blocks
+    fillblock_gpu_kernel!(
+        p.cublocks, size(p.id,1),
+        p.cupartitions, p.cumap,
+        j_rowptr, j_colval, j_nzval,
+        p.cupart, p.culpartitions, p.id,
+        ndrange=(nblocks, blocksize),
+    )
+    KA.synchronize(device)
+    # Invert the dense blocks
+    for b in 1:nblocks
+        p.blocklist[b] .= p.cublocks[:,:,b]
+    end
+    oneAPI.@sync pivot, p.blocklist = oneMKL.getrf_batched!(p.blocklist)
+    oneAPI.@sync pivot, p.blocklist = oneMKL.getri_batched!(p.blocklist, pivot)
+    for b in 1:nblocks
+        p.cublocks[:,:,b] .= p.blocklist[b]
+    end
+    return
+end
+
+"""
+    function update!(J::oneSparseMatrixCSR, p)
+
+Update the preconditioner `p` from the sparse Jacobian `J` in CSR format for oneAPI
+
+1) The dense blocks `cuJs` are filled from the sparse Jacobian `J`
+2) To a batch inversion of the dense blocks using oneMKL
+3) Extract the preconditioner matrix `p.P` from the dense blocks `cuJs`
+
+"""
+function KP.update!(p::BlockJacobiPreconditioner, J::oneMKL.oneSparseMatrixCSR)
+    _update_gpu(p, J.rowPtr, J.colVal, J.nzVal, p.device)
+end
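
For reference, a small sketch of the batched inversion step that `_update_gpu` performs above. This is a hedged illustration, not part of the patch: it assumes the Intel device supports Float64 (the extension stores its dense blocks as `oneMatrix{Float64}`), it reuses the `getrf_batched!`/`getri_batched!` calling pattern shown above, and the block size and count are arbitrary. Note the Block Jacobi test for oneAPI is still disabled in test/gpu/intel.jl below.

using LinearAlgebra
using oneAPI, oneAPI.oneMKL

# A batch of small, well-conditioned dense blocks on the device, analogous to p.blocklist.
blocks = [oneMatrix{Float64}(rand(4, 4) + 4I) for _ in 1:8]

# Batched LU factorization followed by batched inversion, as in _update_gpu.
pivot, blocks = oneMKL.getrf_batched!(blocks)
pivot, blocks = oneMKL.getri_batched!(blocks, pivot)
# Each blocks[b] now holds the inverse of the corresponding original block.
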
diff --git a/ext/oneAPI/operators.jl b/ext/oneAPI/operators.jl
new file mode 100644
index 0000000..f69bab8
--- /dev/null
+++ b/ext/oneAPI/operators.jl
@@ -0,0 +1,96 @@
+mutable struct INTEL_KrylovOperator{T} <: AbstractKrylovOperator{T}
+    type::Type{T}
+    m::Int
+    n::Int
+    nrhs::Int
+    transa::Char
+    matrix::oneSparseMatrixCSR{T}
+end
+
+eltype(A::INTEL_KrylovOperator{T}) where T = T
+size(A::INTEL_KrylovOperator) = (A.m, A.n)
+
+for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),)
+    @eval begin
+        function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
+            m,n = size(A)
+            if nrhs == 1
+                oneMKL.sparse_optimize_gemv!(transa, A)
+            end
+            # sparse_optimize_gemm! is only available with oneAPI > v2024.1.0
+            return INTEL_KrylovOperator{T}(T, m, n, nrhs, transa, A)
+        end
+
+        function KP.update!(A::INTEL_KrylovOperator{T}, B::$SparseMatrixType) where T <: $BlasType
+            error("The update of an INTEL_KrylovOperator is not supported.")
+        end
+    end
+end
+
+function LinearAlgebra.mul!(y::oneVector{T}, A::INTEL_KrylovOperator{T}, x::oneVector{T}) where T <: BlasFloat
+    (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
+    (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
+    (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
+    alpha = one(T)
+    beta = zero(T)
+    oneMKL.sparse_gemv!(A.transa, alpha, A.matrix, x, beta, y)
+end
+
+function LinearAlgebra.mul!(Y::oneMatrix{T}, A::INTEL_KrylovOperator{T}, X::oneMatrix{T}) where T <: BlasFloat
+    mY, nY = size(Y)
+    mX, nX = size(X)
+    (mY != A.m) && throw(DimensionMismatch("mY != A.m"))
+    (mX != A.n) && throw(DimensionMismatch("mX != A.n"))
+    (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
+    alpha = one(T)
+    beta = zero(T)
+    oneMKL.sparse_gemm!(A.transa, 'N', alpha, A.matrix, X, beta, Y)
+end
+
+mutable struct INTEL_TriangularOperator{T} <: AbstractTriangularOperator{T}
+    type::Type{T}
+    m::Int
+    n::Int
+    nrhs::Int
+    uplo::Char
+    diag::Char
+    transa::Char
+    matrix::oneSparseMatrixCSR{T}
+end
+
+eltype(A::INTEL_TriangularOperator{T}) where T = T
+size(A::INTEL_TriangularOperator) = (A.m, A.n)
+
+for (SparseMatrixType, BlasType) in ((:(oneSparseMatrixCSR{T}), :BlasFloat),)
+    @eval begin
+        function KP.TriangularOperator(A::$SparseMatrixType, uplo::Char, diag::Char; nrhs::Int=1, transa::Char='N') where T <: $BlasType
+            m,n = size(A)
+            if nrhs == 1
+                oneMKL.sparse_optimize_trsv!(uplo, transa, diag, A)
+            else
+                oneMKL.sparse_optimize_trsm!(uplo, transa, diag, nrhs, A)
+            end
+            return INTEL_TriangularOperator{T}(T, m, n, nrhs, uplo, diag, transa, A)
+        end
+
+        function KP.update!(A::INTEL_TriangularOperator{T}, B::$SparseMatrixType) where T <: $BlasType
+            return error("The update of an INTEL_TriangularOperator is not supported.")
+        end
+    end
+end
+
+function LinearAlgebra.ldiv!(y::oneVector{T}, A::INTEL_TriangularOperator{T}, x::oneVector{T}) where T <: BlasFloat
+    (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
+    (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
+    (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
+    oneMKL.sparse_trsv!(A.uplo, A.transa, A.diag, one(T), A.matrix, x, y)
+end
+
+function LinearAlgebra.ldiv!(Y::oneMatrix{T}, A::INTEL_TriangularOperator{T}, X::oneMatrix{T}) where T <: BlasFloat
+    mY, nY = size(Y)
+    mX, nX = size(X)
+    (mY != A.m) && throw(DimensionMismatch("mY != A.m"))
+    (mX != A.n) && throw(DimensionMismatch("mX != A.n"))
+    (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
+    oneMKL.sparse_trsm!(A.uplo, A.transa, 'N', A.diag, one(T), A.matrix, X, Y)
+end
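
Below is a minimal usage sketch of these two operator types on an Intel GPU. It mirrors what `test_operator` and `test_triangular` exercise in test/gpu/gpu.jl; the matrix, its size, and its density are illustrative, and Float32 is used since that is the only element type tested on oneAPI here.

using SparseArrays, LinearAlgebra
using oneAPI, oneAPI.oneMKL
using KrylovPreconditioners

n = 100
A_cpu = sprand(Float32, n, n, 0.05) + I       # adding I keeps the diagonal nonzero
A_gpu = oneSparseMatrixCSR(A_cpu)

x_gpu = oneVector{Float32}(rand(Float32, n))
y_gpu = similar(x_gpu)

# Sparse matrix-vector product routed through oneMKL.sparse_gemv! (nrhs = 1 by default).
opA = KrylovOperator(A_gpu)
mul!(y_gpu, opA, x_gpu)

# Triangular solve with the lower triangle, routed through oneMKL.sparse_trsv!.
L_gpu = oneSparseMatrixCSR(tril(A_cpu))
opL = TriangularOperator(L_gpu, 'L', 'N')     # uplo = 'L', diag = 'N' (non-unit diagonal)
ldiv!(y_gpu, opL, x_gpu)
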
diff --git a/test/Project.toml b/test/Project.toml
index f16ff85..b04c043 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -6,3 +6,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl
index 9f98d6d..20b5911 100644
--- a/test/gpu/gpu.jl
+++ b/test/gpu/gpu.jl
@@ -90,17 +90,19 @@ function test_operator(FC, V, DM, SM)
     mul!(y_gpu, opA_gpu, x_gpu)
     @test collect(y_gpu) ≈ y_cpu
   end
-  for j = 1:5
-    y_cpu = rand(FC, m)
-    x_cpu = rand(FC, n)
-    A_cpu2 = A_cpu + j*I
-    mul!(y_cpu, A_cpu2, x_cpu)
-    y_gpu = V(y_cpu)
-    x_gpu = V(x_cpu)
-    A_gpu2 = SM(A_cpu2)
-    update!(opA_gpu, A_gpu2)
-    mul!(y_gpu, opA_gpu, x_gpu)
-    @test collect(y_gpu) ≈ y_cpu
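+  # update! is not supported by the oneAPI KrylovOperator (see ext/oneAPI/operators.jl),
+  # so skip the operator-refresh loop for oneArray-based types.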
+  if V.body.name.name != :oneArray
+    for j = 1:5
+      y_cpu = rand(FC, m)
+      x_cpu = rand(FC, n)
+      A_cpu2 = A_cpu + j*I
+      mul!(y_cpu, A_cpu2, x_cpu)
+      y_gpu = V(y_cpu)
+      x_gpu = V(x_cpu)
+      A_gpu2 = SM(A_cpu2)
+      update!(opA_gpu, A_gpu2)
+      mul!(y_gpu, opA_gpu, x_gpu)
+      @test collect(y_gpu) ≈ y_cpu
+    end
   end
 
   nrhs = 3
@@ -114,17 +116,19 @@ function test_operator(FC, V, DM, SM)
     mul!(Y_gpu, opA_gpu, X_gpu)
     @test collect(Y_gpu) ≈ Y_cpu
   end
-  for j = 1:5
-    Y_cpu = rand(FC, m, nrhs)
-    X_cpu = rand(FC, n, nrhs)
-    A_cpu2 = A_cpu + j*I
-    mul!(Y_cpu, A_cpu2, X_cpu)
-    Y_gpu = DM(Y_cpu)
-    X_gpu = DM(X_cpu)
-    A_gpu2 = SM(A_cpu2)
-    update!(opA_gpu, A_gpu2)
-    mul!(Y_gpu, opA_gpu, X_gpu)
-    @test collect(Y_gpu) ≈ Y_cpu
+  if V.body.name.name != :oneArray
+    for j = 1:5
+      Y_cpu = rand(FC, m, nrhs)
+      X_cpu = rand(FC, n, nrhs)
+      A_cpu2 = A_cpu + j*I
+      mul!(Y_cpu, A_cpu2, X_cpu)
+      Y_gpu = DM(Y_cpu)
+      X_gpu = DM(X_cpu)
+      A_gpu2 = SM(A_cpu2)
+      update!(opA_gpu, A_gpu2)
+      mul!(Y_gpu, opA_gpu, X_gpu)
+      @test collect(Y_gpu) ≈ Y_cpu
+    end
   end
 end
 
@@ -152,17 +156,19 @@ function test_triangular(FC, V, DM, SM)
       ldiv!(y_gpu, opA_gpu, x_gpu)
       @test collect(y_gpu) ≈ y_cpu
     end
-    for j = 1:5
-      y_cpu = rand(FC, n)
-      x_cpu = rand(FC, n)
-      A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
-      ldiv!(y_cpu, triangle(A_cpu2), x_cpu)
-      y_gpu = V(y_cpu)
-      x_gpu = V(x_cpu)
-      A_gpu2 = SM(A_cpu2)
-      update!(opA_gpu, A_gpu2)
-      ldiv!(y_gpu, opA_gpu, x_gpu)
-      @test collect(y_gpu) ≈ y_cpu
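+    # update! is not supported by the oneAPI TriangularOperator (see ext/oneAPI/operators.jl),
+    # so skip the operator-refresh loop for oneArray-based types.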
+    if V.body.name.name != :oneArray
+      for j = 1:5
+        y_cpu = rand(FC, n)
+        x_cpu = rand(FC, n)
+        A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
+        ldiv!(y_cpu, triangle(A_cpu2), x_cpu)
+        y_gpu = V(y_cpu)
+        x_gpu = V(x_cpu)
+        A_gpu2 = SM(A_cpu2)
+        update!(opA_gpu, A_gpu2)
+        ldiv!(y_gpu, opA_gpu, x_gpu)
+        @test collect(y_gpu) ≈ y_cpu
+      end
     end
 
     nrhs = 3
@@ -176,17 +182,19 @@ function test_triangular(FC, V, DM, SM)
       ldiv!(Y_gpu, opA_gpu, X_gpu)
       @test collect(Y_gpu) ≈ Y_cpu
     end
-    for j = 1:5
-      Y_cpu = rand(FC, n, nrhs)
-      X_cpu = rand(FC, n, nrhs)
-      A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
-      ldiv!(Y_cpu, triangle(A_cpu2), X_cpu)
-      Y_gpu = DM(Y_cpu)
-      X_gpu = DM(X_cpu)
-      A_gpu2 = SM(A_cpu2)
-      update!(opA_gpu, A_gpu2)
-      ldiv!(Y_gpu, opA_gpu, X_gpu)
-      @test collect(Y_gpu) ≈ Y_cpu
+    if V.body.name.name != :oneArray
+      for j = 1:5
+        Y_cpu = rand(FC, n, nrhs)
+        X_cpu = rand(FC, n, nrhs)
+        A_cpu2 = A_cpu + j*tril(A_cpu,-1) + j*triu(A_cpu,1)
+        ldiv!(Y_cpu, triangle(A_cpu2), X_cpu)
+        Y_gpu = DM(Y_cpu)
+        X_gpu = DM(X_cpu)
+        A_gpu2 = SM(A_cpu2)
+        update!(opA_gpu, A_gpu2)
+        ldiv!(Y_gpu, opA_gpu, X_gpu)
+        @test collect(Y_gpu) ≈ Y_cpu
+      end
     end
   end
 end
diff --git a/test/gpu/intel.jl b/test/gpu/intel.jl
index 0ab2c6c..7b06824 100644
--- a/test/gpu/intel.jl
+++ b/test/gpu/intel.jl
@@ -1,9 +1,27 @@
-using oneAPI
+using oneAPI, oneAPI.oneMKL
 
+_get_type(J::oneSparseMatrixCSR) = oneArray{Float64, 1, oneAPI.oneL0.DeviceBuffer}
+_is_csr(J::oneSparseMatrixCSR) = true
 include("gpu.jl")
 
 @testset "Intel -- oneAPI.jl" begin
 
   @test oneAPI.functional()
   oneAPI.allowscalar(false)
+
+  @testset "KrylovOperator" begin
+    @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32)
+      test_operator(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR)
+    end
+  end
+
+  @testset "TriangularOperator" begin
+    @testset "oneSparseMatrixCSR -- $FC" for FC in (Float32,) # ComplexF32)
+      test_triangular(FC, oneVector{FC}, oneMatrix{FC}, oneSparseMatrixCSR)
+    end
+  end
+
+  # @testset "Block Jacobi preconditioner" begin
+  #   test_block_jacobi(oneAPIBackend(), oneArray, oneSparseMatrixCSR)
+  # end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 778a6fb..9a18c02 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 using AMDGPU
 using CUDA
-# using oneAPI
+using oneAPI
 using Test
 
 @testset "KrylovPreconditioners" begin
@@ -18,10 +18,10 @@ if CUDA.functional()
     end
 end
 
-# if oneAPI.functional()
-#     @info "Testing oneAPI backend"
-#     @testset "Testing oneAPI backend" begin
-#         include("gpu/intel.jl")
-#     end
-# end
+if oneAPI.functional()
+    @info "Testing oneAPI backend"
+    @testset "Testing oneAPI backend" begin
+        include("gpu/intel.jl")
+    end
+end
 end