diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 56d7207..2942169 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -33,7 +33,8 @@ steps:
       julia --color=yes --project=test/gpu -e '
         using Pkg
         Pkg.develop(path=".")
-        Pkg.add("AMDGPU")
+        # Pkg.add("AMDGPU")
+        Pkg.add(url="https://github.com/JuliaGPU/AMDGPU.jl", rev="master")
         Pkg.add("Krylov")
         Pkg.instantiate()
         include("test/gpu/amd.jl")'
diff --git a/ext/AMDGPU/operators.jl b/ext/AMDGPU/operators.jl
new file mode 100644
index 0000000..7413d46
--- /dev/null
+++ b/ext/AMDGPU/operators.jl
@@ -0,0 +1,74 @@
+mutable struct AMD_KrylovOperator{T} <: AbstractKrylovOperator{T}
+    type::Type{T}
+    m::Int
+    n::Int
+    nrhs::Int
+    transa::Char
+    descA::rocSPARSE.ROCSparseMatrixDescriptor
+    buffer_size::Ref{Csize_t}
+    buffer::ROCVector{UInt8}
+end
+
+eltype(A::AMD_KrylovOperator{T}) where T = T
+size(A::AMD_KrylovOperator) = (A.m, A.n)
+
+for (SparseMatrixType, BlasType) in ((:(ROCSparseMatrixCSR{T}), :BlasFloat),
+                                     (:(ROCSparseMatrixCSC{T}), :BlasFloat),
+                                     (:(ROCSparseMatrixCOO{T}), :BlasFloat))
+    @eval begin
+        function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
+            m,n = size(A)
+            if nrhs == 1
+                alpha = Ref{T}(one(T))
+                beta = Ref{T}(zero(T))
+                descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
+                descX = rocSPARSE.ROCDenseVectorDescriptor(T, n)
+                descY = rocSPARSE.ROCDenseVectorDescriptor(T, m)
+                algo = rocSPARSE.rocSPARSE.rocsparse_spmv_alg_default
+                buffer_size = Ref{Csize_t}()
+                rocSPARSE.rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size, C_NULL)
+                buffer = ROCVector{UInt8}(undef, buffer_size[])
+                return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
+            else
+                alpha = Ref{T}(one(T))
+                beta = Ref{T}(zero(T))
+                descA = rocSPARSE.ROCSparseMatrixDescriptor(A, 'O')
+                descX = rocSPARSE.ROCDenseMatrixDescriptor(T, n, nrhs)
+                descY = rocSPARSE.ROCDenseMatrixDescriptor(T, m, nrhs)
+                algo = rocSPARSE.rocsparse_spmm_alg_default
+                buffer_size = Ref{Csize_t}()
+                transb = 'N'
+                rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T,
+                                         algo, rocSPARSE.rocsparse_spmm_stage_buffer_size, buffer_size, C_NULL)
+                buffer = ROCVector{UInt8}(undef, buffer_size[])
+                rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), transa, 'N', alpha, descA, descX, beta, descY, T,
+                                         algo, rocSPARSE.rocsparse_spmm_stage_preprocess, buffer_size, buffer)
+                return AMD_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer_size, buffer)
+            end
+        end
+    end
+end
+
+function LinearAlgebra.mul!(y::ROCVector{T}, A::AMD_KrylovOperator{T}, x::ROCVector{T}) where T <: BlasFloat
+    (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
+    (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
+    (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
+    descY = rocSPARSE.ROCDenseVectorDescriptor(y)
+    descX = rocSPARSE.ROCDenseVectorDescriptor(x)
+    algo = rocSPARSE.rocsparse_spmv_alg_default
+    rocSPARSE.rocsparse_spmv(rocSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX,
+                             Ref{T}(zero(T)), descY, T, algo, A.buffer_size, A.buffer)
+end
+
+function LinearAlgebra.mul!(Y::ROCMatrix{T}, A::AMD_KrylovOperator{T}, X::ROCMatrix{T}) where T <: BlasFloat
+    mY, nY = size(Y)
+    mX, nX = size(X)
+    (mY != A.m) && throw(DimensionMismatch("mY != A.m"))
+    (mX != A.n) && throw(DimensionMismatch("mX != A.n"))
+    (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
+    descY = rocSPARSE.ROCDenseMatrixDescriptor(Y)
+    descX = rocSPARSE.ROCDenseMatrixDescriptor(X)
+    algo = rocSPARSE.rocsparse_spmm_alg_default
+    rocSPARSE.rocsparse_spmm(rocSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX,
+                             Ref{T}(zero(T)), descY, T, algo, rocSPARSE.rocsparse_spmm_stage_compute, A.buffer_size, A.buffer)
+end
diff --git a/ext/CUDA/operators.jl b/ext/CUDA/operators.jl
new file mode 100644
index 0000000..fb0ec88
--- /dev/null
+++ b/ext/CUDA/operators.jl
@@ -0,0 +1,71 @@
+mutable struct NVIDIA_KrylovOperator{T} <: AbstractKrylovOperator{T}
+    type::Type{T}
+    m::Int
+    n::Int
+    nrhs::Int
+    transa::Char
+    descA::CUSPARSE.CuSparseMatrixDescriptor
+    buffer::CuVector{UInt8}
+end
+
+eltype(A::NVIDIA_KrylovOperator{T}) where T = T
+size(A::NVIDIA_KrylovOperator) = (A.m, A.n)
+
+for (SparseMatrixType, BlasType) in ((:(CuSparseMatrixCSR{T}), :BlasFloat),
+                                     (:(CuSparseMatrixCSC{T}), :BlasFloat),
+                                     (:(CuSparseMatrixCOO{T}), :BlasFloat))
+    @eval begin
+        function KP.KrylovOperator(A::$SparseMatrixType; nrhs::Int=1, transa::Char='N') where T <: $BlasType
+            m,n = size(A)
+            if nrhs == 1
+                alpha = Ref{T}(one(T))
+                beta = Ref{T}(zero(T))
+                descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
+                descX = CUSPARSE.CuDenseVectorDescriptor(T, n)
+                descY = CUSPARSE.CuDenseVectorDescriptor(T, m)
+                algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
+                buffer_size = Ref{Csize_t}()
+                CUSPARSE.cusparseSpMV_bufferSize(CUSPARSE.handle(), transa, alpha, descA, descX, beta, descY, T, algo, buffer_size)
+                buffer = CuVector{UInt8}(undef, buffer_size[])
+                return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
+            else
+                alpha = Ref{T}(one(T))
+                beta = Ref{T}(zero(T))
+                descA = CUSPARSE.CuSparseMatrixDescriptor(A, 'O')
+                descX = CUSPARSE.CuDenseMatrixDescriptor(T, n, nrhs)
+                descY = CUSPARSE.CuDenseMatrixDescriptor(T, m, nrhs)
+                algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
+                buffer_size = Ref{Csize_t}()
+                transb = 'N'
+                CUSPARSE.cusparseSpMM_bufferSize(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer_size)
+                buffer = CuVector{UInt8}(undef, buffer_size[])
+                if !(A isa CuSparseMatrixCOO)
+                    CUSPARSE.cusparseSpMM_preprocess(CUSPARSE.handle(), transa, transb, alpha, descA, descX, beta, descY, T, algo, buffer)
+                end
+                return NVIDIA_KrylovOperator{T}(T, m, n, nrhs, transa, descA, buffer)
+            end
+        end
+    end
+end
+
+function LinearAlgebra.mul!(y::CuVector{T}, A::NVIDIA_KrylovOperator{T}, x::CuVector{T}) where T <: BlasFloat
+    (length(y) != A.m) && throw(DimensionMismatch("length(y) != A.m"))
+    (length(x) != A.n) && throw(DimensionMismatch("length(x) != A.n"))
+    (A.nrhs == 1) || throw(DimensionMismatch("A.nrhs != 1"))
+    descY = CUSPARSE.CuDenseVectorDescriptor(y)
+    descX = CUSPARSE.CuDenseVectorDescriptor(x)
+    algo = CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT
+    CUSPARSE.cusparseSpMV(CUSPARSE.handle(), A.transa, Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer)
+end
+
+function LinearAlgebra.mul!(Y::CuMatrix{T}, A::NVIDIA_KrylovOperator{T}, X::CuMatrix{T}) where T <: BlasFloat
+    mY, nY = size(Y)
+    mX, nX = size(X)
+    (mY != A.m) && throw(DimensionMismatch("mY != A.m"))
+    (mX != A.n) && throw(DimensionMismatch("mX != A.n"))
+    (nY == nX == A.nrhs) || throw(DimensionMismatch("nY != A.nrhs or nX != A.nrhs"))
+    descY = CUSPARSE.CuDenseMatrixDescriptor(Y)
+    descX = CUSPARSE.CuDenseMatrixDescriptor(X)
+    algo = CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT
+    CUSPARSE.cusparseSpMM(CUSPARSE.handle(), A.transa, 'N', Ref{T}(one(T)), A.descA, descX, Ref{T}(zero(T)), descY, T, algo, A.buffer)
+end
diff --git a/ext/KrylovPreconditionersAMDGPUExt.jl b/ext/KrylovPreconditionersAMDGPUExt.jl
index b75c28c..0355a2a 100644
--- a/ext/KrylovPreconditionersAMDGPUExt.jl
+++ b/ext/KrylovPreconditionersAMDGPUExt.jl
@@ -3,8 +3,8 @@ using LinearAlgebra
 using AMDGPU
 using AMDGPU.rocSPARSE
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
-import LinearAlgebra: ldiv!
-using SparseArrays
+import LinearAlgebra: ldiv!, mul!
+import Base: size, eltype
 
 using KrylovPreconditioners
 const KP = KrylovPreconditioners
@@ -14,5 +14,6 @@ const KA = KernelAbstractions
 include("AMDGPU/ic0.jl")
 include("AMDGPU/ilu0.jl")
 include("AMDGPU/blockjacobi.jl")
+include("AMDGPU/operators.jl")
 
 end
diff --git a/ext/KrylovPreconditionersCUDAExt.jl b/ext/KrylovPreconditionersCUDAExt.jl
index c0e0f83..a74f941 100644
--- a/ext/KrylovPreconditionersCUDAExt.jl
+++ b/ext/KrylovPreconditionersCUDAExt.jl
@@ -3,8 +3,8 @@ using LinearAlgebra
 using CUDA
 using CUDA.CUSPARSE
 using LinearAlgebra: checksquare, BlasReal, BlasFloat
-import LinearAlgebra: ldiv!
-using SparseArrays
+import LinearAlgebra: ldiv!, mul!
+import Base: size, eltype
 
 using KrylovPreconditioners
 const KP = KrylovPreconditioners
@@ -14,5 +14,6 @@ const KA = KernelAbstractions
 include("CUDA/ic0.jl")
 include("CUDA/ilu0.jl")
 include("CUDA/blockjacobi.jl")
+include("CUDA/operators.jl")
 
 end
diff --git a/src/KrylovPreconditioners.jl b/src/KrylovPreconditioners.jl
index 8c43404..97e0e22 100644
--- a/src/KrylovPreconditioners.jl
+++ b/src/KrylovPreconditioners.jl
@@ -12,6 +12,12 @@ import LinearAlgebra: ldiv!
 abstract type AbstractKrylovPreconditioner end
 export AbstractKrylovPreconditioner
 
+abstract type AbstractKrylovOperator{T} end
+export AbstractKrylovOperator
+
+function KrylovOperator end
+export KrylovOperator
+
 # Preconditioners
 include("ic0.jl")
 include("ilu0.jl")
diff --git a/test/gpu/amd.jl b/test/gpu/amd.jl
index 366e2cd..fbd7a07 100644
--- a/test/gpu/amd.jl
+++ b/test/gpu/amd.jl
@@ -26,6 +26,18 @@ include("gpu.jl")
     end
   end
 
+  @testset "KrylovOperator" begin
+    @testset "ROCSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCOO{FC})
+    end
+    @testset "ROCSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSC{FC})
+    end
+    @testset "ROCSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, ROCVector{FC}, ROCMatrix{FC}, ROCSparseMatrixCSR{FC})
+    end
+  end
+
   @testset "Block Jacobi preconditioner" begin
       if AMDGPU.functional()
           test_block_jacobi(ROCBackend(), ROCArray, ROCSparseMatrixCSR)
diff --git a/test/gpu/gpu.jl b/test/gpu/gpu.jl
index 107ad94..c852ad7 100644
--- a/test/gpu/gpu.jl
+++ b/test/gpu/gpu.jl
@@ -18,7 +18,7 @@ function test_ic0(FC, V, M)
   x_gpu, stats = cg(A_gpu, b_gpu, M=P, ldiv=true)
   r_gpu = b_gpu - A_gpu * x_gpu
   @test stats.niter ≤ 5
-  if (FC <: ComplexF64) && typeof(V).name.name == :ROCArray
+  if (FC <: ComplexF64) && V.body.name.name == :ROCArray
     @test_broken norm(r_gpu) ≤ 1e-6
   else
     @test norm(r_gpu) ≤ 1e-8
@@ -42,6 +42,50 @@ function test_ilu0(FC, V, M)
   @test norm(r_gpu) ≤ 1e-8
 end
 
+function test_operator(FC, V, DM, SM)
+  m = 200
+  n = 100
+  A_cpu = rand(FC, n, n)
+  A_cpu = sparse(A_cpu)
+  b_cpu = rand(FC, n)
+
+  A_gpu = SM(A_cpu)
+  b_gpu = V(b_cpu)
+  opA_gpu = KrylovOperator(A_gpu)
+
+  x_gpu, stats = gmres(opA_gpu, b_gpu)
+  r_gpu = b_gpu - A_gpu * x_gpu
+  @test stats.solved
+  @test norm(r_gpu) ≤ 1e-8
+
+  A_cpu = rand(FC, m, n)
+  A_cpu = sparse(A_cpu)
+  A_gpu = SM(A_cpu)
+
+  opA_gpu = KrylovOperator(A_gpu)
+  for i = 1:5
+    y_cpu = rand(FC, m)
+    x_cpu = rand(FC, n)
+    mul!(y_cpu, A_cpu, x_cpu)
+    y_gpu = V(y_cpu)
+    x_gpu = V(x_cpu)
+    mul!(y_gpu, opA_gpu, x_gpu)
+    @test collect(y_gpu) ≈ y_cpu
+  end
+
+  nrhs = 3
+  opA_gpu = KrylovOperator(A_gpu; nrhs)
+  for i = 1:5
+    Y_cpu = rand(FC, m, nrhs)
+    X_cpu = rand(FC, n, nrhs)
+    mul!(Y_cpu, A_cpu, X_cpu)
+    Y_gpu = DM(Y_cpu)
+    X_gpu = DM(X_cpu)
+    mul!(Y_gpu, opA_gpu, X_gpu)
+    @test collect(Y_gpu) ≈ Y_cpu
+  end
+end
+
 _get_type(J::SparseMatrixCSC) = Vector{Float64}
 
 function generate_random_system(n::Int, m::Int)
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index cb155da..655e1df 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -26,6 +26,18 @@ include("gpu.jl")
     end
   end
 
+  @testset "KrylovOperator" begin
+    @testset "CuSparseMatrixCOO -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCOO{FC})
+    end
+    @testset "CuSparseMatrixCSC -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSC{FC})
+    end
+    @testset "CuSparseMatrixCSR -- $FC" for FC in (Float64, ComplexF64)
+      test_operator(FC, CuVector{FC}, CuMatrix{FC}, CuSparseMatrixCSR{FC})
+    end
+  end
+
   @testset "Block Jacobi preconditioner" begin
       if CUDA.functional()
           test_block_jacobi(CUDABackend(), CuArray, CuSparseMatrixCSR)