diff --git a/examples/Manifest.toml b/examples/Manifest.toml
index aeecc76..a650675 100644
--- a/examples/Manifest.toml
+++ b/examples/Manifest.toml
@@ -342,9 +342,9 @@ version = "1.14.1"
 
 [[deps.JuliaInterpreter]]
 deps = ["CodeTracking", "InteractiveUtils", "Random", "UUIDs"]
-git-tree-sha1 = "2984284a8abcfcc4784d95a9e2ea4e352dd8ede7"
+git-tree-sha1 = "fc8504eca188aaae4345649ca6105806bc584b70"
 uuid = "aa1ae85d-cabe-5617-a682-6adf51b2e16a"
-version = "0.9.36"
+version = "0.9.37"
 
 [[deps.Krylov]]
 deps = ["LinearAlgebra", "Printf", "SparseArrays"]
@@ -459,7 +459,7 @@ version = "5.1.2+0"
 
 [[deps.MPI]]
 deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "PkgVersion", "PrecompileTools", "Requires", "Serialization", "Sockets"]
-git-tree-sha1 = "8faa547a424cbd7eca2529c6ddf9929c4ec64e71"
+git-tree-sha1 = "71c417a539693107d1b0b0d413cc58e3f743c937"
 repo-rev = "master"
 repo-url = "https://github.com/PetrKryslUCSD/MPI.jl.git"
 uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
@@ -652,9 +652,9 @@ version = "1.3.0"
 
 [[deps.Revise]]
 deps = ["CodeTracking", "Distributed", "FileWatching", "JuliaInterpreter", "LibGit2", "LoweredCodeUtils", "OrderedCollections", "REPL", "Requires", "UUIDs", "Unicode"]
-git-tree-sha1 = "7f4228017b83c66bd6aa4fddeb170ce487e53bc7"
+git-tree-sha1 = "834aedb1369919a7b2026d7e04c2d49a311d26f4"
 uuid = "295af30f-e4ad-537b-8983-00126c2a3abe"
-version = "3.6.2"
+version = "3.6.3"
 
 [[deps.SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@@ -837,9 +837,9 @@ version = "1.21.1"
 
 [[deps.XML2_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
-git-tree-sha1 = "6a451c6f33a176150f315726eba8b92fbfdb9ae7"
+git-tree-sha1 = "a2fccc6559132927d4c5dc183e3e01048c6dcbd6"
 uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
-version = "2.13.4+0"
+version = "2.13.5+0"
 
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
diff --git a/examples/conc/pace.jl b/examples/conc/pace.jl
index b6e63a6..39ffbd2 100644
--- a/examples/conc/pace.jl
+++ b/examples/conc/pace.jl
@@ -7,11 +7,6 @@
 run(cmd)
 @info "======================================\nExpected 26 iterations"
 
-cmd = `julia --project=. ./conc/shells/barrel_seq_driver.jl`
-run(cmd)
-@info "======================================\nExpected 69 iterations"
-
-
 cmd = `julia --project=. ./conc/shells/hyp_seq_driver.jl`
 run(cmd)
 @info "======================================\nExpected 28 iterations"
@@ -21,3 +16,8 @@ cmd = `julia --project=. ./conc/lindef/fib_seq_driver.jl --Np 7`
 run(cmd)
 @info "======================================\nExpected 67 iterations"
 
+
+cmd = `julia --project=. ./conc/shells/barrel_seq_driver.jl`
+run(cmd)
+@info "======================================\nExpected 69 iterations"
+
diff --git a/examples/conc/pace_mpi.jl b/examples/conc/pace_mpi.jl
index d48bae6..3698e6c 100644
--- a/examples/conc/pace_mpi.jl
+++ b/examples/conc/pace_mpi.jl
@@ -9,9 +9,7 @@
 run(cmd)
 @info "======================================\nExpected 26 iterations"
 
-cmd = `$(mpiexecjl) -n 7 julia --project=. ./conc/shells/barrel_mpi_driver.jl`
-run(cmd)
-@info "======================================\nExpected 69 iterations"
+
 
 
 cmd = `$(mpiexecjl) -n 7 julia --project=. ./conc/shells/hyp_mpi_driver.jl`
@@ -23,3 +21,6 @@ cmd = `$(mpiexecjl) -n 7 julia --project=. ./conc/lindef/fib_mpi_driver.jl`
 run(cmd)
 @info "======================================\nExpected 67 iterations"
 
+cmd = `$(mpiexecjl) -n 7 julia --project=. ./conc/shells/barrel_mpi_driver.jl`
+run(cmd)
+@info "======================================\nExpected 69 iterations"
\ No newline at end of file
diff --git a/examples/mpi_experiments/mpi_experiment_4.jl b/examples/mpi_experiments/mpi_experiment_4.jl
new file mode 100644
index 0000000..df7f834
--- /dev/null
+++ b/examples/mpi_experiments/mpi_experiment_4.jl
@@ -0,0 +1,20 @@
+# Adapted from the MPI.jl example examples/03-reduce.jl.
+# This example performs an elementwise sum reduction of an array
+# across all ranks with the blocking collective MPI.Allreduce.
+
+using MPI, Statistics
+
+MPI.Init()
+const comm = MPI.COMM_WORLD
+const root = 0
+
+rank = MPI.Comm_rank(comm)
+
+X = fill(rank, 7)
+
+# Perform a sum reduction
+X = MPI.Allreduce(X, MPI.SUM, comm)
+
+if MPI.Comm_rank(comm) == root
+    println("The sum of the arrays is: ", X)
+end
\ No newline at end of file
diff --git a/examples/mpi_experiments/mpi_experiment_5.jl b/examples/mpi_experiments/mpi_experiment_5.jl
new file mode 100644
index 0000000..923215e
--- /dev/null
+++ b/examples/mpi_experiments/mpi_experiment_5.jl
@@ -0,0 +1,70 @@
+# Adapted from the MPI.jl example examples/03-reduce.jl.
+# This example defines a nonblocking Iallreduce! wrapper over the low-level
+# MPI C API and uses it to perform an elementwise sum reduction across ranks.
+
+using MPI, Statistics
+
+RBuffer = MPI.RBuffer
+Op = MPI.Op
+API = MPI.API
+MPI_Op = MPI.MPI_Op
+IN_PLACE = MPI.IN_PLACE
+Comm = MPI.Comm
+AbstractRequest = MPI.AbstractRequest
+Request = MPI.Request
+_doc_external = x -> "For more information, see the [MPI documentation]($x)."
+
+## Iallreduce
+
+# mutating
+"""
+    Iallreduce!(sendbuf, recvbuf, op, comm::Comm, req::AbstractRequest=Request())
+    Iallreduce!(sendrecvbuf, op, comm::Comm, req::AbstractRequest=Request())
+
+Starts a nonblocking elementwise reduction using the operator `op` on the buffer `sendbuf`,
+storing the result in the `recvbuf` of all processes in the group. Returns the request `req`.
+
+If only one `sendrecvbuf` buffer is provided, then the operation is performed in-place.
+
+# See also
+- [`Iallreduce`](@ref), to handle allocation of the output buffer.
+- [`Op`](@ref) for details on reduction operators.
+
+# External links
+$(_doc_external("MPI_Iallreduce"))
+"""
+function Iallreduce!(rbuf::RBuffer, op::Union{Op,MPI_Op}, comm::Comm, req::AbstractRequest=Request())
+    # int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count,
+    #                    MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
+    #                    MPI_Request *request)
+    API.MPI_Iallreduce(rbuf.senddata, rbuf.recvdata, rbuf.count, rbuf.datatype, op, comm, req)
+    return req
+end
+Iallreduce!(rbuf::RBuffer, op, comm::Comm, req::AbstractRequest=Request()) =
+    Iallreduce!(rbuf, Op(op, eltype(rbuf)), comm, req)
+Iallreduce!(sendbuf, recvbuf, op, comm::Comm, req::AbstractRequest=Request()) =
+    Iallreduce!(RBuffer(sendbuf, recvbuf), op, comm, req)
+
+# inplace
+Iallreduce!(buf, op, comm::Comm, req::AbstractRequest=Request()) = Iallreduce!(IN_PLACE, buf, op, comm, req)
+
+
+MPI.Init()
+const comm = MPI.COMM_WORLD
+const root = 0
+
+rank = MPI.Comm_rank(comm)
+
+X = fill(rank, 7)
+
+# Perform a sum reduction
+req = Iallreduce!(X, MPI.SUM, comm)
+sleep(rand())
+MPI.Wait(req)
+
+if MPI.Comm_rank(comm) == root
+    println("Rank $(MPI.Comm_rank(comm)): The array is: ", X)
+    println("Rank $(MPI.Comm_rank(comm)): Should have gotten: ", sum(0:MPI.Comm_size(comm)-1))
+end
+
+MPI.Finalize()
\ No newline at end of file
diff --git a/src/DDCoNCMPIModule.jl b/src/DDCoNCMPIModule.jl
index 9c7ae5d..ca44579 100644
--- a/src/DDCoNCMPIModule.jl
+++ b/src/DDCoNCMPIModule.jl
@@ -57,17 +57,16 @@
 using LinearAlgebra
 using Statistics: mean
 using ..FENodeToPartitionMapModule: FENodeToPartitionMap
 using ShellStructureTopo
-using MPI
-
 using ..PartitionCoNCModule: CoNCPartitioningInfo, CoNCPartitionData, npartitions
 using ..FinEtoolsDDMethods: set_up_timers, update_timer!, reset_timers!
-
 import ..CGModule: vec_copyto!
 import ..CGModule: vec_aypx!
 import ..CGModule: vec_ypax!
 import ..CGModule: vec_dot
 import Base: deepcopy
+using MPI
+
 
 torank(i) = i - 1
 topartitionnumber(r) = r + 1
@@ -329,8 +328,6 @@ end
 
 mutable struct TwoLevelPreConditioner{DDC<:DDCoNCMPIComm, T, IT, FACTOR}
     ddcomm::DDC
-    napps::Int
-    nskip::Int
     n::IT
     buff_Phi::SparseMatrixCSC{T, IT}
     Kr_ff_factor::FACTOR
@@ -340,8 +337,6 @@ end
 
 function TwoLevelPreConditioner(ddcomm::DDC, Phi) where {DDC<:DDCoNCMPIComm}
     comm = ddcomm.comm
-    napps = 0
-    nskip = 0
     partition = ddcomm.partition
     rank = ddcomm.partition.rank
     n = size(Phi, 1)
@@ -373,29 +368,28 @@ function TwoLevelPreConditioner(ddcomm::DDC, Phi) where {DDC<:DDCoNCMPIComm}
     buff_Phi = P[pel.ldofs_own_only, :]
     buffPp = fill(zero(eltype(Kr_ff_factor)), nr)
     buffKiPp = fill(zero(eltype(Kr_ff_factor)), nr)
-    return TwoLevelPreConditioner(ddcomm, napps, nskip, n, buff_Phi, Kr_ff_factor, buffPp, buffKiPp)
+    return TwoLevelPreConditioner(ddcomm, n, buff_Phi, Kr_ff_factor, buffPp, buffKiPp)
 end
 
 function (pre::TwoLevelPreConditioner)(q::PV, p::PV) where {PV<:PartitionedVector}
     partition = p.ddcomm.partition
     _rhs_update_xt!(p)
     q.buffers.ownv .= 0
-    pre.napps += 1
-    if pre.napps > pre.nskip
-        # Narrow by the transformation
-        ld = partition.entity_list.own.ldofs_own_only
-        pre.buffPp .= pre.buff_Phi' * p.buffers.ownv[ld]
-        # Communicate
-        pre.buffPp .= MPI.Allreduce!(pre.buffPp, MPI.SUM, pre.ddcomm.comm)
-        # Solve the reduced problem
-        pre.buffKiPp .= pre.Kr_ff_factor \ pre.buffPp
-        # Expand by the transformation
-        ld = partition.entity_list.own.ldofs_own_only
-        q.buffers.ownv[ld] .= pre.buff_Phi * pre.buffKiPp
-        pre.napps = 0
-    end
+    # Level 2, narrow by the transformation
+    ld = partition.entity_list.own.ldofs_own_only
+    pre.buffPp .= pre.buff_Phi' * p.buffers.ownv[ld]
+    # Level 2, communicate
+    req = MPI.Iallreduce!(pre.buffPp, MPI.SUM, pre.ddcomm.comm)
     # Level 1
    q.buffers.extv .= partition.Kxt_ff_factor \ p.buffers.extv
+    # Level 2, wait for the communication
+    MPI.Wait(req)
+    # Level 2, solve the reduced problem
+    pre.buffKiPp .= pre.Kr_ff_factor \ pre.buffPp
+    # Level 2, expand by the transformation
+    ld = partition.entity_list.own.ldofs_own_only
+    q.buffers.ownv[ld] .= pre.buff_Phi * pre.buffKiPp
+
     _lhs_update_xt!(q)
     q
 end
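
Note on the preconditioner change above: the blocking `MPI.Allreduce!` of the level-2 (coarse) contribution is replaced by a nonblocking `MPI.Iallreduce!`, so the level-1 local solve runs while the global reduction is in flight and the result is only waited on when it is actually needed. Below is a minimal sketch of that overlap pattern, assuming an `Iallreduce!` with the signature used in this diff (from the pinned MPI.jl fork in examples/Manifest.toml, or the wrapper in examples/mpi_experiments/mpi_experiment_5.jl); the matrix, right-hand sides, and variable names are hypothetical placeholders, not code from this repository.

```julia
# Sketch of overlapping a coarse-level reduction with local work.
# NOTE: MPI.Iallreduce! is assumed to come from the pinned MPI.jl fork
# (released MPI.jl may not export it); everything else is placeholder data.
using MPI, LinearAlgebra

MPI.Init()
comm = MPI.COMM_WORLD

# Level 2: this rank's contribution to the coarse right-hand side.
coarse_rhs = rand(16)
# Start the nonblocking sum reduction across all ranks; do not wait yet.
req = MPI.Iallreduce!(coarse_rhs, MPI.SUM, comm)

# Level 1: local work overlapped with the communication (placeholder solve).
A = lu(rand(100, 100) + 100I)
local_sol = A \ rand(100)

# Level 2: the reduced coarse data is needed only from here on.
MPI.Wait(req)
# ... solve the small coarse problem with coarse_rhs and add its correction to local_sol ...

MPI.Finalize()
```

The ordering is the whole point: the latency of the global reduction is hidden behind the local factorized solve, and only the coarse correction, computed after `MPI.Wait` returns, depends on the communicated data.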