diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8caf9940..0290b35d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,12 +2,14 @@ set(SRC allocator.f90 backend.f90 common.f90 + poisson_fft.f90 solver.f90 tdsops.f90 time_integrator.f90 omp/backend.f90 omp/common.f90 omp/kernels/distributed.f90 + omp/poisson_fft.f90 omp/sendrecv.f90 omp/exec_dist.f90 ) @@ -17,7 +19,9 @@ set(CUDASRC cuda/common.f90 cuda/exec_dist.f90 cuda/kernels/distributed.f90 + cuda/kernels/complex.f90 cuda/kernels/reorder.f90 + cuda/poisson_fft.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) @@ -40,6 +44,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp") set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast") target_link_options(x3d2 INTERFACE "-cuda") + target_link_options(x3d2 INTERFACE "-lcufft") target_compile_options(xcompact PRIVATE "-DCUDA") # target_link_options(xcompact INTERFACE "-cuda") diff --git a/src/backend.f90 b/src/backend.f90 index a3958859..e17970e5 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,6 +1,7 @@ module m_base_backend use m_allocator, only: allocator_t, field_t use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: tdsops_t, dirps_t implicit none @@ -24,6 +25,7 @@ module m_base_backend integer :: nx_loc, ny_loc, nz_loc class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps + class(poisson_fft_t), pointer :: poisson_fft contains procedure(transeq_ders), deferred :: transeq_x procedure(transeq_ders), deferred :: transeq_y @@ -33,9 +35,11 @@ module m_base_backend procedure(sum_intox), deferred :: sum_yintox procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd - procedure(get_fields), deferred :: get_fields - procedure(set_fields), deferred :: set_fields + procedure(scalar_product), deferred :: scalar_product + procedure(get_field), deferred :: get_field + procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops + procedure(init_poisson_fft), deferred :: init_poisson_fft end type base_backend_t abstract interface @@ -126,7 +130,20 @@ end subroutine vecadd end interface abstract interface - subroutine get_fields(self, u_out, v_out, w_out, u, v, w) + real(dp) function scalar_product(self, x, y) result(s) + !! Calculates the scalar product of two input fields + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(in) :: x, y + end function scalar_product + end interface + + abstract interface + subroutine get_field(self, arr, f) !! copy the specialist data structure from device or host back !! to a regular 3D data structure. import :: base_backend_t @@ -135,13 +152,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w) implicit none class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w - end subroutine get_fields + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + end subroutine get_field - subroutine set_fields(self, u, v, w, u_in, v_in, w_in) + subroutine set_field(self, f, arr) !! copy the initial condition stored in a regular 3D data - !! structure into the specialist data structure arrays on the + !! structure into the specialist data structure array on the !! device or host. 
import :: base_backend_t import :: dp @@ -149,9 +166,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in - end subroutine set_fields + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr + end subroutine set_field end interface abstract interface @@ -174,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, & end subroutine alloc_tdsops end interface + abstract interface + subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) + import :: base_backend_t + import :: dirps_t + implicit none + + class(base_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + end subroutine init_poisson_fft + end interface + end module m_base_backend diff --git a/src/common.f90 b/src/common.f90 index 2dbd9bd5..beac1f5f 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -5,8 +5,9 @@ module m_common real(dp), parameter :: pi = 4*atan(1.0_dp) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & - RDR_Y2Z = 23, RDR_Z2Y = 32 + RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32 integer, parameter :: dir_X = 1, dir_Y = 2, dir_Z = 3 + integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 type :: globs_t integer :: nx, ny, nz @@ -14,8 +15,11 @@ module m_common integer :: n_groups_x, n_groups_y, n_groups_z real(dp) :: Lx, Ly, Lz real(dp) :: dx, dy, dz + real(dp) :: nu, dt + integer :: n_iters, n_output integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e + integer :: poisson_solver_type end type globs_t contains diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 8ab91370..a5322ff6 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,21 +1,25 @@ module m_cuda_backend use iso_fortran_env, only: stderr => error_unit use cudafor + use mpi use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t use m_cuda_common, only: SZ use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact + use m_cuda_poisson_fft, only: cuda_poisson_fft_t use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs use m_cuda_kernels_reorder, only: & - reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2y, & - sum_yintox, sum_zintox, axpby, buffer_copy + reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & + reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy implicit none @@ -40,8 +44,10 @@ module m_cuda_backend procedure :: sum_yintox => sum_yintox_cuda procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda - procedure :: set_fields => set_fields_cuda - procedure :: get_fields => get_fields_cuda + procedure :: scalar_product => scalar_product_cuda + procedure :: set_field => set_field_cuda + procedure :: get_field => get_field_cuda + procedure :: init_poisson_fft => init_cuda_poisson_fft procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist 
@@ -60,6 +66,7 @@ function init(globs, allocator) result(backend) class(allocator_t), target, intent(inout) :: allocator type(cuda_backend_t) :: backend + type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_halo, n_block select type(allocator) @@ -452,6 +459,10 @@ subroutine reorder_cuda(self, u_o, u_i, direction) threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) + case (RDR_Z2X) ! z2x + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_o_d, u_i_d, self%nz_loc) case (RDR_Z2Y) ! z2y blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) @@ -524,6 +535,35 @@ subroutine vecadd_cuda(self, a, x, b, y) end subroutine vecadd_cuda + real(dp) function scalar_product_cuda(self, x, y) result(s) + implicit none + + class(cuda_backend_t) :: self + class(field_t), intent(in) :: x, y + + real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d + real(dp), device, allocatable :: sum_d + type(dim3) :: blocks, threads + integer :: n, ierr + + select type(x); type is (cuda_field_t); x_d => x%data_d; end select + select type(y); type is (cuda_field_t); y_d => y%data_d; end select + + allocate (sum_d) + sum_d = 0._dp + + n = size(x_d, dim = 2) + blocks = dim3(size(x_d, dim = 3), 1, 1) + threads = dim3(SZ, 1, 1) + call scalar_product<<>>(sum_d, x_d, y_d, n) + + s = sum_d + + call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & + MPI_COMM_WORLD, ierr) + + end function scalar_product_cuda + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none @@ -542,31 +582,42 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers - subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_cuda(self, f, arr) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - select type(u); type is (cuda_field_t); u%data_d = u_in; end select - select type(v); type is (cuda_field_t); v%data_d = v_in; end select - select type(w); type is (cuda_field_t); w%data_d = w_in; end select + select type(f); type is (cuda_field_t); f%data_d = arr; end select - end subroutine set_fields_cuda + end subroutine set_field_cuda - subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_cuda(self, arr, f) implicit none class(cuda_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + + select type(f); type is (cuda_field_t); arr = f%data_d; end select - select type(u); type is (cuda_field_t); u_out = u%data_d; end select - select type(v); type is (cuda_field_t); v_out = v%data_d; end select - select type(w); type is (cuda_field_t); w_out = w%data_d; end select + end subroutine get_field_cuda + + subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(cuda_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(cuda_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + end select - end subroutine get_fields_cuda + end subroutine init_cuda_poisson_fft end module m_cuda_backend 
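Note on the reduction above: scalar_product_cuda composes three stages, per-thread pencil sums inside the scalar_product kernel, a device-wide atomicadd, then an MPI_Allreduce across ranks. The net result is a plain global dot product over the (SZ, n, n_blocks) pencil layout. A minimal serial sketch of that semantics follows; the program, its name, and the sizes are illustrative only and not part of the patch. program dot_product_ref implicit none integer, parameter :: dp = kind(0.0d0) integer, parameter :: SZ = 4, n = 8, n_blocks = 2 real(dp) :: x(SZ, n, n_blocks), y(SZ, n, n_blocks), s integer :: i, j, b call random_number(x); call random_number(y) ! each (i, b) pair below corresponds to one GPU thread's pencil sum; ! the outer accumulation plays the role of atomicadd + MPI_Allreduce s = 0._dp do b = 1, n_blocks do i = 1, SZ do j = 1, n s = s + x(i, j, b)*y(i, j, b) end do end do end do print *, 'pencil-wise sum:', s, ' intrinsic check:', sum(x*y) end program dot_product_ref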
diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 new file mode 100644 index 00000000..32c5041f --- /dev/null +++ b/src/cuda/kernels/complex.f90 @@ -0,0 +1,295 @@ +module m_cuda_complex + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + + implicit none + +contains + + attributes(global) subroutine process_spectral_div_u( & + div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + implicit none + + ! Arguments + complex(dp), device, intent(inout), dimension(:, :, :) :: div + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + integer, value, intent(in) :: nx, ny, nz + + ! Local variables + integer :: i, j, b, ix, iy, iz + real(dp) :: tmp_r, tmp_c, div_r, div_c + + i = threadIdx%x + b = blockIdx%x + + do j = 1, nx + ! normalisation + div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) + div_c = aimag(div(i, j, b))/(nx*ny*nz) + + ! get the indices for x, y, z directions + ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) + tmp_c*az(iz) + div_c = tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = tmp_c*bx(ix) - tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! Solve Poisson + tmp_r = real(waves(i, j, b), kind=dp) + tmp_c = aimag(waves(i, j, b)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c + end if + + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) - tmp_c*az(iz) + div_c = -tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! update the entry + div(i, j, b) = cmplx(div_r, div_c, kind=dp) + end do + + end subroutine process_spectral_div_u + + attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) + + call syncthreads() + + ! 
copy into output array from shared + u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_x2y_T + + attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_x + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) + + call syncthreads() + + ! copy into output array from shared + u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_y2x_T + + attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_z + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( j + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) + + call syncthreads() + + ! copy into output array from shared + if ( i + (b_z - 1)*SZ <= nz ) & + u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) + + end subroutine reorder_cmplx_y2z_T + + attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( i + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) + + call syncthreads() + + ! 
copy into output array from shared + if ( j + (b_z - 1)*SZ <= nz ) & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) + + end subroutine reorder_cmplx_z2y_T + + attributes(global) subroutine reshapeDSF(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeDSF + + attributes(global) subroutine reshapeDSB(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeDSB + + attributes(global) subroutine reshapeCDSF(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeCDSF + + attributes(global) subroutine reshapeCDSB(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeCDSB + +end module m_cuda_complex diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 8ee984d0..11c74cac 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -95,6 +95,24 @@ attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) end subroutine reorder_y2z + attributes(global) subroutine reorder_z2x(u_x, u_z, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_x + real(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nz + + integer :: i, j, b_i, b_j, nx + + i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y + nx = gridDim%x + + do j = 1, nz + u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx) + end do + + end subroutine reorder_z2x + attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) implicit none @@ -181,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby + attributes(global) subroutine scalar_product(s, x, y, n) + implicit none + + real(dp), device, intent(inout) :: s + real(dp), device, intent(in), dimension(:, :, :) :: x, y + integer, value, intent(in) :: n + + real(dp) :: s_pncl !! 
pencil sum + integer :: i, j, b, ierr + + i = threadIdx%x + b = blockIdx%x + + s_pncl = 0._dp + do j = 1, n + s_pncl = s_pncl + x(i, j, b)*y(i, j, b) + end do + ierr = atomicadd(s, s_pncl) + + end subroutine scalar_product + attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) implicit none diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 new file mode 100644 index 00000000..3df787a2 --- /dev/null +++ b/src/cuda/poisson_fft.f90 @@ -0,0 +1,244 @@ +module m_cuda_poisson_fft + use cudafor + use cufft + + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_cuda_allocator, only: cuda_field_t + use m_cuda_common, only: SZ + use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & + reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & + process_spectral_div_u + + implicit none + + type, extends(poisson_fft_t) :: cuda_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + + !> Local domain sized arrays to store data in spectral space + complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + !> Local domain sized array storing the spectral equivalence constants + complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev + + real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & + ay_dev, by_dev, az_dev, bz_dev + + real(dp), device, allocatable, dimension(:, :, :) :: f_tmp + + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy + contains + procedure :: fft_forward => fft_forward_cuda + procedure :: fft_backward => fft_backward_cuda + procedure :: fft_postprocess => fft_postprocess_cuda + end type cuda_poisson_fft_t + + interface cuda_poisson_fft_t + module procedure init + end interface cuda_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(cuda_poisson_fft_t) :: poisson_fft + + integer :: nx, ny, nz + + integer :: ierrfft + integer(int_ptr_kind()) :: worksize + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz + + allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ)) + poisson_fft%waves_dev = poisson_fft%waves + + allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) + allocate (poisson_fft%ay_dev(ny), poisson_fft%by_dev(ny)) + allocate (poisson_fft%az_dev(nz), poisson_fft%bz_dev(nz)) + poisson_fft%ax_dev = poisson_fft%ax; poisson_fft%bx_dev = poisson_fft%bx + poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by + poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz + + allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) + + ! We can't currently ask allocator to pass us an array with + ! exact shape we want, so we allocate an extra array here. + ! This will be removed when allocator is fixed. + allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ)) + + ! 
set cufft plans + ierrfft = cufftCreate(poisson_fft%planD2Zz) + ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & + nz, 1, nz, nz/2+1, 1, nz/2+1, & + CUFFT_D2Z, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Dz) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & + nz/2+1, 1, nz/2+1, nz, 1, nz, & + CUFFT_Z2D, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zy) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & + ny, 1, ny, ny, 1, ny, & + CUFFT_Z2Z, nx*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zx) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & + nx, 1, nx, nx, 1, nx, & + CUFFT_Z2Z, ny*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) + + end function init + + subroutine fft_forward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(in) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! First reorder f into cartesian-like data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSF<<>>(self%f_tmp, f_dev) + + ! Forward FFT transform in z from real to complex + ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev) + + ! Reorder from z to y + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & + self%nx, self%nz/2 + 1) + + ! In-place forward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_FORWARD) + + ! Reorder from y to x + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & + self%nz/2 + 1) + + ! In-place forward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_FORWARD) + + end subroutine fft_forward_cuda + + subroutine fft_backward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(inout) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! In-place backward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_INVERSE) + + ! Reorder from x to y + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & + self%nz/2 + 1) + + ! 
In-place backward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_INVERSE) + + ! Reorder from y to z + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & + self%nx, self%nz/2 + 1) + + ! Backward FFT transform in z from complex to real + ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) + + ! Finally reorder f back into our specialist data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSB<<>>(f_dev, self%f_tmp) + + end subroutine fft_backward_cuda + + subroutine fft_postprocess_cuda(self) + implicit none + + class(cuda_poisson_fft_t) :: self + + complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr + type(dim3) :: blocks, threads + + ! Get some complex array pointers with right shape + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev + + ! Reshape from cartesian-like to our specialist data structure + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSB<<>>(c_dev, c_x_ptr) + + ! Postprocess + blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) + threads = dim3(SZ, 1, 1) + call process_spectral_div_u<<>>( & + c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & + self%az_dev, self%bz_dev & + ) + + ! Reshape from our specialist data structure to cartesian-like + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSF<<>>(c_x_ptr, c_dev) + + end subroutine fft_postprocess_cuda + +end module m_cuda_poisson_fft diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 813ca82f..1488f705 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -7,6 +7,7 @@ module m_omp_backend use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ + use m_omp_poisson_fft, only: omp_poisson_fft_t implicit none @@ -30,8 +31,10 @@ module m_omp_backend procedure :: sum_yintox => sum_yintox_omp procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp - procedure :: set_fields => set_fields_omp - procedure :: get_fields => get_fields_omp + procedure :: scalar_product => scalar_product_omp + procedure :: set_field => set_field_omp + procedure :: get_field => get_field_omp + procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist end type omp_backend_t @@ -313,6 +316,16 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp + real(dp) function scalar_product_omp(self, x, y) result(s) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(in) :: x, y + + s = 0._dp + + end function scalar_product_omp + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none @@ -338,31 +351,42 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) end subroutine copy_into_buffers - subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_omp(self, f, arr) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr 
- u%data = u_in - v%data = v_in - w%data = w_in + f%data = arr - end subroutine set_fields_omp + end subroutine set_field_omp - subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_omp(self, arr, f) implicit none class(omp_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + + arr = f%data - u_out = u%data - v_out = v%data - w_out = w%data + end subroutine get_field_omp + + subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(omp_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(omp_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + end select - end subroutine get_fields_omp + end subroutine init_omp_poisson_fft end module m_omp_backend diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 new file mode 100644 index 00000000..a796f1fc --- /dev/null +++ b/src/omp/poisson_fft.f90 @@ -0,0 +1,60 @@ +module m_omp_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_omp_common, only: SZ + + implicit none + + type, extends(poisson_fft_t) :: omp_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + contains + procedure :: fft_forward => fft_forward_omp + procedure :: fft_backward => fft_backward_omp + procedure :: fft_postprocess => fft_postprocess_omp + end type omp_poisson_fft_t + + interface omp_poisson_fft_t + module procedure init + end interface omp_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(omp_poisson_fft_t) :: poisson_fft + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + end function init + + subroutine fft_forward_omp(self, f_in) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward_omp + + subroutine fft_backward_omp(self, f_out) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward_omp + + subroutine fft_postprocess_omp(self) + implicit none + + class(omp_poisson_fft_t) :: self + end subroutine fft_postprocess_omp + +end module m_omp_poisson_fft diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 new file mode 100644 index 00000000..9ee0f575 --- /dev/null +++ b/src/poisson_fft.f90 @@ -0,0 +1,198 @@ +module m_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp, pi + use m_tdsops, only: dirps_t + + implicit none + + type, abstract :: poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. 
+ integer :: nx, ny, nz + complex(dp), allocatable, dimension(:, :, :) :: waves + real(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz + contains + procedure(fft_forward), deferred :: fft_forward + procedure(fft_backward), deferred :: fft_backward + procedure(fft_postprocess), deferred :: fft_postprocess + procedure :: base_init + procedure :: waves_set + end type poisson_fft_t + + abstract interface + subroutine fft_forward(self, f_in) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward + + subroutine fft_backward(self, f_out) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward + + subroutine fft_postprocess(self) + import :: poisson_fft_t + implicit none + + class(poisson_fft_t) :: self + end subroutine fft_postprocess + end interface + +contains + + subroutine base_init(self, xdirps, ydirps, zdirps, sz) + implicit none + + class(poisson_fft_t) :: self + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n + + allocate (self%ax(self%nx), self%bx(self%nx)) + allocate (self%ay(self%ny), self%by(self%ny)) + allocate (self%az(self%nz), self%bz(self%nz)) + + allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) + + ! waves_set requires some of the preprocessed tdsops variables. + call self%waves_set(xdirps, ydirps, zdirps, sz) + + end subroutine base_init + + subroutine waves_set(self, xdirps, ydirps, zdirps, sz) + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + implicit none + + class(poisson_fft_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & + exs, eys, ezs + + integer :: nx, ny, nz + real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 + complex(dp) :: xt2, yt2, zt2, xyzk + + integer :: i, j, ka, kb, ix, iy, iz + + nx = xdirps%n; ny = ydirps%n; nz = zdirps%n + + do i = 1, nx + self%ax(i) = sin((i-1)*pi/nx) + self%bx(i) = cos((i-1)*pi/nx) + end do + + do i = 1, ny + self%ay(i) = sin((i-1)*pi/ny) + self%by(i) = cos((i-1)*pi/ny) + end do + + do i = 1, nz + self%az(i) = sin((i-1)*pi/nz) + self%bz(i) = cos((i-1)*pi/nz) + end do + + ! Now kxyz + allocate(xkx(nx), xk2(nx), exs(nx)) + allocate(yky(ny), yk2(ny), eys(ny)) + allocate(zkz(nz), zk2(nz), ezs(nz)) + xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + + ! periodic-x + do i = 1, nx/2 + 1 + w = 2*pi*(i - 1)/nx + wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & + + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) + + xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) + exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) + xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 + end do + do i = nx/2 + 2, nx + xkx(i) = xkx(nx - i + 2) + exs(i) = exs(nx - i + 2) + xk2(i) = xk2(nx - i + 2) + end do + + !
periodic-y + do i = 1, ny/2 + 1 + w = 2*pi*(i - 1)/ny + wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & + + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) + + yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) + eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) + yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 + end do + do i = ny/2 + 2, ny + yky(i) = yky(ny-i+2) + eys(i) = eys(ny-i+2) + yk2(i) = yk2(ny-i+2) + end do + + ! periodic-z + do i = 1, nz/2 + 1 + w = 2*pi*(i - 1)/nz + wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & + + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) + + zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) + ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) + zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 + end do + + print*, 'waves array is correctly set only for a single rank run' + ! TODO: do loop ranges below are valid only for single rank runs + do ka = 1, nz/2 + 1 + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ix = j; iy = (kb - 1)*sz + i; iz = ka + rlexs = real(exs(ix), kind=dp)*xdirps%d + rleys = real(eys(iy), kind=dp)*ydirps%d + rlezs = real(ezs(iz), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk + end do + end do + end do + end do + + end subroutine waves_set + +end module m_poisson_fft diff --git a/src/solver.f90 b/src/solver.f90 index 64339411..c9a04a11 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,7 +1,9 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -36,23 +38,39 @@ module m_solver !! for later use. 
real(dp) :: dt, nu + integer :: n_iters, n_output class(field_t), pointer :: u, v, w class(base_backend_t), pointer :: backend class(dirps_t), pointer :: xdirps, ydirps, zdirps class(time_intg_t), pointer :: time_integrator + procedure(poisson_solver), pointer :: poisson => null() contains procedure :: transeq procedure :: divergence_v2p procedure :: gradient_p2v procedure :: curl + procedure :: output procedure :: run end type solver_t + abstract interface + subroutine poisson_solver(self, pressure, div_u) + import :: solver_t + import :: field_t + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + end subroutine poisson_solver + end interface + interface solver_t module procedure init end interface solver_t + contains function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & @@ -68,8 +86,8 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init integer :: dims(3) - real(dp) :: dx, dy, dz - integer :: nx, ny, nz + real(dp) :: x, y, z + integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz solver%backend => backend solver%time_integrator => time_integrator @@ -88,24 +106,53 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & allocate(v_init(dims(1), dims(2), dims(3))) allocate(w_init(dims(1), dims(2), dims(3))) - u_init = 0 - v_init = 0 - w_init = 0 + solver%dt = globs%dt + solver%backend%nu = globs%nu + solver%n_iters = globs%n_iters + solver%n_output = globs%n_output - call solver%backend%set_fields( & - solver%u, solver%v, solver%w, u_init, v_init, w_init & - ) + sz = dims(1) + nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc + do ka = 1, nz + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ! Mapping to ix, iy, iz depends on global group numbering + ix = j; iy = (kb - 1)*sz + i; iz = ka + x = (ix - 1)*globs%dx + y = (iy - 1)*globs%dy + z = (iz - 1)*globs%dz + + b = ka + (kb - 1)*xdirps%n + u_init(i, j, b) = sin(x)*cos(y)*cos(z) + v_init(i, j, b) =-cos(x)*sin(y)*cos(z) + w_init(i, j, b) = 0 + end do + end do + end do + end do + + call solver%backend%set_field(solver%u, u_init) + call solver%backend%set_field(solver%v, v_init) + call solver%backend%set_field(solver%w, w_init) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' - nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc - dx = globs%dx; dy = globs%dy; dz = globs%dz - ! Allocate and set the tdsops - call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) - call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) - call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) + call allocate_tdsops(solver%xdirps, nx, globs%dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, globs%dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend) + + select case (globs%poisson_solver_type) + case (POISSON_SOLVER_FFT) + print*, 'Poisson solver: FFT' + call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) + solver%poisson => poisson_fft + case (POISSON_SOLVER_CG) + print*, 'Poisson solver: CG, not yet implemented' + solver%poisson => poisson_cg + end select end function init @@ -136,6 +183,10 @@ subroutine allocate_tdsops(dirps, nx, dx, backend) end subroutine subroutine transeq(self, du, dv, dw, u, v, w) + !! Skew-symmetric form of convection-diffusion terms in the + !! incompressible Navier-Stokes momentum equations, excluding + !! pressure terms. + !!
Inputs from velocity grid and outputs to velocity grid. implicit none class(solver_t) :: self @@ -217,6 +268,8 @@ subroutine transeq(self, du, dv, dw, u, v, w) end subroutine transeq subroutine divergence_v2p(self, div_u, u, v, w) + !! Divergence of a vector field (u, v, w). + !! Inputs from velocity grid and outputs to pressure grid. implicit none class(solver_t) :: self @@ -279,8 +332,8 @@ subroutine divergence_v2p(self, div_u, u, v, w) u_z => self%backend%allocator%get_block() w_z => self%backend%allocator%get_block() - ! dv_y = dv_y + dw_y - call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y) + ! du_y = dv_y + du_y + call self%backend%vecadd(1._dp, dv_y, 1._dp, du_y) ! reorder from y to z call self%backend%reorder(u_z, du_y, RDR_Y2Z) @@ -312,6 +365,8 @@ subroutine divergence_v2p(self, div_u, u, v, w) end subroutine divergence_v2p subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) + !! Gradient of a scalar field 'pressure'. + !! Inputs from pressure grid and outputs to velocity grid. implicit none class(solver_t) :: self @@ -379,9 +434,9 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) call self%backend%tds_solve(dpdx, p_sx_x, self%xdirps, & self%xdirps%stagder_p2v) call self%backend%tds_solve(dpdy, dpdy_sx_x, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_p2v) call self%backend%tds_solve(dpdz, dpdz_sx_x, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_p2v) ! release temporary x fields call self%backend%allocator%release_block(p_sx_x) @@ -390,80 +445,176 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) end subroutine gradient_p2v - subroutine curl(self, o_x, o_y, o_z, u, v, w) + subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) + !! Curl of a vector field (u, v, w). + !! Inputs from velocity grid and outputs to velocity grid. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: o_x, o_y, o_z !! omega_x/_y/_z + !> Vector components of the output vector field Omega + class(field_t), intent(inout) :: o_i_hat, o_j_hat, o_k_hat class(field_t), intent(in) :: u, v, w - class(field_t), pointer :: u_y, w_y, du_y, u_z, v_z, du_z, dv_z - - ! o_x = dw/dy - dv/dz - ! o_y = du/dz - dw/dx - ! o_z = dv/dx - du/dy + class(field_t), pointer :: u_y, u_z, v_z, w_y, dwdy_y, dvdz_z, dvdz_x, & + dudz_z, dudz_x, dudy_y, dudy_x - ! obtain dw/dx, dv/dx and store them directly in omega_y, omega_z - call self%backend%tds_solve(o_y, w, self%xdirps, self%xdirps%der1st) - call self%backend%tds_solve(o_z, v, self%xdirps, self%xdirps%der1st) + ! omega_i_hat = dw/dy - dv/dz + ! omega_j_hat = du/dz - dw/dx + ! omega_k_hat = dv/dx - du/dy - u_y => self%backend%allocator%get_block() + ! omega_i_hat + ! dw/dy w_y => self%backend%allocator%get_block() - - call self%backend%reorder(u_y, u, RDR_X2Y) + dwdy_y => self%backend%allocator%get_block() call self%backend%reorder(w_y, w, RDR_X2Y) + call self%backend%tds_solve(dwdy_y, w_y, self%ydirps, self%ydirps%der1st) - du_y => self%backend%allocator%get_block() - - ! obtain du/dy, dw/dy - ! store du/dy in a temporary field to add into omega_z later - ! dw/dy can be stored directly in omega_x as it is empty - call self%backend%tds_solve(du_y, u_y, self%ydirps, self%ydirps%der1st) - call self%backend%tds_solve(o_x, w_y, self%ydirps, self%ydirps%der1st) + call self%backend%reorder(o_i_hat, dwdy_y, RDR_Y2X) - call self%backend%allocator%release_block(u_y) call self%backend%allocator%release_block(w_y) + call self%backend%allocator%release_block(dwdy_y) - ! 
omega_z = dv/dz - du/dy - call self%backend%vecadd(-1._dp, du_y, 1._dp, o_z) + ! dv/dz + v_z => self%backend%allocator%get_block() + dvdz_z => self%backend%allocator%get_block() + call self%backend%reorder(v_z, v, RDR_X2Z) + call self%backend%tds_solve(dvdz_z, v_z, self%zdirps, self%zdirps%der1st) - call self%backend%allocator%release_block(du_y) + dvdz_x => self%backend%allocator%get_block() + call self%backend%reorder(dvdz_x, dvdz_z, RDR_Z2X) - u_z => self%backend%allocator%get_block() - v_z => self%backend%allocator%get_block() - du_z => self%backend%allocator%get_block() - dv_z => self%backend%allocator%get_block() + call self%backend%allocator%release_block(v_z) + call self%backend%allocator%release_block(dvdz_z) - ! obtain du/dz, dv/dz and store them in temporary fields - call self%backend%tds_solve(du_z, u_z, self%zdirps, self%zdirps%der1st) - call self%backend%tds_solve(dv_z, v_z, self%zdirps, self%zdirps%der1st) + ! omega_i_hat = dw/dy - dv/dz + call self%backend%vecadd(-1._dp, dvdz_x, 1._dp, o_i_hat) - ! omega_x = dw/dy - dv/dz - call self%backend%vecadd(-1._dp, dv_z, 1._dp, o_x) - ! omega_y = du/dz - dw/dx - call self%backend%vecadd(1._dp, du_z, -1._dp, o_y) + call self%backend%allocator%release_block(dvdz_x) + + ! omega_j_hat + ! du/dz + u_z => self%backend%allocator%get_block() + dudz_z => self%backend%allocator%get_block() + call self%backend%reorder(u_z, u, RDR_X2Z) + call self%backend%tds_solve(dudz_z, u_z, self%zdirps, self%zdirps%der1st) + + dudz_x => self%backend%allocator%get_block() + call self%backend%reorder(dudz_x, dudz_z, RDR_Z2X) call self%backend%allocator%release_block(u_z) - call self%backend%allocator%release_block(v_z) - call self%backend%allocator%release_block(du_z) - call self%backend%allocator%release_block(dv_z) + call self%backend%allocator%release_block(dudz_z) + + ! dw/dx + call self%backend%tds_solve(o_j_hat, w, self%xdirps, self%xdirps%der1st) + + ! omega_j_hat = du/dz - dw/dx + call self%backend%vecadd(1._dp, dudz_x, -1._dp, o_j_hat) + + call self%backend%allocator%release_block(dudz_x) + + ! omega_k_hat + ! dv/dx + call self%backend%tds_solve(o_k_hat, v, self%xdirps, self%xdirps%der1st) + + ! du/dy + u_y => self%backend%allocator%get_block() + dudy_y => self%backend%allocator%get_block() + call self%backend%reorder(u_y, u, RDR_X2Y) + call self%backend%tds_solve(dudy_y, u_y, self%ydirps, self%ydirps%der1st) + + dudy_x => self%backend%allocator%get_block() + call self%backend%reorder(dudy_x, dudy_y, RDR_Y2X) + + call self%backend%allocator%release_block(u_y) + call self%backend%allocator%release_block(dudy_y) + + ! omega_k_hat = dv/dx - du/dy + call self%backend%vecadd(-1._dp, dudy_x, 1._dp, o_k_hat) + + call self%backend%allocator%release_block(dudy_x) end subroutine curl - subroutine run(self, n_iter, u_out, v_out, w_out) + subroutine poisson_fft(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + ! call forward FFT + ! output array in spectral space is stored at poisson_fft class + call self%backend%poisson_fft%fft_forward(div_u) + + ! postprocess + call self%backend%poisson_fft%fft_postprocess + + ! 
call backward FFT + call self%backend%poisson_fft%fft_backward(pressure) + + end subroutine poisson_fft + + subroutine poisson_cg(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + end subroutine poisson_cg + + subroutine output(self, t, u_out) + implicit none + + class(solver_t), intent(in) :: self + real(dp), intent(in) :: t + real(dp), dimension(:, :, :), intent(inout) :: u_out + + class(field_t), pointer :: du, dv, dw + integer :: ngrid + + ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n + print*, 'time = ', t + + du => self%backend%allocator%get_block() + dv => self%backend%allocator%get_block() + dw => self%backend%allocator%get_block() + + call self%curl(du, dv, dw, self%u, self%v, self%w) + print*, 'enstrophy:', 0.5_dp*( & + self%backend%scalar_product(du, du) & + + self%backend%scalar_product(dv, dv) & + + self%backend%scalar_product(dw, dw) & + )/ngrid + + call self%backend%allocator%release_block(du) + call self%backend%allocator%release_block(dv) + call self%backend%allocator%release_block(dw) + + call self%divergence_v2p(du, self%u, self%v, self%w) + call self%backend%get_field(u_out, du) + print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid + + end subroutine output + + subroutine run(self, u_out, v_out, w_out) implicit none class(solver_t), intent(in) :: self - integer, intent(in) :: n_iter real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out class(field_t), pointer :: du, dv, dw, div_u, pressure, dpdx, dpdy, dpdz + real(dp) :: t integer :: i + print*, 'initial conditions' + t = 0._dp + call self%output(t, u_out) + print*, 'start run' - do i = 1, n_iter + do i = 1, self%n_iters du => self%backend%allocator%get_block() dv => self%backend%allocator%get_block() dw => self%backend%allocator%get_block() @@ -485,7 +636,7 @@ subroutine run(self, n_iter, u_out, v_out, w_out) pressure => self%backend%allocator%get_block() - !call self%poisson(pressure, div_u) + call self%poisson(pressure, div_u) call self%backend%allocator%release_block(div_u) @@ -505,13 +656,18 @@ subroutine run(self, n_iter, u_out, v_out, w_out) call self%backend%allocator%release_block(dpdx) call self%backend%allocator%release_block(dpdy) call self%backend%allocator%release_block(dpdz) + + if ( mod(i, self%n_output) == 0 ) then + t = i*self%dt + call self%output(t, u_out) + end if end do print*, 'run end' - call self%backend%get_fields( & - u_out, v_out, w_out, self%u, self%v, self%w & - ) + call self%backend%get_field(u_out, self%u) + call self%backend%get_field(v_out, self%v) + call self%backend%get_field(w_out, self%w) end subroutine run diff --git a/src/tdsops.f90 b/src/tdsops.f90 index c159cdd2..3db7b4d9 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -27,6 +27,7 @@ module m_tdsops dist_sa, dist_sc, & !! back subs. dist_af !! 
the auxiliary factors real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) + real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp integer :: n, n_halo contains procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess @@ -46,6 +47,7 @@ module m_tdsops stagder_v2p, stagder_p2v, & interpl_v2p, interpl_p2v integer :: nrank, nproc, pnext, pprev, n, n_blocks + real(dp) :: L, d end type dirps_t contains @@ -154,6 +156,9 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = afi; self%b = bfi + self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, & 0._dp, & afi, bfi, 0._dp, 0._dp] @@ -321,6 +326,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = asi; self%b = bsi; self%c = csi; self%d = dsi + self%coeffs(:) = [dsi, csi, bsi, asi, & -2._dp*(asi + bsi + csi + dsi), & asi, bsi, csi, dsi] @@ -511,6 +519,9 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aici; self%b = bici; self%c = cici; self%d = dici + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, dici, cici, bici, & @@ -633,6 +644,9 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aci; self%b = bci + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index d12a942e..8a725169 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -61,6 +61,10 @@ subroutine step(self, u, v, w, du, dv, dw, dt) real(dp), intent(in) :: dt + call self%backend%vecadd(dt, du, 1._dp, u) + call self%backend%vecadd(dt, dv, 1._dp, v) + call self%backend%vecadd(dt, dw, 1._dp, w) + end subroutine step subroutine adams_bashford_1st(vels, olds, coeffs) diff --git a/src/xcompact.f90 b/src/xcompact.f90 index be9295d9..635db2a7 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -3,7 +3,8 @@ program xcompact use m_allocator use m_base_backend - use m_common, only: pi, globs_t, set_pprev_pnext + use m_common, only: pi, globs_t, set_pprev_pnext, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_solver, only: solver_t use m_time_integrator, only: time_intg_t use m_tdsops, only: tdsops_t @@ -55,13 +56,21 @@ program xcompact ! read L_x/y/z from the input file globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi + xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz ! read ns from the input file - globs%nx = 512; globs%ny = 512; globs%nz = 512 + globs%nx = 256; globs%ny = 256; globs%nz = 256 + + globs%dt = 0.001_dp + globs%nu = 1._dp/1600._dp + globs%n_iters = 20000 + globs%n_output = 100 ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 + globs%poisson_solver_type = POISSON_SOLVER_FFT + ! 
Lets allow a 1D decomposition for the moment !globs%nproc_x = nproc @@ -91,6 +100,8 @@ program xcompact globs%dy = globs%Ly/globs%ny globs%dz = globs%Lz/globs%nz + xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz + xdirps%n = globs%nx_loc ydirps%n = globs%ny_loc zdirps%n = globs%nz_loc @@ -117,8 +128,6 @@ program xcompact print*, 'OpenMP backend instantiated' #endif - backend%nu = 1._dp - allocate(u(SZ, globs%nx_loc, globs%n_groups_x)) allocate(v(SZ, globs%nx_loc, globs%n_groups_x)) allocate(w(SZ, globs%nx_loc, globs%n_groups_x)) @@ -130,7 +139,7 @@ program xcompact call cpu_time(t_start) - call solver%run(100, u, v, w) + call solver%run(u, v, w) call cpu_time(t_end) @@ -138,4 +147,6 @@ program xcompact print*, 'norms', norm2(u), norm2(v), norm2(w) + call MPI_Finalize(ierr) + end program xcompact diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index 833b0774..bacd3450 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -6,7 +6,7 @@ program test_cuda_reorder use m_common, only: dp use m_cuda_common, only: SZ use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & - reorder_y2z, reorder_z2y + reorder_y2z, reorder_z2x, reorder_z2y implicit none @@ -83,50 +83,46 @@ program test_cuda_reorder threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_temp_d, nx, nz) - ! store this in host - u_o = u_o_d + ! store initial y oriented field + u_temp = u_temp_d - ! and zeroize u_o_d again in any case - u_o_d = 0 + ! z oriented field into y + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) - ! reorder initial random field into z orientation - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_x2z<<>>(u_o_d, u_i_d, nz) - u_temp = u_o_d + u_o = u_temp_d - ! compare two z oriented fields + ! compare two y oriented fields norm_u = norm2(u_o - u_temp) if (nrank == 0) then if ( norm_u > tol ) then allpass = .false. - write(stderr, '(a)') 'Check reorder x2z and y2z... failed' + write(stderr, '(a)') 'Check reorder y2z and z2y... failed' else - write(stderr, '(a)') 'Check reorder x2z and y2z... passed' + write(stderr, '(a)') 'Check reorder y2z and z2y... passed' end if end if - ! z oriented field into y - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) - - ! zeroize just in case for reusing - u_o_d = 0 + ! reorder initial random field into z orientation - blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, nz) - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_y2x<<>>(u_o_d, u_temp_d, nz) - u_o = u_o_d + ! z oriented field into x + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_temp_d, u_o_d, nz) + u_o = u_temp_d - ! and check whether it matches the initial random field + ! compare two x oriented fields norm_u = norm2(u_o - u_i) if (nrank == 0) then if ( norm_u > tol ) then allpass = .false. - write(stderr, '(a)') 'Check reorder z2y and y2x... failed' + write(stderr, '(a)') 'Check reorder x2z and z2x... failed' else - write(stderr, '(a)') 'Check reorder z2y and y2x... passed' + write(stderr, '(a)') 'Check reorder x2z and z2x...
passed' end if end if @@ -171,6 +167,16 @@ program test_cuda_reorder call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_o_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call cpu_time(tstart) do i = 1, n_iters blocks = dim3(nx/SZ, ny/SZ, nz)
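Note on the perf loops above: each one reports through checkperf with a memops argument of 2._dp, since a reorder reads every degree of freedom once and writes it once. A minimal sketch of the implied effective-bandwidth arithmetic follows; it is illustrative only (checkperf itself lives in the shared test utilities and may differ), and the grid size, iteration count, and timing are made-up numbers. program bandwidth_ref implicit none integer, parameter :: dp = kind(0.0d0) integer(kind=8) :: ndof integer :: n_iters real(dp) :: elapsed, memops, bw ! illustrative numbers: a 256^3 field reordered 1000 times in 2 seconds ndof = 256_8**3 n_iters = 1000 elapsed = 2._dp memops = 2._dp ! one read + one write per degree of freedom ! bytes moved per iteration = memops*ndof*8, since real(dp) is 8 bytes bw = memops*real(ndof, dp)*8._dp*n_iters/elapsed/1.0e9_dp print '(a, f8.3, a)', 'effective bandwidth: ', bw, ' GB/s' end program bandwidth_ref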