diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cbddf720..00d75fee 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,8 +20,8 @@ set(CUDASRC
   cuda/common.f90
   cuda/exec_dist.f90
   cuda/kernels/distributed.f90
-  cuda/kernels/complex.f90
   cuda/kernels/reorder.f90
+  cuda/kernels/spectral_processing.f90
   cuda/poisson_fft.f90
   cuda/sendrecv.f90
   cuda/tdsops.f90
diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90
deleted file mode 100644
index fcdfd1da..00000000
--- a/src/cuda/kernels/complex.f90
+++ /dev/null
@@ -1,295 +0,0 @@
-module m_cuda_complex
-  use cudafor
-
-  use m_common, only: dp
-  use m_cuda_common, only: SZ
-
-  implicit none
-
-contains
-
-  attributes(global) subroutine process_spectral_div_u( &
-    div, waves, nx, ny, nz, ax, bx, ay, by, az, bz &
-    )
-    implicit none
-
-    ! Arguments
-    complex(dp), device, intent(inout), dimension(:, :, :) :: div
-    complex(dp), device, intent(in), dimension(:, :, :) :: waves
-    real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz
-    integer, value, intent(in) :: nx, ny, nz
-
-    ! Local variables
-    integer :: i, j, b, ix, iy, iz
-    real(dp) :: tmp_r, tmp_c, div_r, div_c
-
-    i = threadIdx%x
-    b = blockIdx%x
-
-    do j = 1, nx
-      ! normalisation
-      div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz)
-      div_c = aimag(div(i, j, b))/(nx*ny*nz)
-
-      ! get the indices for x, y, z directions
-      ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1
-
-      ! post-process forward
-      ! post-process in z
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*bz(iz) + tmp_c*az(iz)
-      div_c = tmp_c*bz(iz) - tmp_r*az(iz)
-
-      ! post-process in y
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*by(iy) + tmp_c*ay(iy)
-      div_c = tmp_c*by(iy) - tmp_r*ay(iy)
-      if (iy > ny/2 + 1) div_r = -div_r
-      if (iy > ny/2 + 1) div_c = -div_c
-
-      ! post-process in x
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
-      div_c = tmp_c*bx(ix) - tmp_r*ax(ix)
-      if (ix > nx/2 + 1) div_r = -div_r
-      if (ix > nx/2 + 1) div_c = -div_c
-
-      ! Solve Poisson
-      tmp_r = real(waves(i, j, b), kind=dp)
-      tmp_c = aimag(waves(i, j, b))
-      if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then
-        div_r = 0._dp; div_c = 0._dp
-      else
-        div_r = -div_r/tmp_r
-        div_c = -div_c/tmp_c
-      end if
-
-      ! post-process backward
-      ! post-process in z
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*bz(iz) - tmp_c*az(iz)
-      div_c = -tmp_c*bz(iz) - tmp_r*az(iz)
-
-      ! post-process in y
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*by(iy) + tmp_c*ay(iy)
-      div_c = tmp_c*by(iy) - tmp_r*ay(iy)
-      if (iy > ny/2 + 1) div_r = -div_r
-      if (iy > ny/2 + 1) div_c = -div_c
-
-      ! post-process in x
-      tmp_r = div_r
-      tmp_c = div_c
-      div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
-      div_c = -tmp_c*bx(ix) + tmp_r*ax(ix)
-      if (ix > nx/2 + 1) div_r = -div_r
-      if (ix > nx/2 + 1) div_c = -div_c
-
-      ! update the entry
-      div(i, j, b) = cmplx(div_r, div_c, kind=dp)
-    end do
-
-  end subroutine process_spectral_div_u
-
-  attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: u_y
-    complex(dp), device, intent(in), dimension(:, :, :) :: u_x
-    integer, value, intent(in) :: nz
-
-    complex(dp), shared :: tile(SZ, SZ)
-
-    integer :: i, j, b_i, b_j, b_k
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
-
-    ! copy into shared
-    tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1))
-
-    call syncthreads()
-
-    ! copy into output array from shared
-    u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i)
-
-  end subroutine reorder_cmplx_x2y_T
-
-  attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: u_x
-    complex(dp), device, intent(in), dimension(:, :, :) :: u_y
-    integer, value, intent(in) :: nz
-
-    complex(dp), shared :: tile(SZ, SZ)
-
-    integer :: i, j, b_i, b_j, b_k
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
-
-    ! copy into shared
-    tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1))
-
-    call syncthreads()
-
-    ! copy into output array from shared
-    u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i)
-
-  end subroutine reorder_cmplx_y2x_T
-
-  attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: u_z
-    complex(dp), device, intent(in), dimension(:, :, :) :: u_y
-    integer, value, intent(in) :: nx, nz
-
-    complex(dp), shared :: tile(SZ, SZ)
-
-    integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z
-
-    i = threadIdx%x
-    j = threadIdx%y
-    k = threadIdx%z
-
-    b_x = blockIdx%z
-    b_y = blockIdx%y
-    b_z = blockIdx%x
-
-    ! copy into shared
-    if (j + (b_z - 1)*SZ <= nz) &
-      tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, &
-                       j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz)
-
-    call syncthreads()
-
-    ! copy into output array from shared
-    if (i + (b_z - 1)*SZ <= nz) &
-      u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i)
-
-  end subroutine reorder_cmplx_y2z_T
-
-  attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: u_y
-    complex(dp), device, intent(in), dimension(:, :, :) :: u_z
-    integer, value, intent(in) :: nx, nz
-
-    complex(dp), shared :: tile(SZ, SZ)
-
-    integer :: i, j, k, b_x, b_y, b_z
-
-    i = threadIdx%x
-    j = threadIdx%y
-    k = threadIdx%z
-
-    b_x = blockIdx%z
-    b_y = blockIdx%y
-    b_z = blockIdx%x
-
-    ! copy into shared
-    if (i + (b_z - 1)*SZ <= nz) &
-      tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx)
-
-    call syncthreads()
-
-    ! copy into output array from shared
-    if (j + (b_z - 1)*SZ <= nz) &
-      u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, &
-          j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i)
-
-  end subroutine reorder_cmplx_z2y_T
-
-  attributes(global) subroutine reshapeDSF(uout, uin)
-    implicit none
-
-    real(dp), device, intent(out), dimension(:, :, :) :: uout
-    real(dp), device, intent(in), dimension(:, :, :) :: uin
-
-    real(dp), shared :: tile(SZ + 1, SZ)
-
-    integer :: i, j, b_i, b
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b = blockIdx%y
-
-    tile(i, j) = uin(i, j + (b_i - 1)*SZ, b)
-
-    call syncthreads()
-
-    uout(i + (b_i - 1)*SZ, j, b) = tile(j, i)
-
-  end subroutine reshapeDSF
-
-  attributes(global) subroutine reshapeDSB(uout, uin)
-    implicit none
-
-    real(dp), device, intent(out), dimension(:, :, :) :: uout
-    real(dp), device, intent(in), dimension(:, :, :) :: uin
-
-    real(dp), shared :: tile(SZ + 1, SZ)
-
-    integer :: i, j, b_i, b
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b = blockIdx%y
-
-    tile(i, j) = uin(i + (b_i - 1)*SZ, j, b)
-
-    call syncthreads()
-
-    uout(i, j + (b_i - 1)*SZ, b) = tile(j, i)
-
-  end subroutine reshapeDSB
-
-  attributes(global) subroutine reshapeCDSF(uout, uin)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: uout
-    complex(dp), device, intent(in), dimension(:, :, :) :: uin
-
-    complex(dp), shared :: tile(SZ + 1, SZ)
-
-    integer :: i, j, b_i, b
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b = blockIdx%y
-
-    tile(i, j) = uin(i, j + (b_i - 1)*SZ, b)
-
-    call syncthreads()
-
-    uout(i + (b_i - 1)*SZ, j, b) = tile(j, i)
-
-  end subroutine reshapeCDSF
-
-  attributes(global) subroutine reshapeCDSB(uout, uin)
-    implicit none
-
-    complex(dp), device, intent(out), dimension(:, :, :) :: uout
-    complex(dp), device, intent(in), dimension(:, :, :) :: uin
-
-    complex(dp), shared :: tile(SZ + 1, SZ)
-
-    integer :: i, j, b_i, b
-
-    i = threadIdx%x; j = threadIdx%y
-    b_i = blockIdx%x; b = blockIdx%y
-
-    tile(i, j) = uin(i + (b_i - 1)*SZ, j, b)
-
-    call syncthreads()
-
-    uout(i, j + (b_i - 1)*SZ, b) = tile(j, i)
-
-  end subroutine reshapeCDSB
-
-end module m_cuda_complex
diff --git a/src/cuda/kernels/spectral_processing.f90 b/src/cuda/kernels/spectral_processing.f90
new file mode 100644
index 00000000..76d717de
--- /dev/null
+++ b/src/cuda/kernels/spectral_processing.f90
@@ -0,0 +1,102 @@
+module m_cuda_spectral
+  use cudafor
+
+  use m_common, only: dp
+
+  implicit none
+
+contains
+
+  attributes(global) subroutine process_spectral_div_u( &
+    div_u, waves, nx, ny, nz, ax, bx, ay, by, az, bz &
+    )
+    !! Post-processes the divergence of velocity in spectral space, including
+    !! scaling w.r.t. grid size.
+    !!
+    !! Ref. JCP 228 (2009), 5989–6015, Sec 4
+    implicit none
+
+    !> Divergence of velocity in spectral space
+    complex(dp), device, intent(inout), dimension(:, :, :) :: div_u
+    !> Spectral equivalence constants
+    complex(dp), device, intent(in), dimension(:, :, :) :: waves
+    real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz
+    !> Grid size
+    integer, value, intent(in) :: nx, ny, nz
+
+    integer :: i, j, k
+    real(dp) :: tmp_r, tmp_c, div_r, div_c
+
+    j = threadIdx%x + (blockIdx%x - 1)*blockDim%x
+    k = blockIdx%y
+
+    if (j <= ny) then
+      do i = 1, nx/2 + 1
+        ! normalisation
+        div_r = real(div_u(i, j, k), kind=dp)/(nx*ny*nz)
+        div_c = aimag(div_u(i, j, k))/(nx*ny*nz)
+
+        ! post-process forward
+        ! post-process in z
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*bz(k) + tmp_c*az(k)
+        div_c = tmp_c*bz(k) - tmp_r*az(k)
+
+        ! post-process in y
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*by(j) + tmp_c*ay(j)
+        div_c = tmp_c*by(j) - tmp_r*ay(j)
+        if (j > ny/2 + 1) div_r = -div_r
+        if (j > ny/2 + 1) div_c = -div_c
+
+        ! post-process in x
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*bx(i) + tmp_c*ax(i)
+        div_c = tmp_c*bx(i) - tmp_r*ax(i)
+        if (i > nx/2 + 1) div_r = -div_r
+        if (i > nx/2 + 1) div_c = -div_c
+
+        ! Solve Poisson
+        tmp_r = real(waves(i, j, k), kind=dp)
+        tmp_c = aimag(waves(i, j, k))
+        if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then
+          div_r = 0._dp; div_c = 0._dp
+        else
+          div_r = -div_r/tmp_r
+          div_c = -div_c/tmp_c
+        end if
+
+        ! post-process backward
+        ! post-process in z
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*bz(k) - tmp_c*az(k)
+        div_c = -tmp_c*bz(k) - tmp_r*az(k)
+
+        ! post-process in y
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*by(j) + tmp_c*ay(j)
+        div_c = tmp_c*by(j) - tmp_r*ay(j)
+        if (j > ny/2 + 1) div_r = -div_r
+        if (j > ny/2 + 1) div_c = -div_c
+
+        ! post-process in x
+        tmp_r = div_r
+        tmp_c = div_c
+        div_r = tmp_r*bx(i) + tmp_c*ax(i)
+        div_c = -tmp_c*bx(i) + tmp_r*ax(i)
+        if (i > nx/2 + 1) div_r = -div_r
+        if (i > nx/2 + 1) div_c = -div_c
+
+        ! update the entry
+        div_u(i, j, k) = cmplx(div_r, div_c, kind=dp)
+      end do
+    end if
+
+  end subroutine process_spectral_div_u
+
+end module m_cuda_spectral
diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90
index 015e9150..0b27b537 100644
--- a/src/cuda/poisson_fft.f90
+++ b/src/cuda/poisson_fft.f90
@@ -1,4 +1,6 @@
 module m_cuda_poisson_fft
+  use iso_c_binding, only: c_loc, c_ptr, c_f_pointer
+  use iso_fortran_env, only: stderr => error_unit
   use cudafor
   use cufft
 
@@ -8,29 +10,25 @@ module m_cuda_poisson_fft
   use m_tdsops, only: dirps_t
 
   use m_cuda_allocator, only: cuda_field_t
-  use m_cuda_common, only: SZ
-  use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, &
-                            reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, &
-                            process_spectral_div_u
+  use m_cuda_spectral, only: process_spectral_div_u
 
   implicit none
 
   type, extends(poisson_fft_t) :: cuda_poisson_fft_t
-      !! FFT based Poisson solver
-      !! It can only handle 1D decompositions along z direction.
+    !! FFT based Poisson solver
 
-    !> Local domain sized arrays to store data in spectral space
-    complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev
+    !> Local domain sized array to store data in spectral space
+    complex(dp), device, allocatable, dimension(:, :, :) :: c_w_dev
     !> Local domain sized array storing the spectral equivalence constants
     complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev
+    !> cufft requires a local domain sized storage
+    complex(dp), device, allocatable, dimension(:) :: fft_worksize
 
     real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, &
                                                    ay_dev, by_dev, &
                                                    az_dev, bz_dev
 
-    real(dp), device, allocatable, dimension(:, :, :) :: f_tmp
-
-    integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy
+    integer :: plan3D_fw, plan3D_bw
   contains
     procedure :: fft_forward => fft_forward_cuda
     procedure :: fft_backward => fft_backward_cuda
@@ -54,14 +52,14 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft)
 
     integer :: nx, ny, nz
 
-    integer :: ierrfft
+    integer :: ierr
     integer(int_ptr_kind()) :: worksize
 
-    call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ)
+    call poisson_fft%base_init(xdirps, ydirps, zdirps)
 
     nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz
 
-    allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ))
+    allocate (poisson_fft%waves_dev(nx/2 + 1, ny, nz))
     poisson_fft%waves_dev = poisson_fft%waves
 
     allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx))
@@ -71,39 +69,27 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft)
     poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by
     poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz
 
-    allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1)))
-    allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1)))
-    allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1)))
-
-    ! We can't currently ask allocator to pass us an array with
-    ! exact shape we want, so we allocate an extra array here.
-    ! This will be removed when allocator is fixed.
-    allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ))
-
-    ! set cufft plans
-    ierrfft = cufftCreate(poisson_fft%planD2Zz)
-    ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, &
-                                nz, 1, nz, nz/2 + 1, 1, nz/2 + 1, &
-                                CUFFT_D2Z, nx*ny, worksize)
-    ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev)
-
-    ierrfft = cufftCreate(poisson_fft%planZ2Dz)
-    ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, &
-                                nz/2 + 1, 1, nz/2 + 1, nz, 1, nz, &
-                                CUFFT_Z2D, nx*ny, worksize)
-    ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev)
-
-    ierrfft = cufftCreate(poisson_fft%planZ2Zy)
-    ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, &
-                                ny, 1, ny, ny, 1, ny, &
-                                CUFFT_Z2Z, nx*(nz/2 + 1), worksize)
-    ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev)
-
-    ierrfft = cufftCreate(poisson_fft%planZ2Zx)
-    ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, &
-                                nx, 1, nx, nx, 1, nx, &
-                                CUFFT_Z2Z, ny*(nz/2 + 1), worksize)
-    ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev)
+    allocate (poisson_fft%c_w_dev(nx/2 + 1, ny, nz))
+    allocate (poisson_fft%fft_worksize((nx/2 + 1)*ny*nz))
+
+    ! 3D plans
+    ierr = cufftCreate(poisson_fft%plan3D_fw)
+    ierr = cufftMakePlan3D(poisson_fft%plan3D_fw, nz, ny, nx, CUFFT_D2Z, &
+                           worksize)
+    ierr = cufftSetWorkArea(poisson_fft%plan3D_fw, poisson_fft%fft_worksize)
+    if (ierr /= 0) then
+      write (stderr, *), "cuFFT Error Code: ", ierr
+      error stop 'Forward 3D FFT plan generation failed'
+    end if
+
+    ierr = cufftCreate(poisson_fft%plan3D_bw)
+    ierr = cufftMakePlan3D(poisson_fft%plan3D_bw, nz, ny, nx, CUFFT_Z2D, &
+                           worksize)
+    ierr = cufftSetWorkArea(poisson_fft%plan3D_bw, poisson_fft%fft_worksize)
+    if (ierr /= 0) then
+      write (stderr, *), "cuFFT Error Code: ", ierr
+      error stop 'Backward 3D FFT plan generation failed'
+    end if
 
   end function init
 
@@ -114,47 +100,24 @@ subroutine fft_forward_cuda(self, f)
     class(field_t), intent(in) :: f
 
     real(dp), device, pointer, dimension(:, :, :) :: f_dev
-    complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, &
-                                                        c_z_ptr
+    real(dp), device, pointer :: f_ptr
+    type(c_ptr) :: f_c_ptr
 
     type(dim3) :: blocks, threads
-    integer :: ierrfft
+    integer :: ierr
 
     select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
 
-    ! First reorder f into cartesian-like data structure
-    blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
-    threads = dim3(SZ, SZ, 1)
-    call reshapeDSF<<<blocks, threads>>>(self%f_tmp, f_dev) !&
-
-    ! Forward FFT transform in z from real to complex
-    ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev)
-
-    ! Reorder from z to y
-    blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx)
-    threads = dim3(SZ, SZ, 1)
-    c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
-    c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev
-
-    call reorder_cmplx_z2y_T<<<blocks, threads>>>(c_y_ptr, c_z_ptr, & !&
-                                                  self%nx, self%nz/2 + 1)
+    ! Using f_dev directly in cufft call causes a segfault
+    ! Pointer switches below fixes the problem
+    f_c_ptr = c_loc(f_dev)
+    call c_f_pointer(f_c_ptr, f_ptr)
 
-    ! In-place forward FFT in y
-    ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, &
-                           CUFFT_FORWARD)
-
-    ! Reorder from y to x
-    blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1)
-    threads = dim3(SZ, SZ, 1)
-    c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
-    c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
-
-    call reorder_cmplx_y2x_T<<<blocks, threads>>>(c_x_ptr, c_y_ptr, & !&
-                                                  self%nz/2 + 1)
-
-    ! In-place forward FFT in x
-    ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, &
-                           CUFFT_FORWARD)
+    ierr = cufftExecD2Z(self%plan3D_fw, f_ptr, self%c_w_dev)
+    if (ierr /= 0) then
+      write (stderr, *), "cuFFT Error Code: ", ierr
+      error stop 'Forward 3D FFT execution failed'
+    end if
 
   end subroutine fft_forward_cuda
 
@@ -165,47 +128,24 @@ subroutine fft_backward_cuda(self, f)
     class(field_t), intent(inout) :: f
 
     real(dp), device, pointer, dimension(:, :, :) :: f_dev
-    complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, &
-                                                        c_z_ptr
+    real(dp), device, pointer :: f_ptr
+    type(c_ptr) :: f_c_ptr
 
     type(dim3) :: blocks, threads
-    integer :: ierrfft
+    integer :: ierr
 
     select type (f); type is (cuda_field_t); f_dev => f%data_d; end select
 
-    ! In-place backward FFT in x
-    ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, &
-                           CUFFT_INVERSE)
-
-    ! Reorder from x to y
-    blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1)
-    threads = dim3(SZ, SZ, 1)
-    c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
-    c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
+    ! Using f_dev directly in cufft call causes a segfault
+    ! Pointer switches below fixes the problem
+    f_c_ptr = c_loc(f_dev)
+    call c_f_pointer(f_c_ptr, f_ptr)
 
-    call reorder_cmplx_x2y_T<<<blocks, threads>>>(c_y_ptr, c_x_ptr, & !&
-                                                  self%nz/2 + 1)
-
-    ! In-place backward FFT in y
-    ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, &
-                           CUFFT_INVERSE)
-
-    ! Reorder from y to z
-    blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx)
-    threads = dim3(SZ, SZ, 1)
-    c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
-    c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev
-
-    call reorder_cmplx_y2z_T<<<blocks, threads>>>(c_z_ptr, c_y_ptr, & !&
-                                                  self%nx, self%nz/2 + 1)
-
-    ! Backward FFT transform in z from complex to real
-    ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp)
-
-    ! Finally reorder f back into our specialist data structure
-    blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
-    threads = dim3(SZ, SZ, 1)
-    call reshapeDSB<<<blocks, threads>>>(f_dev, self%f_tmp) !&
+    ierr = cufftExecZ2D(self%plan3D_bw, self%c_w_dev, f_ptr)
+    if (ierr /= 0) then
+      write (stderr, *), "cuFFT Error Code: ", ierr
+      error stop 'Backward 3D FFT execution failed'
+    end if
 
   end subroutine fft_backward_cuda
 
@@ -214,32 +154,23 @@ subroutine fft_postprocess_cuda(self)
 
     class(cuda_poisson_fft_t) :: self
 
-    complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr
+    complex(dp), device, dimension(:, :, :), pointer :: c_dev
     type(dim3) :: blocks, threads
+    integer :: tsize
 
-    ! Get some complex array pointers with right shape
-    c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
-    c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev
-
-    ! Reshape from cartesian-like to our specialist data structure
-    blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
-    threads = dim3(SZ, SZ, 1)
-    call reshapeCDSB<<<blocks, threads>>>(c_dev, c_x_ptr) !&
+    ! tsize is different than SZ, because here we work on a 3D Cartesian
+    ! data structure, and free to specify any suitable thread/block size.
+    tsize = 16
+    blocks = dim3((self%ny - 1)/tsize + 1, self%nz, 1)
+    threads = dim3(tsize, 1, 1)
 
-    ! Postprocess
-    blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1)
-    threads = dim3(SZ, 1, 1)
+    ! Postprocess div_u in spectral space
     call process_spectral_div_u<<<blocks, threads>>>( & !&
-      c_dev, self%waves_dev, self%nx, self%ny, self%nz, &
+      self%c_w_dev, self%waves_dev, self%nx, self%ny, self%nz, &
       self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, &
       self%az_dev, self%bz_dev &
       )
 
-    ! Reshape from our specialist data structure to cartesian-like
-    blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
-    threads = dim3(SZ, SZ, 1)
-    call reshapeCDSF<<<blocks, threads>>>(c_x_ptr, c_dev) !&
-
   end subroutine fft_postprocess_cuda
 
 end module m_cuda_poisson_fft
diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90
index 835cf75a..f25a67b6 100644
--- a/src/omp/poisson_fft.f90
+++ b/src/omp/poisson_fft.f90
@@ -4,8 +4,6 @@ module m_omp_poisson_fft
   use m_poisson_fft, only: poisson_fft_t
   use m_tdsops, only: dirps_t
 
-  use m_omp_common, only: SZ
-
   implicit none
 
   type, extends(poisson_fft_t) :: omp_poisson_fft_t
@@ -33,7 +31,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft)
 
     type(omp_poisson_fft_t) :: poisson_fft
 
-    call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ)
+    call poisson_fft%base_init(xdirps, ydirps, zdirps)
 
   end function init
 
diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90
index d95f0940..08342a03 100644
--- a/src/poisson_fft.f90
+++ b/src/poisson_fft.f90
@@ -6,8 +6,7 @@ module m_poisson_fft
   implicit none
 
   type, abstract :: poisson_fft_t
-      !! FFT based Poisson solver
-      !! It can only handle 1D decompositions along z direction.
+    !! FFT based Poisson solver
     integer :: nx, ny, nz
     complex(dp), allocatable, dimension(:, :, :) :: waves
     complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz
@@ -48,33 +47,34 @@ end subroutine fft_postprocess
 
 contains
 
-  subroutine base_init(self, xdirps, ydirps, zdirps, sz)
+  subroutine base_init(self, xdirps, ydirps, zdirps)
     implicit none
 
     class(poisson_fft_t) :: self
     class(dirps_t), intent(in) :: xdirps, ydirps, zdirps
-    integer, intent(in) :: sz
 
     self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n
 
     allocate (self%ax(self%nx), self%bx(self%nx))
-    allocate (self%ay(self%nx), self%by(self%nx))
-    allocate (self%az(self%nx), self%bz(self%nx))
+    allocate (self%ay(self%ny), self%by(self%ny))
+    allocate (self%az(self%nz), self%bz(self%nz))
 
-    allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz))
+    ! cuFFT 3D transform halves the first index.
+    allocate (self%waves(self%nx/2 + 1, self%ny, self%nz))
 
     ! waves_set requires some of the preprocessed tdsops variables.
-    call self%waves_set(xdirps, ydirps, zdirps, sz)
+    call self%waves_set(xdirps, ydirps, zdirps)
 
   end subroutine base_init
 
-  subroutine waves_set(self, xdirps, ydirps, zdirps, sz)
-      !! Ref. JCP 228 (2009), 5989–6015, Sec 4
+  subroutine waves_set(self, xdirps, ydirps, zdirps)
+    !! Spectral equivalence constants
+    !!
+    !! Ref. JCP 228 (2009), 5989–6015, Sec 4
     implicit none
 
     class(poisson_fft_t) :: self
     type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
-    integer, intent(in) :: sz
 
     complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, &
                                               exs, eys, ezs
@@ -83,7 +83,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz)
     real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1
     complex(dp) :: xt2, yt2, zt2, xyzk
 
-    integer :: i, j, ka, kb, ix, iy, iz
+    integer :: i, j, k
 
     nx = xdirps%n; ny = ydirps%n; nz = zdirps%n
 
@@ -153,42 +153,44 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz)
       ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L)
       zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2
     end do
+    do i = nz/2 + 2, nz
+      zkz(i) = zkz(nz - i + 2)
+      ezs(i) = ezs(nz - i + 2)
+      zk2(i) = zk2(nz - i + 2)
+    end do
 
     print *, 'waves array is correctly set only for a single rank run'
     ! TODO: do loop ranges below are valid only for single rank runs
-    do ka = 1, nz/2 + 1
-      do kb = 1, ny/sz
-        do j = 1, nx
-          do i = 1, sz
-            ix = j; iy = (kb - 1)*sz + i; iz = ka
-            rlexs = real(exs(ix), kind=dp)*xdirps%d
-            rleys = real(eys(iy), kind=dp)*ydirps%d
-            rlezs = real(ezs(iz), kind=dp)*zdirps%d
-
-            xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) &
-                     + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) &
-                     + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) &
-                     + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp))
-            ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) &
-                     + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) &
-                     + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) &
-                     + ydirps%interpl_v2p%d*cos(rleys*3.5_dp))
-            ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) &
-                     + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) &
-                     + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) &
-                     + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp))
-
-            xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs)
-            yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys)
-            zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs)
-
-            xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2)
-            yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2)
-            zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2)
-
-            xyzk = xt2 + yt2 + zt2
-            self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk
-          end do
+    do i = 1, nx/2 + 1
+      do j = 1, ny
+        do k = 1, nz
+          rlexs = real(exs(i), kind=dp)*xdirps%d
+          rleys = real(eys(j), kind=dp)*ydirps%d
+          rlezs = real(ezs(k), kind=dp)*zdirps%d
+
+          xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) &
+                   + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) &
+                   + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) &
+                   + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp))
+          ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) &
+                   + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) &
+                   + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) &
+                   + ydirps%interpl_v2p%d*cos(rleys*3.5_dp))
+          ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) &
+                   + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) &
+                   + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) &
+                   + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp))
+
+          xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs)
+          yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys)
+          zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs)
+
+          xt2 = xk2(i)*(((ytt/yt1)*(ztt/zt1))**2)
+          yt2 = yk2(j)*(((xtt/xt1)*(ztt/zt1))**2)
+          zt2 = zk2(k)*(((xtt/xt1)*(ytt/yt1))**2)
+
+          xyzk = xt2 + yt2 + zt2
+          self%waves(i, j, k) = xyzk
         end do
       end do
     end do
diff --git a/src/solver.f90 b/src/solver.f90
index d944d120..d9b298db 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -3,8 +3,9 @@ module m_solver
   use m_base_backend, only: base_backend_t
   use m_common, only: dp, globs_t, &
                       RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
+                      RDR_Z2C, RDR_C2Z, &
                       POISSON_SOLVER_FFT, POISSON_SOLVER_CG, &
-                      DIR_X, DIR_Y, DIR_Z
+                      DIR_X, DIR_Y, DIR_Z, DIR_C
   use m_tdsops, only: tdsops_t, dirps_t
   use m_time_integrator, only: time_intg_t
 
@@ -536,15 +537,26 @@ subroutine poisson_fft(self, pressure, div_u)
     class(field_t), intent(inout) :: pressure
     class(field_t), intent(in) :: div_u
 
+    class(field_t), pointer :: p_temp
+
+    ! reorder into 3D Cartesian data structure
+    p_temp => self%backend%allocator%get_block(DIR_C)
+    call self%backend%reorder(p_temp, div_u, RDR_Z2C)
+
     ! call forward FFT
     ! output array in spectral space is stored at poisson_fft class
-    call self%backend%poisson_fft%fft_forward(div_u)
+    call self%backend%poisson_fft%fft_forward(p_temp)
 
     ! postprocess
     call self%backend%poisson_fft%fft_postprocess
 
     ! call backward FFT
-    call self%backend%poisson_fft%fft_backward(pressure)
+    call self%backend%poisson_fft%fft_backward(p_temp)
+
+    ! reorder back to our specialist data structure from 3D Cartesian
+    call self%backend%reorder(pressure, p_temp, RDR_C2Z)
+
+    call self%backend%allocator%release_block(p_temp)
 
   end subroutine poisson_fft