diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cbddf720..00d75fee 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,8 +20,8 @@ set(CUDASRC cuda/common.f90 cuda/exec_dist.f90 cuda/kernels/distributed.f90 - cuda/kernels/complex.f90 cuda/kernels/reorder.f90 + cuda/kernels/spectral_processing.f90 cuda/poisson_fft.f90 cuda/sendrecv.f90 cuda/tdsops.f90 diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 deleted file mode 100644 index fcdfd1da..00000000 --- a/src/cuda/kernels/complex.f90 +++ /dev/null @@ -1,295 +0,0 @@ -module m_cuda_complex - use cudafor - - use m_common, only: dp - use m_cuda_common, only: SZ - - implicit none - -contains - - attributes(global) subroutine process_spectral_div_u( & - div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & - ) - implicit none - - ! Arguments - complex(dp), device, intent(inout), dimension(:, :, :) :: div - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - integer, value, intent(in) :: nx, ny, nz - - ! Local variables - integer :: i, j, b, ix, iy, iz - real(dp) :: tmp_r, tmp_c, div_r, div_c - - i = threadIdx%x - b = blockIdx%x - - do j = 1, nx - ! normalisation - div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) - div_c = aimag(div(i, j, b))/(nx*ny*nz) - - ! get the indices for x, y, z directions - ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 - - ! post-process forward - ! post-process in z - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bz(iz) + tmp_c*az(iz) - div_c = tmp_c*bz(iz) - tmp_r*az(iz) - - ! post-process in y - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*by(iy) + tmp_c*ay(iy) - div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if (iy > ny/2 + 1) div_r = -div_r - if (iy > ny/2 + 1) div_c = -div_c - - ! post-process in x - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bx(ix) + tmp_c*ax(ix) - div_c = tmp_c*bx(ix) - tmp_r*ax(ix) - if (ix > nx/2 + 1) div_r = -div_r - if (ix > nx/2 + 1) div_c = -div_c - - ! Solve Poisson - tmp_r = real(waves(i, j, b), kind=dp) - tmp_c = aimag(waves(i, j, b)) - if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then - div_r = 0._dp; div_c = 0._dp - else - div_r = -div_r/tmp_r - div_c = -div_c/tmp_c - end if - - ! post-process backward - ! post-process in z - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bz(iz) - tmp_c*az(iz) - div_c = -tmp_c*bz(iz) - tmp_r*az(iz) - - ! post-process in y - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*by(iy) + tmp_c*ay(iy) - div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if (iy > ny/2 + 1) div_r = -div_r - if (iy > ny/2 + 1) div_c = -div_c - - ! post-process in x - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bx(ix) + tmp_c*ax(ix) - div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) - if (ix > nx/2 + 1) div_r = -div_r - if (ix > nx/2 + 1) div_c = -div_c - - ! update the entry - div(i, j, b) = cmplx(div_r, div_c, kind=dp) - end do - - end subroutine process_spectral_div_u - - attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: u_y - complex(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz - - complex(dp), shared :: tile(SZ, SZ) - - integer :: i, j, b_i, b_j, b_k - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - - ! copy into shared - tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) - - call syncthreads() - - ! copy into output array from shared - u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) - - end subroutine reorder_cmplx_x2y_T - - attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: u_x - complex(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz - - complex(dp), shared :: tile(SZ, SZ) - - integer :: i, j, b_i, b_j, b_k - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - - ! copy into shared - tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) - - call syncthreads() - - ! copy into output array from shared - u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) - - end subroutine reorder_cmplx_y2x_T - - attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: u_z - complex(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz - - complex(dp), shared :: tile(SZ, SZ) - - integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z - - i = threadIdx%x - j = threadIdx%y - k = threadIdx%z - - b_x = blockIdx%z - b_y = blockIdx%y - b_z = blockIdx%x - - ! copy into shared - if (j + (b_z - 1)*SZ <= nz) & - tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & - j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) - - call syncthreads() - - ! copy into output array from shared - if (i + (b_z - 1)*SZ <= nz) & - u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) - - end subroutine reorder_cmplx_y2z_T - - attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: u_y - complex(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz - - complex(dp), shared :: tile(SZ, SZ) - - integer :: i, j, k, b_x, b_y, b_z - - i = threadIdx%x - j = threadIdx%y - k = threadIdx%z - - b_x = blockIdx%z - b_y = blockIdx%y - b_z = blockIdx%x - - ! copy into shared - if (i + (b_z - 1)*SZ <= nz) & - tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) - - call syncthreads() - - ! copy into output array from shared - if (j + (b_z - 1)*SZ <= nz) & - u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & - j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) - - end subroutine reorder_cmplx_z2y_T - - attributes(global) subroutine reshapeDSF(uout, uin) - implicit none - - real(dp), device, intent(out), dimension(:, :, :) :: uout - real(dp), device, intent(in), dimension(:, :, :) :: uin - - real(dp), shared :: tile(SZ + 1, SZ) - - integer :: i, j, b_i, b - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y - - tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) - - call syncthreads() - - uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) - - end subroutine reshapeDSF - - attributes(global) subroutine reshapeDSB(uout, uin) - implicit none - - real(dp), device, intent(out), dimension(:, :, :) :: uout - real(dp), device, intent(in), dimension(:, :, :) :: uin - - real(dp), shared :: tile(SZ + 1, SZ) - - integer :: i, j, b_i, b - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y - - tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) - - call syncthreads() - - uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) - - end subroutine reshapeDSB - - attributes(global) subroutine reshapeCDSF(uout, uin) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: uout - complex(dp), device, intent(in), dimension(:, :, :) :: uin - - complex(dp), shared :: tile(SZ + 1, SZ) - - integer :: i, j, b_i, b - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y - - tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) - - call syncthreads() - - uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) - - end subroutine reshapeCDSF - - attributes(global) subroutine reshapeCDSB(uout, uin) - implicit none - - complex(dp), device, intent(out), dimension(:, :, :) :: uout - complex(dp), device, intent(in), dimension(:, :, :) :: uin - - complex(dp), shared :: tile(SZ + 1, SZ) - - integer :: i, j, b_i, b - - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y - - tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) - - call syncthreads() - - uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) - - end subroutine reshapeCDSB - -end module m_cuda_complex diff --git a/src/cuda/kernels/spectral_processing.f90 b/src/cuda/kernels/spectral_processing.f90 new file mode 100644 index 00000000..76d717de --- /dev/null +++ b/src/cuda/kernels/spectral_processing.f90 @@ -0,0 +1,102 @@ +module m_cuda_spectral + use cudafor + + use m_common, only: dp + + implicit none + +contains + + attributes(global) subroutine process_spectral_div_u( & + div_u, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + !! Post-processes the divergence of velocity in spectral space, including + !! scaling w.r.t. grid size. + !! + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + implicit none + + !> Divergence of velocity in spectral space + complex(dp), device, intent(inout), dimension(:, :, :) :: div_u + !> Spectral equivalence constants + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + !> Grid size + integer, value, intent(in) :: nx, ny, nz + + integer :: i, j, k + real(dp) :: tmp_r, tmp_c, div_r, div_c + + j = threadIdx%x + (blockIdx%x - 1)*blockDim%x + k = blockIdx%y + + if (j <= ny) then + do i = 1, nx/2 + 1 + ! normalisation + div_r = real(div_u(i, j, k), kind=dp)/(nx*ny*nz) + div_c = aimag(div_u(i, j, k))/(nx*ny*nz) + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(k) + tmp_c*az(k) + div_c = tmp_c*bz(k) - tmp_r*az(k) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(j) + tmp_c*ay(j) + div_c = tmp_c*by(j) - tmp_r*ay(j) + if (j > ny/2 + 1) div_r = -div_r + if (j > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(i) + tmp_c*ax(i) + div_c = tmp_c*bx(i) - tmp_r*ax(i) + if (i > nx/2 + 1) div_r = -div_r + if (i > nx/2 + 1) div_c = -div_c + + ! Solve Poisson + tmp_r = real(waves(i, j, k), kind=dp) + tmp_c = aimag(waves(i, j, k)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c + end if + + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(k) - tmp_c*az(k) + div_c = -tmp_c*bz(k) - tmp_r*az(k) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(j) + tmp_c*ay(j) + div_c = tmp_c*by(j) - tmp_r*ay(j) + if (j > ny/2 + 1) div_r = -div_r + if (j > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(i) + tmp_c*ax(i) + div_c = -tmp_c*bx(i) + tmp_r*ax(i) + if (i > nx/2 + 1) div_r = -div_r + if (i > nx/2 + 1) div_c = -div_c + + ! update the entry + div_u(i, j, k) = cmplx(div_r, div_c, kind=dp) + end do + end if + + end subroutine process_spectral_div_u + +end module m_cuda_spectral diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 015e9150..0b27b537 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -1,4 +1,6 @@ module m_cuda_poisson_fft + use iso_c_binding, only: c_loc, c_ptr, c_f_pointer + use iso_fortran_env, only: stderr => error_unit use cudafor use cufft @@ -8,29 +10,25 @@ module m_cuda_poisson_fft use m_tdsops, only: dirps_t use m_cuda_allocator, only: cuda_field_t - use m_cuda_common, only: SZ - use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & - reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & - process_spectral_div_u + use m_cuda_spectral, only: process_spectral_div_u implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t - !! FFT based Poisson solver - !! It can only handle 1D decompositions along z direction. + !! FFT based Poisson solver - !> Local domain sized arrays to store data in spectral space - complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + !> Local domain sized array to store data in spectral space + complex(dp), device, allocatable, dimension(:, :, :) :: c_w_dev !> Local domain sized array storing the spectral equivalence constants complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev + !> cufft requires a local domain sized storage + complex(dp), device, allocatable, dimension(:) :: fft_worksize real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & ay_dev, by_dev, & az_dev, bz_dev - real(dp), device, allocatable, dimension(:, :, :) :: f_tmp - - integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy + integer :: plan3D_fw, plan3D_bw contains procedure :: fft_forward => fft_forward_cuda procedure :: fft_backward => fft_backward_cuda @@ -54,14 +52,14 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) integer :: nx, ny, nz - integer :: ierrfft + integer :: ierr integer(int_ptr_kind()) :: worksize - call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + call poisson_fft%base_init(xdirps, ydirps, zdirps) nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz - allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ)) + allocate (poisson_fft%waves_dev(nx/2 + 1, ny, nz)) poisson_fft%waves_dev = poisson_fft%waves allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) @@ -71,39 +69,27 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz - allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1))) - allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) - allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) - - ! We can't currently ask allocator to pass us an array with - ! exact shape we want, so we allocate an extra array here. - ! This will be removed when allocator is fixed. - allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ)) - - ! set cufft plans - ierrfft = cufftCreate(poisson_fft%planD2Zz) - ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & - nz, 1, nz, nz/2 + 1, 1, nz/2 + 1, & - CUFFT_D2Z, nx*ny, worksize) - ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) - - ierrfft = cufftCreate(poisson_fft%planZ2Dz) - ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & - nz/2 + 1, 1, nz/2 + 1, nz, 1, nz, & - CUFFT_Z2D, nx*ny, worksize) - ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) - - ierrfft = cufftCreate(poisson_fft%planZ2Zy) - ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & - ny, 1, ny, ny, 1, ny, & - CUFFT_Z2Z, nx*(nz/2 + 1), worksize) - ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) - - ierrfft = cufftCreate(poisson_fft%planZ2Zx) - ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & - nx, 1, nx, nx, 1, nx, & - CUFFT_Z2Z, ny*(nz/2 + 1), worksize) - ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) + allocate (poisson_fft%c_w_dev(nx/2 + 1, ny, nz)) + allocate (poisson_fft%fft_worksize((nx/2 + 1)*ny*nz)) + + ! 3D plans + ierr = cufftCreate(poisson_fft%plan3D_fw) + ierr = cufftMakePlan3D(poisson_fft%plan3D_fw, nz, ny, nx, CUFFT_D2Z, & + worksize) + ierr = cufftSetWorkArea(poisson_fft%plan3D_fw, poisson_fft%fft_worksize) + if (ierr /= 0) then + write (stderr, *), "cuFFT Error Code: ", ierr + error stop 'Forward 3D FFT plan generation failed' + end if + + ierr = cufftCreate(poisson_fft%plan3D_bw) + ierr = cufftMakePlan3D(poisson_fft%plan3D_bw, nz, ny, nx, CUFFT_Z2D, & + worksize) + ierr = cufftSetWorkArea(poisson_fft%plan3D_bw, poisson_fft%fft_worksize) + if (ierr /= 0) then + write (stderr, *), "cuFFT Error Code: ", ierr + error stop 'Backward 3D FFT plan generation failed' + end if end function init @@ -114,47 +100,24 @@ subroutine fft_forward_cuda(self, f) class(field_t), intent(in) :: f real(dp), device, pointer, dimension(:, :, :) :: f_dev - complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & - c_z_ptr + real(dp), device, pointer :: f_ptr + type(c_ptr) :: f_c_ptr type(dim3) :: blocks, threads - integer :: ierrfft + integer :: ierr select type (f); type is (cuda_field_t); f_dev => f%data_d; end select - ! First reorder f into cartesian-like data structure - blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) - threads = dim3(SZ, SZ, 1) - call reshapeDSF<<>>(self%f_tmp, f_dev) !& - - ! Forward FFT transform in z from real to complex - ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev) - - ! Reorder from z to y - blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) - threads = dim3(SZ, SZ, 1) - c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev - c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev - - call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & !& - self%nx, self%nz/2 + 1) + ! Using f_dev directly in cufft call causes a segfault + ! Pointer switches below fixes the problem + f_c_ptr = c_loc(f_dev) + call c_f_pointer(f_c_ptr, f_ptr) - ! In-place forward FFT in y - ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & - CUFFT_FORWARD) - - ! Reorder from y to x - blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) - threads = dim3(SZ, SZ, 1) - c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev - c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev - - call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & !& - self%nz/2 + 1) - - ! In-place forward FFT in x - ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & - CUFFT_FORWARD) + ierr = cufftExecD2Z(self%plan3D_fw, f_ptr, self%c_w_dev) + if (ierr /= 0) then + write (stderr, *), "cuFFT Error Code: ", ierr + error stop 'Forward 3D FFT execution failed' + end if end subroutine fft_forward_cuda @@ -165,47 +128,24 @@ subroutine fft_backward_cuda(self, f) class(field_t), intent(inout) :: f real(dp), device, pointer, dimension(:, :, :) :: f_dev - complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & - c_z_ptr + real(dp), device, pointer :: f_ptr + type(c_ptr) :: f_c_ptr type(dim3) :: blocks, threads - integer :: ierrfft + integer :: ierr select type (f); type is (cuda_field_t); f_dev => f%data_d; end select - ! In-place backward FFT in x - ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & - CUFFT_INVERSE) - - ! Reorder from x to y - blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) - threads = dim3(SZ, SZ, 1) - c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev - c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + ! Using f_dev directly in cufft call causes a segfault + ! Pointer switches below fixes the problem + f_c_ptr = c_loc(f_dev) + call c_f_pointer(f_c_ptr, f_ptr) - call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & !& - self%nz/2 + 1) - - ! In-place backward FFT in y - ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & - CUFFT_INVERSE) - - ! Reorder from y to z - blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) - threads = dim3(SZ, SZ, 1) - c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev - c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev - - call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & !& - self%nx, self%nz/2 + 1) - - ! Backward FFT transform in z from complex to real - ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) - - ! Finally reorder f back into our specialist data structure - blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) - threads = dim3(SZ, SZ, 1) - call reshapeDSB<<>>(f_dev, self%f_tmp) !& + ierr = cufftExecZ2D(self%plan3D_bw, self%c_w_dev, f_ptr) + if (ierr /= 0) then + write (stderr, *), "cuFFT Error Code: ", ierr + error stop 'Backward 3D FFT execution failed' + end if end subroutine fft_backward_cuda @@ -214,32 +154,23 @@ subroutine fft_postprocess_cuda(self) class(cuda_poisson_fft_t) :: self - complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr + complex(dp), device, dimension(:, :, :), pointer :: c_dev type(dim3) :: blocks, threads + integer :: tsize - ! Get some complex array pointers with right shape - c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev - c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev - - ! Reshape from cartesian-like to our specialist data structure - blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) - threads = dim3(SZ, SZ, 1) - call reshapeCDSB<<>>(c_dev, c_x_ptr) !& + ! tsize is different than SZ, because here we work on a 3D Cartesian + ! data structure, and free to specify any suitable thread/block size. + tsize = 16 + blocks = dim3((self%ny - 1)/tsize + 1, self%nz, 1) + threads = dim3(tsize, 1, 1) - ! Postprocess - blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) - threads = dim3(SZ, 1, 1) + ! Postprocess div_u in spectral space call process_spectral_div_u<<>>( & !& - c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%c_w_dev, self%waves_dev, self%nx, self%ny, self%nz, & self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & self%az_dev, self%bz_dev & ) - ! Reshape from our specialist data structure to cartesian-like - blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) - threads = dim3(SZ, SZ, 1) - call reshapeCDSF<<>>(c_x_ptr, c_dev) !& - end subroutine fft_postprocess_cuda end module m_cuda_poisson_fft diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index 835cf75a..f25a67b6 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -4,8 +4,6 @@ module m_omp_poisson_fft use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t - use m_omp_common, only: SZ - implicit none type, extends(poisson_fft_t) :: omp_poisson_fft_t @@ -33,7 +31,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) type(omp_poisson_fft_t) :: poisson_fft - call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + call poisson_fft%base_init(xdirps, ydirps, zdirps) end function init diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index d95f0940..08342a03 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -6,8 +6,7 @@ module m_poisson_fft implicit none type, abstract :: poisson_fft_t - !! FFT based Poisson solver - !! It can only handle 1D decompositions along z direction. + !! FFT based Poisson solver integer :: nx, ny, nz complex(dp), allocatable, dimension(:, :, :) :: waves complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz @@ -48,33 +47,34 @@ end subroutine fft_postprocess contains - subroutine base_init(self, xdirps, ydirps, zdirps, sz) + subroutine base_init(self, xdirps, ydirps, zdirps) implicit none class(poisson_fft_t) :: self class(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, intent(in) :: sz self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n allocate (self%ax(self%nx), self%bx(self%nx)) - allocate (self%ay(self%nx), self%by(self%nx)) - allocate (self%az(self%nx), self%bz(self%nx)) + allocate (self%ay(self%ny), self%by(self%ny)) + allocate (self%az(self%nz), self%bz(self%nz)) - allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) + ! cuFFT 3D transform halves the first index. + allocate (self%waves(self%nx/2 + 1, self%ny, self%nz)) ! waves_set requires some of the preprocessed tdsops variables. - call self%waves_set(xdirps, ydirps, zdirps, sz) + call self%waves_set(xdirps, ydirps, zdirps) end subroutine base_init - subroutine waves_set(self, xdirps, ydirps, zdirps, sz) - !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + subroutine waves_set(self, xdirps, ydirps, zdirps) + !! Spectral equivalence constants + !! + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 implicit none class(poisson_fft_t) :: self type(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, intent(in) :: sz complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & exs, eys, ezs @@ -83,7 +83,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 complex(dp) :: xt2, yt2, zt2, xyzk - integer :: i, j, ka, kb, ix, iy, iz + integer :: i, j, k nx = xdirps%n; ny = ydirps%n; nz = zdirps%n @@ -153,42 +153,44 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 end do + do i = nz/2 + 2, nz + zkz(i) = zkz(nz - i + 2) + ezs(i) = ezs(nz - i + 2) + zk2(i) = zk2(nz - i + 2) + end do print *, 'waves array is correctly set only for a single rank run' ! TODO: do loop ranges below are valid only for single rank runs - do ka = 1, nz/2 + 1 - do kb = 1, ny/sz - do j = 1, nx - do i = 1, sz - ix = j; iy = (kb - 1)*sz + i; iz = ka - rlexs = real(exs(ix), kind=dp)*xdirps%d - rleys = real(eys(iy), kind=dp)*ydirps%d - rlezs = real(ezs(iz), kind=dp)*zdirps%d - - xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & - + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & - + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & - + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) - ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & - + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & - + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & - + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) - ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & - + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & - + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & - + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) - - xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) - yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) - zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) - - xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) - yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) - zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) - - xyzk = xt2 + yt2 + zt2 - self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk - end do + do i = 1, nx/2 + 1 + do j = 1, ny + do k = 1, nz + rlexs = real(exs(i), kind=dp)*xdirps%d + rleys = real(eys(j), kind=dp)*ydirps%d + rlezs = real(ezs(k), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(i)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(j)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(k)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, k) = xyzk end do end do end do diff --git a/src/solver.f90 b/src/solver.f90 index d944d120..d9b298db 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -3,8 +3,9 @@ module m_solver use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + RDR_Z2C, RDR_C2Z, & POISSON_SOLVER_FFT, POISSON_SOLVER_CG, & - DIR_X, DIR_Y, DIR_Z + DIR_X, DIR_Y, DIR_Z, DIR_C use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -536,15 +537,26 @@ subroutine poisson_fft(self, pressure, div_u) class(field_t), intent(inout) :: pressure class(field_t), intent(in) :: div_u + class(field_t), pointer :: p_temp + + ! reorder into 3D Cartesian data structure + p_temp => self%backend%allocator%get_block(DIR_C) + call self%backend%reorder(p_temp, div_u, RDR_Z2C) + ! call forward FFT ! output array in spectral space is stored at poisson_fft class - call self%backend%poisson_fft%fft_forward(div_u) + call self%backend%poisson_fft%fft_forward(p_temp) ! postprocess call self%backend%poisson_fft%fft_postprocess ! call backward FFT - call self%backend%poisson_fft%fft_backward(pressure) + call self%backend%poisson_fft%fft_backward(p_temp) + + ! reorder back to our specialist data structure from 3D Cartesian + call self%backend%reorder(pressure, p_temp, RDR_C2Z) + + call self%backend%allocator%release_block(p_temp) end subroutine poisson_fft