diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8caf9940..0290b35d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,12 +2,14 @@ set(SRC allocator.f90 backend.f90 common.f90 + poisson_fft.f90 solver.f90 tdsops.f90 time_integrator.f90 omp/backend.f90 omp/common.f90 omp/kernels/distributed.f90 + omp/poisson_fft.f90 omp/sendrecv.f90 omp/exec_dist.f90 ) @@ -17,7 +19,9 @@ set(CUDASRC cuda/common.f90 cuda/exec_dist.f90 cuda/kernels/distributed.f90 + cuda/kernels/complex.f90 cuda/kernels/reorder.f90 + cuda/poisson_fft.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) @@ -40,6 +44,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp") set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast") target_link_options(x3d2 INTERFACE "-cuda") + target_link_options(x3d2 INTERFACE "-lcufft") target_compile_options(xcompact PRIVATE "-DCUDA") # target_link_options(xcompact INTERFACE "-cuda") diff --git a/src/backend.f90 b/src/backend.f90 index a3958859..e17970e5 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,6 +1,7 @@ module m_base_backend use m_allocator, only: allocator_t, field_t use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: tdsops_t, dirps_t implicit none @@ -24,6 +25,7 @@ module m_base_backend integer :: nx_loc, ny_loc, nz_loc class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps + class(poisson_fft_t), pointer :: poisson_fft contains procedure(transeq_ders), deferred :: transeq_x procedure(transeq_ders), deferred :: transeq_y @@ -33,9 +35,11 @@ module m_base_backend procedure(sum_intox), deferred :: sum_yintox procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd - procedure(get_fields), deferred :: get_fields - procedure(set_fields), deferred :: set_fields + procedure(scalar_product), deferred :: scalar_product + procedure(get_field), deferred :: get_field + procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops + procedure(init_poisson_fft), deferred :: init_poisson_fft end type base_backend_t abstract interface @@ -126,7 +130,20 @@ end subroutine vecadd end interface abstract interface - subroutine get_fields(self, u_out, v_out, w_out, u, v, w) + real(dp) function scalar_product(self, x, y) result(s) + !! Calculates the scalar product of two input fields + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(in) :: x, y + end function scalar_product + end interface + + abstract interface + subroutine get_field(self, arr, f) !! copy the specialist data structure from device or host back !! to a regular 3D data structure. import :: base_backend_t @@ -135,13 +152,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w) implicit none class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w - end subroutine get_fields + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + end subroutine get_field - subroutine set_fields(self, u, v, w, u_in, v_in, w_in) + subroutine set_field(self, f, arr) !! copy the initial condition stored in a regular 3D data - !! structure into the specialist data structure arrays on the + !! structure into the specialist data structure array on the !! device or host. 
import :: base_backend_t import :: dp @@ -149,9 +166,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in - end subroutine set_fields + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr + end subroutine set_field end interface abstract interface @@ -174,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, & end subroutine alloc_tdsops end interface + abstract interface + subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) + import :: base_backend_t + import :: dirps_t + implicit none + + class(base_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + end subroutine init_poisson_fft + end interface + end module m_base_backend diff --git a/src/common.f90 b/src/common.f90 index 2dbd9bd5..beac1f5f 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -5,8 +5,9 @@ module m_common real(dp), parameter :: pi = 4*atan(1.0_dp) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & - RDR_Y2Z = 23, RDR_Z2Y = 32 + RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32 integer, parameter :: dir_X = 1, dir_Y = 2, dir_Z = 3 + integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 type :: globs_t integer :: nx, ny, nz @@ -14,8 +15,11 @@ module m_common integer :: n_groups_x, n_groups_y, n_groups_z real(dp) :: Lx, Ly, Lz real(dp) :: dx, dy, dz + real(dp) :: nu, dt + integer :: n_iters, n_output integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e + integer :: poisson_solver_type end type globs_t contains diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 8ab91370..a5322ff6 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,21 +1,25 @@ module m_cuda_backend use iso_fortran_env, only: stderr => error_unit use cudafor + use mpi use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t use m_cuda_common, only: SZ use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact + use m_cuda_poisson_fft, only: cuda_poisson_fft_t use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs use m_cuda_kernels_reorder, only: & - reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2y, & - sum_yintox, sum_zintox, axpby, buffer_copy + reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & + reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy implicit none @@ -40,8 +44,10 @@ module m_cuda_backend procedure :: sum_yintox => sum_yintox_cuda procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda - procedure :: set_fields => set_fields_cuda - procedure :: get_fields => get_fields_cuda + procedure :: scalar_product => scalar_product_cuda + procedure :: set_field => set_field_cuda + procedure :: get_field => get_field_cuda + procedure :: init_poisson_fft => init_cuda_poisson_fft procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist 
@@ -60,6 +66,7 @@ function init(globs, allocator) result(backend) class(allocator_t), target, intent(inout) :: allocator type(cuda_backend_t) :: backend + type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_halo, n_block select type(allocator) @@ -452,6 +459,10 @@ subroutine reorder_cuda(self, u_o, u_i, direction) threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) + case (RDR_Z2X) ! z2x + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_o_d, u_i_d, self%nz_loc) case (RDR_Z2Y) ! z2y blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) @@ -524,6 +535,35 @@ subroutine vecadd_cuda(self, a, x, b, y) end subroutine vecadd_cuda + real(dp) function scalar_product_cuda(self, x, y) result(s) + implicit none + + class(cuda_backend_t) :: self + class(field_t), intent(in) :: x, y + + real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d + real(dp), device, allocatable :: sum_d + type(dim3) :: blocks, threads + integer :: n, ierr + + select type(x); type is (cuda_field_t); x_d => x%data_d; end select + select type(y); type is (cuda_field_t); y_d => y%data_d; end select + + allocate (sum_d) + sum_d = 0._dp + + n = size(x_d, dim = 2) + blocks = dim3(size(x_d, dim = 3), 1, 1) + threads = dim3(SZ, 1, 1) + call scalar_product<<>>(sum_d, x_d, y_d, n) + + s = sum_d + + call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & + MPI_COMM_WORLD, ierr) + + end function scalar_product_cuda + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none @@ -542,31 +582,42 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers - subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_cuda(self, f, arr) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - select type(u); type is (cuda_field_t); u%data_d = u_in; end select - select type(v); type is (cuda_field_t); v%data_d = v_in; end select - select type(w); type is (cuda_field_t); w%data_d = w_in; end select + select type(f); type is (cuda_field_t); f%data_d = arr; end select - end subroutine set_fields_cuda + end subroutine set_field_cuda - subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_cuda(self, arr, f) implicit none class(cuda_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + + select type(f); type is (cuda_field_t); arr = f%data_d; end select - select type(u); type is (cuda_field_t); u_out = u%data_d; end select - select type(v); type is (cuda_field_t); v_out = v%data_d; end select - select type(w); type is (cuda_field_t); w_out = w%data_d; end select + end subroutine get_field_cuda + + subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(cuda_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(cuda_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + end select - end subroutine get_fields_cuda + end subroutine init_cuda_poisson_fft end module m_cuda_backend 
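Note on the reduction above: scalar_product_cuda composes three stages, per-thread pencil sums inside the scalar_product kernel, a device-wide atomicadd, then an MPI_Allreduce across ranks. The net result is a plain global dot product over the (SZ, n, n_blocks) pencil layout. A minimal serial sketch of that semantics follows; the program, its name, and the sizes are illustrative only and not part of the patch. program dot_product_ref implicit none integer, parameter :: dp = kind(0.0d0) integer, parameter :: SZ = 4, n = 8, n_blocks = 2 real(dp) :: x(SZ, n, n_blocks), y(SZ, n, n_blocks), s integer :: i, j, b call random_number(x); call random_number(y) ! each (i, b) pair below corresponds to one GPU thread's pencil sum; ! the outer accumulation plays the role of atomicadd + MPI_Allreduce s = 0._dp do b = 1, n_blocks do i = 1, SZ do j = 1, n s = s + x(i, j, b)*y(i, j, b) end do end do end do print *, 'pencil-wise sum:', s, ' intrinsic check:', sum(x*y) end program dot_product_ref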
diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 new file mode 100644 index 00000000..32c5041f --- /dev/null +++ b/src/cuda/kernels/complex.f90 @@ -0,0 +1,295 @@ +module m_cuda_complex + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + + implicit none + +contains + + attributes(global) subroutine process_spectral_div_u( & + div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + implicit none + + ! Arguments + complex(dp), device, intent(inout), dimension(:, :, :) :: div + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + integer, value, intent(in) :: nx, ny, nz + + ! Local variables + integer :: i, j, b, ix, iy, iz + real(dp) :: tmp_r, tmp_c, div_r, div_c + + i = threadIdx%x + b = blockIdx%x + + do j = 1, nx + ! normalisation + div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) + div_c = aimag(div(i, j, b))/(nx*ny*nz) + + ! get the indices for x, y, z directions + ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) + tmp_c*az(iz) + div_c = tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = tmp_c*bx(ix) - tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! Solve Poisson + tmp_r = real(waves(i, j, b), kind=dp) + tmp_c = aimag(waves(i, j, b)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c + end if + + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) - tmp_c*az(iz) + div_c = -tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! update the entry + div(i, j, b) = cmplx(div_r, div_c, kind=dp) + end do + + end subroutine process_spectral_div_u + + attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) + + call syncthreads() + + ! 
copy into output array from shared + u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_x2y_T + + attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_x + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) + + call syncthreads() + + ! copy into output array from shared + u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_y2x_T + + attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_z + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( j + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) + + call syncthreads() + + ! copy into output array from shared + if ( i + (b_z - 1)*SZ <= nz ) & + u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) + + end subroutine reorder_cmplx_y2z_T + + attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( i + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) + + call syncthreads() + + ! 
copy into output array from shared + if ( j + (b_z - 1)*SZ <= nz ) & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) + + end subroutine reorder_cmplx_z2y_T + + attributes(global) subroutine reshapeDSF(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeDSF + + attributes(global) subroutine reshapeDSB(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeDSB + + attributes(global) subroutine reshapeCDSF(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeCDSF + + attributes(global) subroutine reshapeCDSB(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeCDSB + +end module m_cuda_complex diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 8ee984d0..11c74cac 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -95,6 +95,24 @@ attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) end subroutine reorder_y2z + attributes(global) subroutine reorder_z2x(u_x, u_z, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_x + real(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nz + + integer :: i, j, b_i, b_j, nx + + i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y + nx = gridDim%x + + do j = 1, nz + u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx) + end do + + end subroutine reorder_z2x + attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) implicit none @@ -181,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby + attributes(global) subroutine scalar_product(s, x, y, n) + implicit none + + real(dp), device, intent(inout) :: s + real(dp), device, intent(in), dimension(:, :, :) :: x, y + integer, value, intent(in) :: n + + real(dp) :: s_pncl !! 
pencil sum + integer :: i, j, b, ierr + + i = threadIdx%x + b = blockIdx%x + + s_pncl = 0._dp + do j = 1, n + s_pncl = s_pncl + x(i, j, b)*y(i, j, b) + end do + ierr = atomicadd(s, s_pncl) + + end subroutine scalar_product + attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) implicit none diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 new file mode 100644 index 00000000..3df787a2 --- /dev/null +++ b/src/cuda/poisson_fft.f90 @@ -0,0 +1,244 @@ +module m_cuda_poisson_fft + use cudafor + use cufft + + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_cuda_allocator, only: cuda_field_t + use m_cuda_common, only: SZ + use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & + reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & + process_spectral_div_u + + implicit none + + type, extends(poisson_fft_t) :: cuda_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + + !> Local domain sized arrays to store data in spectral space + complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + !> Local domain sized array storing the spectral equivalence constants + complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev + + real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & + ay_dev, by_dev, az_dev, bz_dev + + real(dp), device, allocatable, dimension(:, :, :) :: f_tmp + + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy + contains + procedure :: fft_forward => fft_forward_cuda + procedure :: fft_backward => fft_backward_cuda + procedure :: fft_postprocess => fft_postprocess_cuda + end type cuda_poisson_fft_t + + interface cuda_poisson_fft_t + module procedure init + end interface cuda_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(cuda_poisson_fft_t) :: poisson_fft + + integer :: nx, ny, nz + + integer :: ierrfft + integer(int_ptr_kind()) :: worksize + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz + + allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ)) + poisson_fft%waves_dev = poisson_fft%waves + + allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) + allocate (poisson_fft%ay_dev(ny), poisson_fft%by_dev(ny)) + allocate (poisson_fft%az_dev(nz), poisson_fft%bz_dev(nz)) + poisson_fft%ax_dev = poisson_fft%ax; poisson_fft%bx_dev = poisson_fft%bx + poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by + poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz + + allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) + + ! We can't currently ask allocator to pass us an array with + ! exact shape we want, so we allocate an extra array here. + ! This will be removed when allocator is fixed. + allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ)) + + ! 
set cufft plans + ierrfft = cufftCreate(poisson_fft%planD2Zz) + ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & + nz, 1, nz, nz/2+1, 1, nz/2+1, & + CUFFT_D2Z, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Dz) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & + nz/2+1, 1, nz/2+1, nz, 1, nz, & + CUFFT_Z2D, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zy) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & + ny, 1, ny, ny, 1, ny, & + CUFFT_Z2Z, nx*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zx) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & + nx, 1, nx, nx, 1, nx, & + CUFFT_Z2Z, ny*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) + + end function init + + subroutine fft_forward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(in) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! First reorder f into cartesian-like data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSF<<>>(self%f_tmp, f_dev) + + ! Forward FFT transform in z from real to complex + ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev) + + ! Reorder from z to y + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & + self%nx, self%nz/2 + 1) + + ! In-place forward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_FORWARD) + + ! Reorder from y to x + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & + self%nz/2 + 1) + + ! In-place forward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_FORWARD) + + end subroutine fft_forward_cuda + + subroutine fft_backward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(inout) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! In-place backward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_INVERSE) + + ! Reorder from x to y + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & + self%nz/2 + 1) + + ! 
In-place backward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_INVERSE) + + ! Reorder from y to z + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & + self%nx, self%nz/2 + 1) + + ! Backward FFT transform in z from complex to real + ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) + + ! Finally reorder f back into our specialist data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSB<<>>(f_dev, self%f_tmp) + + end subroutine fft_backward_cuda + + subroutine fft_postprocess_cuda(self) + implicit none + + class(cuda_poisson_fft_t) :: self + + complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr + type(dim3) :: blocks, threads + + ! Get some complex array pointers with right shape + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev + + ! Reshape from cartesian-like to our specialist data structure + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSB<<>>(c_dev, c_x_ptr) + + ! Postprocess + blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) + threads = dim3(SZ, 1, 1) + call process_spectral_div_u<<>>( & + c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & + self%az_dev, self%bz_dev & + ) + + ! Reshape from our specialist data structure to cartesian-like + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSF<<>>(c_x_ptr, c_dev) + + end subroutine fft_postprocess_cuda + +end module m_cuda_poisson_fft diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 813ca82f..1488f705 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -7,6 +7,7 @@ module m_omp_backend use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ + use m_omp_poisson_fft, only: omp_poisson_fft_t implicit none @@ -30,8 +31,10 @@ module m_omp_backend procedure :: sum_yintox => sum_yintox_omp procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp - procedure :: set_fields => set_fields_omp - procedure :: get_fields => get_fields_omp + procedure :: scalar_product => scalar_product_omp + procedure :: set_field => set_field_omp + procedure :: get_field => get_field_omp + procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist end type omp_backend_t @@ -313,6 +316,16 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp + real(dp) function scalar_product_omp(self, x, y) result(s) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(in) :: x, y + + s = 0._dp + + end function scalar_product_omp + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none @@ -338,31 +351,42 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) end subroutine copy_into_buffers - subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_omp(self, f, arr) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr 
- u%data = u_in - v%data = v_in - w%data = w_in + f%data = arr - end subroutine set_fields_omp + end subroutine set_field_omp - subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_omp(self, arr, f) implicit none class(omp_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + + arr = f%data - u_out = u%data - v_out = v%data - w_out = w%data + end subroutine get_field_omp + + subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(omp_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(omp_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + end select - end subroutine get_fields_omp + end subroutine init_omp_poisson_fft end module m_omp_backend diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 new file mode 100644 index 00000000..a796f1fc --- /dev/null +++ b/src/omp/poisson_fft.f90 @@ -0,0 +1,60 @@ +module m_omp_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_omp_common, only: SZ + + implicit none + + type, extends(poisson_fft_t) :: omp_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + contains + procedure :: fft_forward => fft_forward_omp + procedure :: fft_backward => fft_backward_omp + procedure :: fft_postprocess => fft_postprocess_omp + end type omp_poisson_fft_t + + interface omp_poisson_fft_t + module procedure init + end interface omp_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(omp_poisson_fft_t) :: poisson_fft + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + end function init + + subroutine fft_forward_omp(self, f_in) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward_omp + + subroutine fft_backward_omp(self, f_out) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward_omp + + subroutine fft_postprocess_omp(self) + implicit none + + class(omp_poisson_fft_t) :: self + end subroutine fft_postprocess_omp + +end module m_omp_poisson_fft diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 new file mode 100644 index 00000000..9ee0f575 --- /dev/null +++ b/src/poisson_fft.f90 @@ -0,0 +1,198 @@ +module m_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp, pi + use m_tdsops, only: dirps_t + + implicit none + + type, abstract :: poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. 
+ integer :: nx, ny, nz + complex(dp), allocatable, dimension(:, :, :) :: waves + real(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz + contains + procedure(fft_forward), deferred :: fft_forward + procedure(fft_backward), deferred :: fft_backward + procedure(fft_postprocess), deferred :: fft_postprocess + procedure :: base_init + procedure :: waves_set + end type poisson_fft_t + + abstract interface + subroutine fft_forward(self, f_in) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward + + subroutine fft_backward(self, f_out) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward + + subroutine fft_postprocess(self) + import :: poisson_fft_t + implicit none + + class(poisson_fft_t) :: self + end subroutine fft_postprocess + end interface + +contains + + subroutine base_init(self, xdirps, ydirps, zdirps, sz) + implicit none + + class(poisson_fft_t) :: self + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n + + allocate (self%ax(self%nx), self%bx(self%nx)) + allocate (self%ay(self%ny), self%by(self%ny)) + allocate (self%az(self%nz), self%bz(self%nz)) + + allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) + + ! waves_set requires some of the preprocessed tdsops variables. + call self%waves_set(xdirps, ydirps, zdirps, sz) + + end subroutine base_init + + subroutine waves_set(self, xdirps, ydirps, zdirps, sz) + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + implicit none + + class(poisson_fft_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & + exs, eys, ezs + + integer :: nx, ny, nz + real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 + complex(dp) :: xt2, yt2, zt2, xyzk + + integer :: i, j, ka, kb, ix, iy, iz + + nx = xdirps%n; ny = ydirps%n; nz = zdirps%n + + do i = 1, nx + self%ax(i) = sin((i-1)*pi/nx) + self%bx(i) = cos((i-1)*pi/nx) + end do + + do i = 1, ny + self%ay(i) = sin((i-1)*pi/ny) + self%by(i) = cos((i-1)*pi/ny) + end do + + do i = 1, nz + self%az(i) = sin((i-1)*pi/nz) + self%bz(i) = cos((i-1)*pi/nz) + end do + + ! Now kxyz + allocate(xkx(nx), xk2(nx), exs(nx)) + allocate(yky(ny), yk2(ny), eys(ny)) + allocate(zkz(nz), zk2(nz), ezs(nz)) + xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + + ! periodic-x + do i = 1, nx/2 + 1 + w = 2*pi*(i - 1)/nx + wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & + + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) + + xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) + exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) + xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 + end do + do i = nx/2 + 2, nx + xkx(i) = xkx(nx - i + 2) + exs(i) = exs(nx - i + 2) + xk2(i) = xk2(nx - i + 2) + end do + + !
periodic-y + do i = 1, ny/2 + 1 + w = 2*pi*(i - 1)/ny + wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & + + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) + + yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) + eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) + yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 + end do + do i = ny/2 + 2, ny + yky(i) = yky(ny-i+2) + eys(i) = eys(ny-i+2) + yk2(i) = yk2(ny-i+2) + end do + + ! periodic-z + do i = 1, nz/2 + 1 + w = 2*pi*(i - 1)/nz + wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & + + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) + + zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) + ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) + zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 + end do + + print*, 'waves array is correctly set only for a single rank run' + ! TODO: do loop ranges below are valid only for single rank runs + do ka = 1, nz/2 + 1 + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ix = j; iy = (kb - 1)*sz + i; iz = ka + rlexs = real(exs(ix), kind=dp)*xdirps%d + rleys = real(eys(iy), kind=dp)*ydirps%d + rlezs = real(ezs(iz), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk + end do + end do + end do + end do + + end subroutine waves_set + +end module m_poisson_fft diff --git a/src/solver.f90 b/src/solver.f90 index 64339411..c9a04a11 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,7 +1,9 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -36,23 +38,39 @@ module m_solver !! for later use. 
real(dp) :: dt, nu + integer :: n_iters, n_output class(field_t), pointer :: u, v, w class(base_backend_t), pointer :: backend class(dirps_t), pointer :: xdirps, ydirps, zdirps class(time_intg_t), pointer :: time_integrator + procedure(poisson_solver), pointer :: poisson => null() contains procedure :: transeq procedure :: divergence_v2p procedure :: gradient_p2v procedure :: curl + procedure :: output procedure :: run end type solver_t + abstract interface + subroutine poisson_solver(self, pressure, div_u) + import :: solver_t + import :: field_t + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + end subroutine poisson_solver + end interface + interface solver_t module procedure init end interface solver_t + contains function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & @@ -68,8 +86,8 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init integer :: dims(3) - real(dp) :: dx, dy, dz - integer :: nx, ny, nz + real(dp) :: x, y, z + integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz solver%backend => backend solver%time_integrator => time_integrator @@ -88,24 +106,53 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & allocate(v_init(dims(1), dims(2), dims(3))) allocate(w_init(dims(1), dims(2), dims(3))) - u_init = 0 - v_init = 0 - w_init = 0 + solver%dt = globs%dt + solver%backend%nu = globs%nu + solver%n_iters = globs%n_iters + solver%n_output = globs%n_output - call solver%backend%set_fields( & - solver%u, solver%v, solver%w, u_init, v_init, w_init & - ) + sz = dims(1) + nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc + do ka = 1, nz + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ! Mapping to ix, iy, iz depends on global group numbering + ix = j; iy = (kb - 1)*sz + i; iz = ka + x = (ix - 1)*globs%dx + y = (iy - 1)*globs%dy + z = (iz - 1)*globs%dz + + b = ka + (kb - 1)*xdirps%n + u_init(i, j, b) = sin(x)*cos(y)*cos(z) + v_init(i, j, b) =-cos(x)*sin(y)*cos(z) + w_init(i, j, b) = 0 + end do + end do + end do + end do + + call solver%backend%set_field(solver%u, u_init) + call solver%backend%set_field(solver%v, v_init) + call solver%backend%set_field(solver%w, w_init) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' - nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc - dx = globs%dx; dy = globs%dy; dz = globs%dz - ! Allocate and set the tdsops - call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) - call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) - call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) + call allocate_tdsops(solver%xdirps, nx, globs%dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, globs%dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend) + + select case (globs%poisson_solver_type) + case (POISSON_SOLVER_FFT) + print*, 'Poisson solver: FFT' + call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) + solver%poisson => poisson_fft + case (POISSON_SOLVER_CG) + print*, 'Poisson solver: CG, not yet implemented' + solver%poisson => poisson_cg + end select end function init @@ -136,6 +183,10 @@ subroutine allocate_tdsops(dirps, nx, dx, backend) end subroutine subroutine transeq(self, du, dv, dw, u, v, w) + !! Skew-symmetric form of convection-diffusion terms in the + !! incompressible Navier-Stokes momentum equations, excluding + !! pressure terms. + !!
Inputs from velocity grid and outputs to velocity grid. implicit none class(solver_t) :: self @@ -217,6 +268,8 @@ subroutine transeq(self, du, dv, dw, u, v, w) end subroutine transeq subroutine divergence_v2p(self, div_u, u, v, w) + !! Divergence of a vector field (u, v, w). + !! Inputs from velocity grid and outputs to pressure grid. implicit none class(solver_t) :: self @@ -279,8 +332,8 @@ subroutine divergence_v2p(self, div_u, u, v, w) u_z => self%backend%allocator%get_block() w_z => self%backend%allocator%get_block() - ! dv_y = dv_y + dw_y - call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y) + ! du_y = dv_y + du_y + call self%backend%vecadd(1._dp, dv_y, 1._dp, du_y) ! reorder from y to z call self%backend%reorder(u_z, du_y, RDR_Y2Z) @@ -312,6 +365,8 @@ subroutine divergence_v2p(self, div_u, u, v, w) end subroutine divergence_v2p subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) + !! Gradient of a scalar field 'pressure'. + !! Inputs from pressure grid and outputs to velocity grid. implicit none class(solver_t) :: self @@ -379,9 +434,9 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) call self%backend%tds_solve(dpdx, p_sx_x, self%xdirps, & self%xdirps%stagder_p2v) call self%backend%tds_solve(dpdy, dpdy_sx_x, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_p2v) call self%backend%tds_solve(dpdz, dpdz_sx_x, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_p2v) ! release temporary x fields call self%backend%allocator%release_block(p_sx_x) @@ -390,80 +445,176 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) end subroutine gradient_p2v - subroutine curl(self, o_x, o_y, o_z, u, v, w) + subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) + !! Curl of a vector field (u, v, w). + !! Inputs from velocity grid and outputs to velocity grid. implicit none class(solver_t) :: self - class(field_t), intent(inout) :: o_x, o_y, o_z !! omega_x/_y/_z + !> Vector components of the output vector field Omega + class(field_t), intent(inout) :: o_i_hat, o_j_hat, o_k_hat class(field_t), intent(in) :: u, v, w - class(field_t), pointer :: u_y, w_y, du_y, u_z, v_z, du_z, dv_z - - ! o_x = dw/dy - dv/dz - ! o_y = du/dz - dw/dx - ! o_z = dv/dx - du/dy + class(field_t), pointer :: u_y, u_z, v_z, w_y, dwdy_y, dvdz_z, dvdz_x, & + dudz_z, dudz_x, dudy_y, dudy_x - ! obtain dw/dx, dv/dx and store them directly in omega_y, omega_z - call self%backend%tds_solve(o_y, w, self%xdirps, self%xdirps%der1st) - call self%backend%tds_solve(o_z, v, self%xdirps, self%xdirps%der1st) + ! omega_i_hat = dw/dy - dv/dz + ! omega_j_hat = du/dz - dw/dx + ! omega_k_hat = dv/dx - du/dy - u_y => self%backend%allocator%get_block() + ! omega_i_hat + ! dw/dy w_y => self%backend%allocator%get_block() - - call self%backend%reorder(u_y, u, RDR_X2Y) + dwdy_y => self%backend%allocator%get_block() call self%backend%reorder(w_y, w, RDR_X2Y) + call self%backend%tds_solve(dwdy_y, w_y, self%ydirps, self%ydirps%der1st) - du_y => self%backend%allocator%get_block() - - ! obtain du/dy, dw/dy - ! store du/dy in a temporary field to add into omega_z later - ! dw/dy can be stored directly in omega_x as it is empty - call self%backend%tds_solve(du_y, u_y, self%ydirps, self%ydirps%der1st) - call self%backend%tds_solve(o_x, w_y, self%ydirps, self%ydirps%der1st) + call self%backend%reorder(o_i_hat, dwdy_y, RDR_Y2X) - call self%backend%allocator%release_block(u_y) call self%backend%allocator%release_block(w_y) + call self%backend%allocator%release_block(dwdy_y) - ! 
omega_z = dv/dz - du/dy - call self%backend%vecadd(-1._dp, du_y, 1._dp, o_z) + ! dv/dz + v_z => self%backend%allocator%get_block() + dvdz_z => self%backend%allocator%get_block() + call self%backend%reorder(v_z, v, RDR_X2Z) + call self%backend%tds_solve(dvdz_z, v_z, self%zdirps, self%zdirps%der1st) - call self%backend%allocator%release_block(du_y) + dvdz_x => self%backend%allocator%get_block() + call self%backend%reorder(dvdz_x, dvdz_z, RDR_Z2X) - u_z => self%backend%allocator%get_block() - v_z => self%backend%allocator%get_block() - du_z => self%backend%allocator%get_block() - dv_z => self%backend%allocator%get_block() + call self%backend%allocator%release_block(v_z) + call self%backend%allocator%release_block(dvdz_z) - ! obtain du/dz, dv/dz and store them in temporary fields - call self%backend%tds_solve(du_z, u_z, self%zdirps, self%zdirps%der1st) - call self%backend%tds_solve(dv_z, v_z, self%zdirps, self%zdirps%der1st) + ! omega_i_hat = dw/dy - dv/dz + call self%backend%vecadd(-1._dp, dvdz_x, 1._dp, o_i_hat) - ! omega_x = dw/dy - dv/dz - call self%backend%vecadd(-1._dp, dv_z, 1._dp, o_x) - ! omega_y = du/dz - dw/dx - call self%backend%vecadd(1._dp, du_z, -1._dp, o_y) + call self%backend%allocator%release_block(dvdz_x) + + ! omega_j_hat + ! du/dz + u_z => self%backend%allocator%get_block() + dudz_z => self%backend%allocator%get_block() + call self%backend%reorder(u_z, u, RDR_X2Z) + call self%backend%tds_solve(dudz_z, u_z, self%zdirps, self%zdirps%der1st) + + dudz_x => self%backend%allocator%get_block() + call self%backend%reorder(dudz_x, dudz_z, RDR_Z2X) call self%backend%allocator%release_block(u_z) - call self%backend%allocator%release_block(v_z) - call self%backend%allocator%release_block(du_z) - call self%backend%allocator%release_block(dv_z) + call self%backend%allocator%release_block(dudz_z) + + ! dw/dx + call self%backend%tds_solve(o_j_hat, w, self%xdirps, self%xdirps%der1st) + + ! omega_j_hat = du/dz - dw/dx + call self%backend%vecadd(1._dp, dudz_x, -1._dp, o_j_hat) + + call self%backend%allocator%release_block(dudz_x) + + ! omega_k_hat + ! dv/dx + call self%backend%tds_solve(o_k_hat, v, self%xdirps, self%xdirps%der1st) + + ! du/dy + u_y => self%backend%allocator%get_block() + dudy_y => self%backend%allocator%get_block() + call self%backend%reorder(u_y, u, RDR_X2Y) + call self%backend%tds_solve(dudy_y, u_y, self%ydirps, self%ydirps%der1st) + + dudy_x => self%backend%allocator%get_block() + call self%backend%reorder(dudy_x, dudy_y, RDR_Y2X) + + call self%backend%allocator%release_block(u_y) + call self%backend%allocator%release_block(dudy_y) + + ! omega_k_hat = dv/dx - du/dy + call self%backend%vecadd(-1._dp, dudy_x, 1._dp, o_k_hat) + + call self%backend%allocator%release_block(dudy_x) end subroutine curl - subroutine run(self, n_iter, u_out, v_out, w_out) + subroutine poisson_fft(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + ! call forward FFT + ! output array in spectral space is stored at poisson_fft class + call self%backend%poisson_fft%fft_forward(div_u) + + ! postprocess + call self%backend%poisson_fft%fft_postprocess + + ! 
call backward FFT + call self%backend%poisson_fft%fft_backward(pressure) + + end subroutine poisson_fft + + subroutine poisson_cg(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + end subroutine poisson_cg + + subroutine output(self, t, u_out) + implicit none + + class(solver_t), intent(in) :: self + real(dp), intent(in) :: t + real(dp), dimension(:, :, :), intent(inout) :: u_out + + class(field_t), pointer :: du, dv, dw + integer :: ngrid + + ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n + print*, 'time = ', t + + du => self%backend%allocator%get_block() + dv => self%backend%allocator%get_block() + dw => self%backend%allocator%get_block() + + call self%curl(du, dv, dw, self%u, self%v, self%w) + print*, 'enstrophy:', 0.5_dp*( & + self%backend%scalar_product(du, du) & + + self%backend%scalar_product(dv, dv) & + + self%backend%scalar_product(dw, dw) & + )/ngrid + + call self%backend%allocator%release_block(du) + call self%backend%allocator%release_block(dv) + call self%backend%allocator%release_block(dw) + + call self%divergence_v2p(du, self%u, self%v, self%w) + call self%backend%get_field(u_out, du) + print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid + + end subroutine output + + subroutine run(self, u_out, v_out, w_out) implicit none class(solver_t), intent(in) :: self - integer, intent(in) :: n_iter real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out class(field_t), pointer :: du, dv, dw, div_u, pressure, dpdx, dpdy, dpdz + real(dp) :: t integer :: i + print*, 'initial conditions' + t = 0._dp + call self%output(t, u_out) + print*, 'start run' - do i = 1, n_iter + do i = 1, self%n_iters du => self%backend%allocator%get_block() dv => self%backend%allocator%get_block() dw => self%backend%allocator%get_block() @@ -485,7 +636,7 @@ subroutine run(self, n_iter, u_out, v_out, w_out) pressure => self%backend%allocator%get_block() - !call self%poisson(pressure, div_u) + call self%poisson(pressure, div_u) call self%backend%allocator%release_block(div_u) @@ -505,13 +656,18 @@ subroutine run(self, n_iter, u_out, v_out, w_out) call self%backend%allocator%release_block(dpdx) call self%backend%allocator%release_block(dpdy) call self%backend%allocator%release_block(dpdz) + + if ( mod(i, self%n_output) == 0 ) then + t = i*self%dt + call self%output(t, u_out) + end if end do print*, 'run end' - call self%backend%get_fields( & - u_out, v_out, w_out, self%u, self%v, self%w & - ) + call self%backend%get_field(u_out, self%u) + call self%backend%get_field(v_out, self%v) + call self%backend%get_field(w_out, self%w) end subroutine run diff --git a/src/tdsops.f90 b/src/tdsops.f90 index c159cdd2..3db7b4d9 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -27,6 +27,7 @@ module m_tdsops dist_sa, dist_sc, & !! back subs. dist_af !! 
the auxiliary factors real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) + real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp integer :: n, n_halo contains procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess @@ -46,6 +47,7 @@ module m_tdsops stagder_v2p, stagder_p2v, & interpl_v2p, interpl_p2v integer :: nrank, nproc, pnext, pprev, n, n_blocks + real(dp) :: L, d end type dirps_t contains @@ -154,6 +156,9 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = afi; self%b = bfi + self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, & 0._dp, & afi, bfi, 0._dp, 0._dp] @@ -321,6 +326,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = asi; self%b = bsi; self%c = csi; self%d = dsi + self%coeffs(:) = [dsi, csi, bsi, asi, & -2._dp*(asi + bsi + csi + dsi), & asi, bsi, csi, dsi] @@ -511,6 +519,9 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aici; self%b = bici; self%c = cici; self%d = dici + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, dici, cici, bici, & @@ -633,6 +644,9 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aci; self%b = bci + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index d12a942e..8a725169 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -61,6 +61,10 @@ subroutine step(self, u, v, w, du, dv, dw, dt) real(dp), intent(in) :: dt + call self%backend%vecadd(dt, du, 1._dp, u) + call self%backend%vecadd(dt, dv, 1._dp, v) + call self%backend%vecadd(dt, dw, 1._dp, w) + end subroutine step subroutine adams_bashford_1st(vels, olds, coeffs) diff --git a/src/xcompact.f90 b/src/xcompact.f90 index be9295d9..635db2a7 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -3,7 +3,8 @@ program xcompact use m_allocator use m_base_backend - use m_common, only: pi, globs_t, set_pprev_pnext + use m_common, only: pi, globs_t, set_pprev_pnext, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_solver, only: solver_t use m_time_integrator, only: time_intg_t use m_tdsops, only: tdsops_t @@ -55,13 +56,21 @@ program xcompact ! read L_x/y/z from the input file globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi + xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz ! read ns from the input file - globs%nx = 512; globs%ny = 512; globs%nz = 512 + globs%nx = 256; globs%ny = 256; globs%nz = 256 + + globs%dt = 0.001_dp + globs%nu = 1._dp/1600._dp + globs%n_iters = 20000 + globs%n_output = 100 ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 + globs%poisson_solver_type = POISSON_SOLVER_FFT + ! 
Lets allow a 1D decomposition for the moment !globs%nproc_x = nproc @@ -91,6 +100,8 @@ program xcompact globs%dy = globs%Ly/globs%ny globs%dz = globs%Lz/globs%nz + xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz + xdirps%n = globs%nx_loc ydirps%n = globs%ny_loc zdirps%n = globs%nz_loc @@ -117,8 +128,6 @@ program xcompact print*, 'OpenMP backend instantiated' #endif - backend%nu = 1._dp - allocate(u(SZ, globs%nx_loc, globs%n_groups_x)) allocate(v(SZ, globs%nx_loc, globs%n_groups_x)) allocate(w(SZ, globs%nx_loc, globs%n_groups_x)) @@ -130,7 +139,7 @@ program xcompact call cpu_time(t_start) - call solver%run(100, u, v, w) + call solver%run(u, v, w) call cpu_time(t_end) @@ -138,4 +147,6 @@ program xcompact print*, 'norms', norm2(u), norm2(v), norm2(w) + call MPI_Finalize(ierr) + end program xcompact diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index 833b0774..bacd3450 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -6,7 +6,7 @@ program test_cuda_reorder use m_common, only: dp use m_cuda_common, only: SZ use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & - reorder_y2z, reorder_z2y + reorder_y2z, reorder_z2x, reorder_z2y implicit none @@ -83,50 +83,46 @@ program test_cuda_reorder threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_temp_d, nx, nz) - ! store this in host - u_o = u_o_d + ! store initial y oriented field + u_temp = u_temp_d - ! and zeroize u_o_d again in any case - u_o_d = 0 + ! z oriented field into y + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) - ! reorder initial random field into z orientation - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_x2z<<>>(u_o_d, u_i_d, nz) - u_temp = u_o_d + u_o = u_temp_d - ! compare two z oriented fields + ! compare two y oriented fields norm_u = norm2(u_o - u_temp) if (nrank == 0) then if ( norm_u > tol ) then allpass = .false. - write(stderr, '(a)') 'Check reorder x2z and y2z... failed' + write(stderr, '(a)') 'Check reorder y2z and z2y... failed' else - write(stderr, '(a)') 'Check reorder x2z and y2z... passed' + write(stderr, '(a)') 'Check reorder y2z and z2y... passed' end if end if - ! z oriented field into y - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) - - ! zeroize just in case for reusing - u_o_d = 0 + ! reorder initial random field into z orientation - blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, nz) - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_y2x<<>>(u_o_d, u_temp_d, nz) - u_o = u_o_d + ! z oriented field into x + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_temp_d, u_o_d, nz) + u_o = u_temp_d - ! and check whether it matches the initial random field + ! compare two x oriented fields norm_u = norm2(u_o - u_i) if (nrank == 0) then if ( norm_u > tol ) then allpass = .false. - write(stderr, '(a)') 'Check reorder z2y and y2x... failed' + write(stderr, '(a)') 'Check reorder x2z and z2x... failed' else - write(stderr, '(a)') 'Check reorder z2y and y2x... passed' + write(stderr, '(a)') 'Check reorder x2z and z2x...
passed' end if end if @@ -171,6 +167,16 @@ program test_cuda_reorder call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_o_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call cpu_time(tstart) do i = 1, n_iters blocks = dim3(nx/SZ, ny/SZ, nz)
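Note on the perf loops above: each one reports through checkperf with a memops argument of 2._dp, since a reorder reads every degree of freedom once and writes it once. A minimal sketch of the implied effective-bandwidth arithmetic follows; it is illustrative only (checkperf itself lives in the shared test utilities and may differ), and the grid size, iteration count, and timing are made-up numbers. program bandwidth_ref implicit none integer, parameter :: dp = kind(0.0d0) integer(kind=8) :: ndof integer :: n_iters real(dp) :: elapsed, memops, bw ! illustrative numbers: a 256^3 field reordered 1000 times in 2 seconds ndof = 256_8**3 n_iters = 1000 elapsed = 2._dp memops = 2._dp ! one read + one write per degree of freedom ! bytes moved per iteration = memops*ndof*8, since real(dp) is 8 bytes bw = memops*real(ndof, dp)*8._dp*n_iters/elapsed/1.0e9_dp print '(a, f8.3, a)', 'effective bandwidth: ', bw, ' GB/s' end program bandwidth_ref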