Skip to content

Commit

Permalink
Merge github.com:xcompact3d/x3d2 into omp-reorder
Browse files Browse the repository at this point in the history
  • Loading branch information
Nanoseb committed Mar 11, 2024
2 parents cd6d7a6 + e916fe8 commit 5c889cb
Show file tree
Hide file tree
Showing 15 changed files with 1,282 additions and 143 deletions.
5 changes: 5 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ set(SRC
allocator.f90
backend.f90
common.f90
poisson_fft.f90
solver.f90
tdsops.f90
time_integrator.f90
omp/backend.f90
omp/common.f90
omp/kernels/distributed.f90
omp/poisson_fft.f90
omp/sendrecv.f90
omp/exec_dist.f90
)
Expand All @@ -17,7 +19,9 @@ set(CUDASRC
cuda/common.f90
cuda/exec_dist.f90
cuda/kernels/distributed.f90
cuda/kernels/complex.f90
cuda/kernels/reorder.f90
cuda/poisson_fft.f90
cuda/sendrecv.f90
cuda/tdsops.f90
)
Expand All @@ -40,6 +44,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR
set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp")
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast")
target_link_options(x3d2 INTERFACE "-cuda")
target_link_options(x3d2 INTERFACE "-lcufft")

target_compile_options(xcompact PRIVATE "-DCUDA")
# target_link_options(xcompact INTERFACE "-cuda")
Expand Down
50 changes: 39 additions & 11 deletions src/backend.f90
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module m_base_backend
use m_allocator, only: allocator_t, field_t
use m_common, only: dp
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: tdsops_t, dirps_t

implicit none
Expand All @@ -24,6 +25,7 @@ module m_base_backend
integer :: nx_loc, ny_loc, nz_loc
class(allocator_t), pointer :: allocator
class(dirps_t), pointer :: xdirps, ydirps, zdirps
class(poisson_fft_t), pointer :: poisson_fft
contains
procedure(transeq_ders), deferred :: transeq_x
procedure(transeq_ders), deferred :: transeq_y
Expand All @@ -33,9 +35,11 @@ module m_base_backend
procedure(sum_intox), deferred :: sum_yintox
procedure(sum_intox), deferred :: sum_zintox
procedure(vecadd), deferred :: vecadd
procedure(get_fields), deferred :: get_fields
procedure(set_fields), deferred :: set_fields
procedure(scalar_product), deferred :: scalar_product
procedure(get_field), deferred :: get_field
procedure(set_field), deferred :: set_field
procedure(alloc_tdsops), deferred :: alloc_tdsops
procedure(init_poisson_fft), deferred :: init_poisson_fft
end type base_backend_t

abstract interface
Expand Down Expand Up @@ -126,7 +130,20 @@ end subroutine vecadd
end interface

abstract interface
subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
real(dp) function scalar_product(self, x, y) result(s)
!! Calculates the scalar product of two input fields
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(in) :: x, y
end function scalar_product
end interface

abstract interface
subroutine get_field(self, arr, f)
!! copy the specialist data structure from device or host back
!! to a regular 3D data structure.
import :: base_backend_t
Expand All @@ -135,23 +152,23 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
implicit none

class(base_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
end subroutine get_fields
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f
end subroutine get_field

subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
subroutine set_field(self, f, arr)
!! copy the initial condition stored in a regular 3D data
!! structure into the specialist data structure arrays on the
!! structure into the specialist data structure array on the
!! device or host.
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
end subroutine set_fields
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr
end subroutine set_field
end interface

abstract interface
Expand All @@ -174,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, &
end subroutine alloc_tdsops
end interface

abstract interface
subroutine init_poisson_fft(self, xdirps, ydirps, zdirps)
import :: base_backend_t
import :: dirps_t
implicit none

class(base_backend_t) :: self
type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
end subroutine init_poisson_fft
end interface

end module m_base_backend
6 changes: 5 additions & 1 deletion src/common.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@ module m_common
real(dp), parameter :: pi = 4*atan(1.0_dp)

integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
RDR_Y2Z = 23, RDR_Z2Y = 32
RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32
integer, parameter :: dir_X = 1, dir_Y = 2, dir_Z = 3
integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1

type :: globs_t
integer :: nx, ny, nz
integer :: nx_loc, ny_loc, nz_loc
integer :: n_groups_x, n_groups_y, n_groups_z
real(dp) :: Lx, Ly, Lz
real(dp) :: dx, dy, dz
real(dp) :: nu, dt
integer :: n_iters, n_output
integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1
character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e
integer :: poisson_solver_type
end type globs_t

contains
Expand Down
89 changes: 70 additions & 19 deletions src/cuda/backend.f90
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
module m_cuda_backend
use iso_fortran_env, only: stderr => error_unit
use cudafor
use mpi

use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
use m_common, only: dp, globs_t, &
RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: dirps_t, tdsops_t

use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
use m_cuda_common, only: SZ
use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact
use m_cuda_poisson_fft, only: cuda_poisson_fft_t
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
use m_cuda_tdsops, only: cuda_tdsops_t
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_reorder, only: &
reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2y, &
sum_yintox, sum_zintox, axpby, buffer_copy
reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy

implicit none

Expand All @@ -40,8 +44,10 @@ module m_cuda_backend
procedure :: sum_yintox => sum_yintox_cuda
procedure :: sum_zintox => sum_zintox_cuda
procedure :: vecadd => vecadd_cuda
procedure :: set_fields => set_fields_cuda
procedure :: get_fields => get_fields_cuda
procedure :: scalar_product => scalar_product_cuda
procedure :: set_field => set_field_cuda
procedure :: get_field => get_field_cuda
procedure :: init_poisson_fft => init_cuda_poisson_fft
procedure :: transeq_cuda_dist
procedure :: transeq_cuda_thom
procedure :: tds_solve_dist
Expand All @@ -60,6 +66,7 @@ function init(globs, allocator) result(backend)
class(allocator_t), target, intent(inout) :: allocator
type(cuda_backend_t) :: backend

type(cuda_poisson_fft_t) :: cuda_poisson_fft
integer :: n_halo, n_block

select type(allocator)
Expand Down Expand Up @@ -452,6 +459,10 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
threads = dim3(SZ, SZ, 1)
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
self%nx_loc, self%nz_loc)
case (RDR_Z2X) ! z2x
blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_z2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Z2Y) ! z2y
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
Expand Down Expand Up @@ -524,6 +535,35 @@ subroutine vecadd_cuda(self, a, x, b, y)

end subroutine vecadd_cuda

real(dp) function scalar_product_cuda(self, x, y) result(s)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(in) :: x, y

real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
real(dp), device, allocatable :: sum_d
type(dim3) :: blocks, threads
integer :: n, ierr

select type(x); type is (cuda_field_t); x_d => x%data_d; end select
select type(y); type is (cuda_field_t); y_d => y%data_d; end select

allocate (sum_d)
sum_d = 0._dp

n = size(x_d, dim = 2)
blocks = dim3(size(x_d, dim = 3), 1, 1)
threads = dim3(SZ, 1, 1)
call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, n)

s = sum_d

call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
MPI_COMM_WORLD, ierr)

end function scalar_product_cuda

subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
implicit none

Expand All @@ -542,31 +582,42 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)

end subroutine copy_into_buffers

subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in)
subroutine set_field_cuda(self, f, arr)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr

select type(u); type is (cuda_field_t); u%data_d = u_in; end select
select type(v); type is (cuda_field_t); v%data_d = v_in; end select
select type(w); type is (cuda_field_t); w%data_d = w_in; end select
select type(f); type is (cuda_field_t); f%data_d = arr; end select

end subroutine set_fields_cuda
end subroutine set_field_cuda

subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w)
subroutine get_field_cuda(self, arr, f)
implicit none

class(cuda_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f

select type(f); type is (cuda_field_t); arr = f%data_d; end select

select type(u); type is (cuda_field_t); u_out = u%data_d; end select
select type(v); type is (cuda_field_t); v_out = v%data_d; end select
select type(w); type is (cuda_field_t); w_out = w%data_d; end select
end subroutine get_field_cuda

subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps)
implicit none

class(cuda_backend_t) :: self
type(dirps_t), intent(in) :: xdirps, ydirps, zdirps

allocate(cuda_poisson_fft_t :: self%poisson_fft)

select type (poisson_fft => self%poisson_fft)
type is (cuda_poisson_fft_t)
poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps)
end select

end subroutine get_fields_cuda
end subroutine init_cuda_poisson_fft

end module m_cuda_backend

Loading

0 comments on commit 5c889cb

Please sign in to comment.