Refactor boundary hiding
utkinis committed Oct 16, 2023
1 parent 82b17c6 commit 684e406
Showing 14 changed files with 228 additions and 149 deletions.
13 changes: 6 additions & 7 deletions ext/AMDGPUExt/AMDGPUExt.jl
@@ -1,16 +1,15 @@
module AMDGPUExt

using AMDGPU
using KernelAbstractions

using FastIce.Architecture
import FastIce.Architecture: heuristic_groupsize, set_device!

set_device!(dev::HIPDevice) = AMDGPU.device!(dev)

set_device!(::ROCBackend, id::Integer) = AMDGPU.device_id!(id)
set_device!(::HIPDevice, id::Integer) = AMDGPU.device_id!(id)

heuristic_groupsize(::ROCBackend, ::Val{1}) = (256, )
heuristic_groupsize(::ROCBackend, ::Val{2}) = (128, 2, )
heuristic_groupsize(::ROCBackend, ::Val{3}) = (128, 2, 1, )
heuristic_groupsize(::HIPDevice, ::Val{1}) = (256, )
heuristic_groupsize(::HIPDevice, ::Val{2}) = (128, 2, )
heuristic_groupsize(::HIPDevice, ::Val{3}) = (128, 2, 1, )

end
end
13 changes: 6 additions & 7 deletions ext/CUDAExt/CUDAExt.jl
@@ -1,16 +1,15 @@
module CUDAExt

using CUDA
using KernelAbstractions

using FastIce.Architecture
import FastIce.Architecture: heuristic_groupsize, set_device!

set_device!(dev::CuDevice) = CUDA.device!(dev)

set_device!(::CUDABackend, id::Integer) = CUDA.device!(id-1)
set_device!(::CuDevice, id::Integer) = CUDA.device!(id-1)

heuristic_groupsize(::CUDABackend, ::Val{1}) = (256, )
heuristic_groupsize(::CUDABackend, ::Val{2}) = (32, 8, )
heuristic_groupsize(::CUDABackend, ::Val{3}) = (32, 8, 1, )
heuristic_groupsize(::CuDevice, ::Val{1}) = (256, )
heuristic_groupsize(::CuDevice, ::Val{2}) = (32, 8, )
heuristic_groupsize(::CuDevice, ::Val{3}) = (32, 8, 1, )

end
end
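
In both extensions the tuning hooks now appear to dispatch on the concrete device type (HIPDevice, CuDevice) in addition to the backend, so that a device picked by set_device! can be queried directly. A minimal sketch of the hook pattern a hypothetical third backend would implement (FooBackend, FooDevice and the Foo.device! calls are placeholders, not part of the commit; the import path follows src/Architectures.jl):

    import FastIce.Architectures: heuristic_groupsize, set_device!

    set_device!(dev::FooDevice) = Foo.device!(dev)            # activate an already-selected device
    set_device!(::FooBackend, id::Integer) = Foo.device!(id)  # select a device by 1-based id and return it

    heuristic_groupsize(::FooDevice, ::Val{1}) = (256,)       # default workgroup size per dimensionality
    heuristic_groupsize(::FooDevice, ::Val{2}) = (32, 8)
    heuristic_groupsize(::FooDevice, ::Val{3}) = (32, 8, 1)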
52 changes: 52 additions & 0 deletions scripts_future_API/benchmark_dbc.jl
@@ -0,0 +1,52 @@
using FastIce.Architectures
using FastIce.Distributed
using FastIce.Fields
using FastIce.Grids
using FastIce.BoundaryConditions
using FastIce.KernelLaunch

using KernelAbstractions
using MPI

@kernel function fill_field!(f, val, offset=nothing)
    I = @index(Global, Cartesian)
    if !isnothing(offset)
        I += offset
    end
    f[I] = val
end

MPI.Init()

arch = Architecture(CPU(), (0, 0))
grid = CartesianGrid(; origin=(0.0, 0.0), extent=(1.0, 1.0), size=(10, 10))
field = Field(backend(arch), grid, (Center(), Center()); halo=1)

me = global_rank(details(arch))

bc = FieldBoundaryConditions((field,), (DirichletBC{HalfCell}(me),))

boundary_conditions = ((bc, bc),
                       (bc, bc))

boundary_conditions = ntuple(Val(length(boundary_conditions))) do D
    ntuple(Val(2)) do S
        if neighbor(details(arch), D, S) != MPI.PROC_NULL
            DistributedBoundaryConditions(Val(S), Val(D), (field, ))
        else
            boundary_conditions[D][S]
        end
    end
end

hide_boundaries = HideBoundaries{2}(arch)

outer_width = (4, 4)

launch!(arch, grid, fill_field! => (field, me); location=Center(), hide_boundaries, boundary_conditions, outer_width)

sleep(me)
@show coordinates(details(arch))
display(parent(field))

MPI.Finalize()
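
Assumed workflow for the new script (not part of the commit): start one process per MPI rank, for example with MPI.jl's mpiexecjl launcher,

    mpiexecjl -n 4 julia --project scripts_future_API/benchmark_dbc.jl

after which each rank prints its Cartesian coordinates and its local array; the interior holds the rank id written by fill_field!, while the halo regions reflect either the Dirichlet value or the neighbouring rank's data after the exchange.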
67 changes: 22 additions & 45 deletions src/Architectures.jl
@@ -1,69 +1,46 @@
module Architectures

export AbstractArchitecture

export SingleDeviceArchitecture
export Architecture

export launch!, set_device!, set_device_and_priority!, heuristic_groupsize
export synchronize
export synchronize, backend, device, details

using FastIce.Grids

using KernelAbstractions
import KernelAbstractions.Kernel

abstract type AbstractArchitecture end

device(::AbstractArchitecture) = error("device function must be defined for architecture")
backend(::AbstractArchitecture) = error("backend function must be defined for architecture")

set_device!(arch::AbstractArchitecture) = set_device!(device(arch))

synchronize(arch::AbstractArchitecture) = KernelAbstractions.synchronize(backend(arch))

function set_device_and_priority!(arch::AbstractArchitecture, prio::Symbol)
set_device!(arch)
KernelAbstractions.priority!(backend(arch), prio)
return
end

heuristic_groupsize(arch::AbstractArchitecture) = heuristic_groupsize(device(arch))

struct SingleDeviceArchitecture{B,D} <: AbstractArchitecture
struct Architecture{Kind,B,D,Details}
backend::B
device::D
details::Details
end

function SingleDeviceArchitecture(backend::Backend)
device = set_device!(backend, 1)
return SingleDeviceArchitecture(backend, device)
end

set_device!(::SingleDeviceArchitecture{CPU}) = nothing
set_device!(::CPU, id::Integer) = nothing

heuristic_groupsize(::SingleDeviceArchitecture{CPU}) = 256

device(arch::SingleDeviceArchitecture) = arch.device
struct SingleDevice end

backend(arch::SingleDeviceArchitecture) = arch.backend

function launch!(arch::SingleDeviceArchitecture, grid::CartesianGrid, kernel; kwargs...)
worksize = size(grid, Vertex())
launch!(arch, worksize, kernel; kwargs...)
function Architecture(backend::Backend, device_id::Integer=1)
device = set_device!(backend, device_id)
return Architecture{SingleDevice,typeof(backend),typeof(device),Nothing}(backend, device, nothing)
end

function launch!(arch::SingleDeviceArchitecture, worksize::NTuple{N,Int}, kernel::Pair{Kernel,Args};
boundary_conditions=nothing, async=true) where {N,Args}
fun, args = kernel
device(arch::Architecture) = arch.device
backend(arch::Architecture) = arch.backend
details(arch::Architecture) = arch.details

groupsize = heuristic_groupsize(arch)
synchronize(arch::Architecture) = KernelAbstractions.synchronize(arch.backend)

fun(arch.backend, groupsize, worksize)(args...)
isnothing(boundary_conditions) || apply_boundary_conditions!(boundary_conditions)
set_device!(arch::Architecture) = set_device!(arch.device)

async || synchronize(arch.backend)
function set_device_and_priority!(arch::Architecture, prio::Symbol)
set_device!(arch)
KernelAbstractions.priority!(arch.backend, prio)
return
end

set_device!(::Architecture{Kind,CPU}) where {Kind} = nothing
set_device!(::CPU, id::Integer) = nothing

heuristic_groupsize(arch::Architecture, ::Val{N}) where {N} = heuristic_groupsize(arch.device, Val(N))
heuristic_groupsize(::Architecture{Kind,CPU}, N) where {Kind} = 256

end
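
The former SingleDeviceArchitecture and DistributedArchitecture wrappers are folded into one parametric Architecture{Kind,B,D,Details}: Kind tags the flavour (SingleDevice here, DistributedMPI in the Distributed module), while details carries flavour-specific state such as the MPI topology. A minimal single-device sketch on the CPU backend (illustrative; GPU backends require the corresponding package extension):

    using KernelAbstractions
    using FastIce.Architectures

    arch = Architecture(CPU())      # single-device flavour; device selection is a no-op on the CPU
    backend(arch)                   # CPU()
    device(arch)                    # the selected device (nothing for the CPU backend)
    details(arch)                   # nothing in single-device mode, the Cartesian topology in distributed mode
    Architectures.synchronize(arch) # KernelAbstractions.synchronize on the stored backend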
6 changes: 5 additions & 1 deletion src/BoundaryConditions/BoundaryConditions.jl
@@ -1,15 +1,18 @@
module BoundaryConditions

export FieldBoundaryConditions
export apply_boundary_conditions!
export apply_boundary_conditions!, apply_all_boundary_conditions!

export DirichletBC, HalfCell, FullCell
export ContinuousBC, DiscreteBC
export BoundaryFunction, DiscreteBoundaryFunction, ContinuousBoundaryFunction

export HideBoundaries, hide

using FastIce.Grids
using FastIce.Fields
using FastIce.Utils
using FastIce.Architectures

using KernelAbstractions
using Adapt
@@ -18,5 +21,6 @@ include("utils.jl")
include("boundary_function.jl")
include("dirichlet_bc.jl")
include("field_boundary_conditions.jl")
include("hide_boundaries.jl")

end
22 changes: 11 additions & 11 deletions src/BoundaryConditions/field_boundary_conditions.jl
@@ -3,22 +3,13 @@ struct FieldBoundaryConditions{F<:Tuple,B<:Tuple}
conditions::B
end

function _validate_boundary_conditions(bc::FieldBoundaryConditions, dim, side)
for f in bc.fields
if halo(f, dim, side) < 1
error("to apply boundary conditions, halo width must be at least 1")
end
end
return
end

function apply_boundary_conditions!(::Val{S}, ::Val{D}, backend::Backend, grid::CartesianGrid,
function apply_boundary_conditions!(::Val{S}, ::Val{D}, arch::Architecture, grid::CartesianGrid,
bc::FieldBoundaryConditions; async=true) where {S,D}
_validate_boundary_conditions(bc, D, S)
sizes = ntuple(ifield -> remove_dim(Val(D), size(bc.fields[ifield])), Val(length(bc.fields)))
worksize = remove_dim(Val(D), size(grid, Vertex()))
# launch!(_apply_boundary_conditions! => (Val(S), Val(D), grid, bc.fields, bc.conditions); backend, worksize)
_apply_boundary_conditions!(backend, 256, worksize)(Val(S), Val(D), grid, sizes, bc.fields, bc.conditions)
_apply_boundary_conditions!(arch.backend, 256, worksize)(Val(S), Val(D), grid, sizes, bc.fields, bc.conditions)
async || KernelAbstractions.synchronize(backend)
return
end
@@ -37,3 +28,12 @@ end
end

@inline _apply_field_boundary_condition!(side, dim, grid, f, loc, Ibc, ::Nothing) = nothing

function _validate_boundary_conditions(bc::FieldBoundaryConditions, dim, side)
for f in bc.fields
if halo(f, dim, side) < 1
error("to apply boundary conditions, halo width must be at least 1")
end
end
return
end
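
With the new signature, callers pass the whole Architecture instead of a bare backend, and the boundary kernel is launched on arch.backend. An illustrative call applying the conditions of side S = 1 along dimension D = 2, with arch, grid and bc constructed as in scripts_future_API/benchmark_dbc.jl:

    apply_boundary_conditions!(Val(1), Val(2), arch, grid, bc)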
31 changes: 31 additions & 0 deletions src/BoundaryConditions/hide_boundaries.jl
@@ -0,0 +1,31 @@
struct HideBoundaries{N}
    pipelines::NTuple{N,Tuple{Pipeline,Pipeline}}
    function HideBoundaries{N}(arch::Architecture) where {N}
        pre() = set_device_and_priority!(arch, :high)
        pipelines = ntuple(Val(N)) do _
            return ntuple(_ -> Pipeline(; pre), Val(2))
        end
        return new{N}(pipelines)
    end
end

function hide(fun::F, hb::HideBoundaries{N}, arch::Architecture, grid::CartesianGrid{N}, boundary_conditions, worksize;
              outer_width=nothing) where {F,N}
    inner_range, outer_ranges = split_ndrange(worksize, outer_width)
    fun(inner_range)
    for dim in N:-1:1
        ntuple(Val(2)) do side
            pipe = hb.pipelines[dim][side]
            range = outer_ranges[dim][side]
            bc = boundary_conditions[dim][side]
            # execute outer range and boundary conditions asynchronously
            put!(pipe) do
                fun(range)
                apply_boundary_conditions!(Val(side), Val(dim), arch, grid, bc)
                Architectures.synchronize(arch)
            end
        end
        wait.(hb.pipelines[dim]) # synchronize spatial dimension
    end
    return
end
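
HideBoundaries holds one high-priority Pipeline per (dimension, side). hide runs the compute function on the inner range first, then enqueues every outer slab together with its boundary conditions so that halo work overlaps with the bulk computation, waiting on both pipelines of a dimension before moving on to the next. A schematic direct use (the do-block stands in for a kernel launch restricted to the given range; passing hide_boundaries and outer_width to launch!, as in the benchmark script, presumably wires this up automatically):

    hb = HideBoundaries{2}(arch)

    hide(hb, arch, grid, boundary_conditions, size(grid, Vertex()); outer_width=(4, 4)) do range
        # launch the compute kernel over `range` only (offset and extent); omitted here
    end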
21 changes: 8 additions & 13 deletions src/Distributed/Distributed.jl
@@ -3,31 +3,26 @@ module Distributed
using FastIce.Architectures
using FastIce.Grids
import FastIce.BoundaryConditions: apply_boundary_conditions!

using MPI
using KernelAbstractions

export CartesianTopology

export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator

export global_rank, shared_rank, node_name, cartesian_communicator, shared_communicator, coordinates
export dimensions, global_size, node_size

export global_grid_size, local_grid
export neighbors, neighbor

struct DistributedArchitecture{C,T,R} <: AbstractArchitecture
child_arch::C
topology::T
end
export DistributedBoundaryConditions

struct DistributedMPI end

function DistributedArchitecture(backend::Backend, dims::NTuple{N,Int}; comm=MPI.COMM_WORLD) where {N}
function Architectures.Architecture(backend::Backend, dims::NTuple{N,Int}, comm::MPI.Comm=MPI.COMM_WORLD) where {N}
topo = CartesianTopology(dims; comm)
device = set_device!(backend, shared_rank(topo))
child_arch = SingleDeviceArchitecture(backend, device)
return DistributedArchitecture(child_arch, topo)
return Architecture{DistributedMPI,typeof(backend),typeof(device),typeof(topo)}(backend, device, topo)
end

device(arch::DistributedArchitecture) = device(arch.child_arch)

include("topology.jl")
include("boundary_conditions.jl")

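
The distributed flavour reuses the same Architecture type: the dims-tuple method builds a CartesianTopology, selects the device from the node-local (shared) rank, and stores the topology in details. A sketch mirroring the benchmark script (CPU backend; zero entries in dims presumably let MPI choose the process grid):

    using MPI, KernelAbstractions
    using FastIce.Architectures, FastIce.Distributed

    MPI.Init()

    arch = Architecture(CPU(), (0, 0))   # Kind == DistributedMPI
    topo = details(arch)                 # the CartesianTopology
    global_rank(topo), coordinates(topo) # this rank's id and Cartesian coordinates
    neighbor(topo, 1, 2)                 # neighbour along dimension 1, side 2 (MPI.PROC_NULL at the boundary)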
14 changes: 8 additions & 6 deletions src/Distributed/boundary_conditions.jl
@@ -6,7 +6,9 @@ struct DistributedBoundaryConditions{F,B}
exchange_infos = ntuple(Val(N)) do idx
send_view = get_send_view(Val(S), Val(D), fields[idx])
recv_view = get_recv_view(Val(S), Val(D), fields[idx])
ExchangeInfo(similar(send_view), similar(recv_view))
send_buffer = similar(parent(send_view), eltype(send_view), size(send_view))
recv_buffer = similar(parent(recv_view), eltype(recv_view), size(recv_view))
ExchangeInfo(send_buffer, recv_buffer)
end
return new{typeof(fields),typeof(exchange_infos)}(fields, exchange_infos)
end
@@ -21,10 +23,10 @@ end

ExchangeInfo(send_buf, recv_buf) = ExchangeInfo(send_buf, recv_buf, MPI.REQUEST_NULL, MPI.REQUEST_NULL)

function apply_boundary_conditions!(::Val{S}, ::Val{D}, arch::DistributedArchitecture, grid::CartesianGrid,
function apply_boundary_conditions!(::Val{S}, ::Val{D}, arch::Architecture, grid::CartesianGrid,
bc::DistributedBoundaryConditions; async=true) where {S,D}
comm = cartesian_communicator(arch.topology)
nbrank = neighbor(arch.topology, D, S)
comm = cartesian_communicator(details(arch))
nbrank = neighbor(details(arch), D, S)

# initiate non-blocking MPI receive and device-to-device copy to the send buffer
for idx in eachindex(bc.fields)
@@ -41,8 +43,8 @@ function apply_boundary_conditions!(::Val{S}, ::Val{D}, arch::DistributedArchite
info.send_request = MPI.Isend(info.send_buffer, comm; dest=nbrank)
end

recv_ready = BitVector(false for _ in eachindex(recv_requests))
send_ready = BitVector(false for _ in eachindex(send_requests))
recv_ready = BitVector(false for _ in eachindex(bc.exchange_infos))
send_ready = BitVector(false for _ in eachindex(bc.exchange_infos))

# test send and receive requests, initiating device-to-device copy
# to the receive buffer if the receive is complete
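
Allocating the exchange buffers as similar(parent(view), eltype(view), size(view)) rather than similar(view) most likely guarantees dense buffers of the same storage type as the field's parent array (e.g. device arrays for GPU-backed fields), with the shape of the boundary slice, which is a safe target for the MPI requests and the device-to-device copies mentioned in the comments. Illustrative allocation, assuming a halo'd field and the view helpers from this file:

    v   = get_send_view(Val(1), Val(2), field)    # boundary slice: a view into the halo'd field
    buf = similar(parent(v), eltype(v), size(v))  # dense buffer with the field's storage type
    copyto!(buf, v)                               # stage the slice before MPI.Isend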
5 changes: 5 additions & 0 deletions src/FastIce.jl
@@ -1,5 +1,7 @@
module FastIce

# export KernelLaunch.launch!

using KernelAbstractions

include("Grids/Grids.jl")
@@ -11,11 +13,14 @@ include("Architectures.jl")

include("Utils/Utils.jl")


include("Physics.jl")

include("BoundaryConditions/BoundaryConditions.jl")
include("Models/models.jl")

include("KernelLaunch.jl")

include("Distributed/Distributed.jl")

end # module