From 532a64f9f5d031b6d13357766c56d143e4d10881 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:12:59 +0100 Subject: [PATCH] Use fully parallel atomic initialization --- src/PointNeighbors.jl | 2 +- src/cell_lists/full_grid.jl | 2 +- src/nhs_grid.jl | 60 ++++++++++++++++++++++++++++++++++--- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 22ef3893..986f18a0 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -23,7 +23,7 @@ include("gpu.jl") export foreach_point_neighbor, foreach_neighbor export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch export DictionaryCellList, FullGridCellList -export ParallelUpdate, SemiParallelUpdate, SerialUpdate +export ParallelUpdate, SemiParallelUpdate, SerialUpdate, ParallelIncrementalUpdate export initialize!, update!, initialize_grid!, update_grid! export PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend export PeriodicBox, copy_neighborhood_search diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index 32cde5d1..bf468d1b 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -37,7 +37,7 @@ struct FullGridCellList{C, LI, MINC, MAXC} <: AbstractCellList end function supported_update_strategies(::FullGridCellList{<:DynamicVectorOfVectors}) - return (ParallelUpdate, SemiParallelUpdate, SerialUpdate) + return (ParallelUpdate, SemiParallelUpdate, ParallelIncrementalUpdate, SerialUpdate) end supported_update_strategies(::FullGridCellList) = (SemiParallelUpdate, SerialUpdate) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 3b5dd266..fd2489ec 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -106,14 +106,25 @@ end """ ParallelUpdate() -Fully parallel update by using atomic operations to avoid race conditions when adding points -into the same cell. +Fully parallel initialization and update by using atomic operations to avoid race conditions +when adding points into the same cell. This is not available for all cell list implementations, but is the default when available. See [`GridNeighborhoodSearch`](@ref) for usage information. """ struct ParallelUpdate end +""" + ParallelIncrementalUpdate() + +Like [`ParallelUpdate`](@ref), but only updates the cells that have changed. +This is generally slower than a full reinitialization with [`ParallelUpdate`](@ref), +but is included for benchmarking purposes. + +See [`GridNeighborhoodSearch`](@ref) for usage information. +""" +struct ParallelIncrementalUpdate end + """ SemiParallelUpdate() @@ -138,7 +149,10 @@ See [`GridNeighborhoodSearch`](@ref) for usage information. struct SerialUpdate end # No update buffer needed for fully parallel update -@inline create_update_buffer(::ParallelUpdate, _, _) = nothing +@inline function create_update_buffer(::Union{ParallelUpdate, ParallelIncrementalUpdate}, + _, _) + return nothing +end @inline function create_update_buffer(::SemiParallelUpdate, cell_list, n_points) # Create update buffer and initialize it with empty vectors @@ -185,6 +199,35 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra return neighborhood_search end +# WARNING! Undocumented, experimental feature: +# By default, determine the parallelization backend from the type of `y`. +# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code +# on this backend. This can be useful to run GPU kernels on the CPU by passing +# `parallelization_backend = KernelAbstractions.CPU()`, even though `y isa Array`. +function initialize_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate}, + y::AbstractMatrix; parallelization_backend = y) + (; cell_list) = neighborhood_search + + empty!(cell_list) + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + @threaded parallelization_backend for point in axes(y, 2) + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + push_cell_atomic!(cell_list, cell, point) + end + + return neighborhood_search +end + # WARNING! Undocumented, experimental feature: # By default, determine the parallelization backend from the type of `x`. # Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code @@ -300,7 +343,8 @@ end # Fully parallel update with atomic push. # See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`. -function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate}, +function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, ParallelUpdate}, + GridNeighborhoodSearch{<:Any, ParallelIncrementalUpdate}}, coords_fun::Function; parallelization_backend = nothing) (; cell_list) = neighborhood_search @@ -343,6 +387,14 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle return neighborhood_search end +# Note that this is only defined when a matrix `y` is passed. When updating with a function, +# it will fall back to the semi-parallel update. +function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate}, + y::AbstractMatrix; parallelization_backend = y) + # The parallel (atomic) initialization is usually faster than the incremental update + initialize_grid!(neighborhood_search, y; parallelization_backend) +end + @propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search::GridNeighborhoodSearch, point;