Make GPUs actually usable #689

Draft · wants to merge 8 commits into main
6 changes: 4 additions & 2 deletions Project.toml
@@ -6,6 +6,7 @@ version = "0.2.4-dev"
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
ChangePrecision = "3cb15238-376d-56a3-8042-d33272777c9a"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
@@ -34,6 +35,7 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
[compat]
Adapt = "3, 4"
CSV = "0.10"
ChangePrecision = "1.1.0"
DataFrames = "1.6"
DelimitedFiles = "1"
DiffEqCallbacks = "2, 3, 4"
@@ -44,14 +46,14 @@ GPUArraysCore = "0.1, 0.2"
JSON = "0.21"
KernelAbstractions = "0.9"
MuladdMacro = "0.2"
PointNeighbors = "0.4.2"
PointNeighbors = "0.4.6"
Polyester = "0.7.10"
RecipesBase = "1"
Reexport = "1"
SciMLBase = "2"
StaticArrays = "1"
StrideArrays = "0.1"
TimerOutputs = "0.5.25"
TrixiBase = "0.1.3"
TrixiBase = "0.1.5"
WriteVTK = "1"
julia = "1.9"
68 changes: 57 additions & 11 deletions docs/src/gpu.md
@@ -1,24 +1,27 @@
# GPU Support
# [GPU Support](@id gpu_support)

GPU support is still an experimental feature that is actively being worked on.
As of now, the [`WeaklyCompressibleSPHSystem`](@ref) and the [`BoundarySPHSystem`](@ref)
are supported on GPUs.
We have tested this on GPUs by Nvidia and AMD.
As of now, the [`WeaklyCompressibleSPHSystem`](@ref), the [`TotalLagrangianSPHSystem`](@ref)
and the [`BoundarySPHSystem`](@ref) are supported on GPUs.
We have tested this on GPUs by Nvidia, AMD and Apple.
Note that Apple GPUs currently require the development version of
[GPUCompiler.jl](https://github.com/JuliaGPU/GPUCompiler.jl) and most of them
do not support `Float64`.
See [below on how to run single precision simulations](@ref single_precision).

To run a simulation on a GPU, we need to use the [`FullGridCellList`](@ref)
as cell list for the [`GridNeighborhoodSearch`](@ref).
This cell list requires a bounding box for the domain, unlike the default cell list, which
uses an unbounded domain.
For simulations that are bounded by a closed tank, we can use the boundary of the tank
to obtain the bounding box as follows.
For simulations that are bounded by a closed tank, we can simply use the boundary
of the tank to obtain the bounding box as follows.
```jldoctest gpu; output=false, setup=:(using TrixiParticles; trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "hydrostatic_water_column_2d.jl"), sol=nothing))
search_radius = TrixiParticles.compact_support(smoothing_kernel, smoothing_length)
min_corner = minimum(tank.boundary.coordinates, dims=2) .- search_radius
max_corner = maximum(tank.boundary.coordinates, dims=2) .+ search_radius
min_corner = minimum(tank.boundary.coordinates, dims=2)
max_corner = maximum(tank.boundary.coordinates, dims=2)
cell_list = TrixiParticles.PointNeighbors.FullGridCellList(; min_corner, max_corner)

# output
PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.24500000000000002, -0.24500000000000002], [1.245, 1.245])
PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.12500000000000003, -0.12500000000000003], [1.125, 1.125])
```

We then need to pass this cell list to the neighborhood search and the neighborhood search
@@ -55,7 +58,50 @@ On an AMD GPU, we use:
```julia
using AMDGPU
ode = semidiscretize(semi, tspan, data_type=ROCArray)
```
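For context, the step referenced just before this hunk (passing the cell list to the
neighborhood search and the search to the semidiscretization) typically looks like the
following sketch. The actual lines live in the collapsed part of this diff;
`fluid_system` and `boundary_system` come from the example file.
```julia
# Hedged sketch of the collapsed step: pass the cell list to the
# neighborhood search and the search to the semidiscretization.
semi = Semidiscretization(fluid_system, boundary_system,
                          neighborhood_search=GridNeighborhoodSearch{2}(; cell_list))
```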
Then, we can run the simulation as usual.
Now, we can run the simulation as usual.
All data is transferred to the GPU during initialization, and all loops over particles
and their neighbors are executed on the GPU as kernels generated by KernelAbstractions.jl.
Data is only copied to the CPU for saving VTK files via the [`SolutionSavingCallback`](@ref).
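
For completeness, the solve step itself is unchanged; a minimal sketch, assuming the
time integrator and the `callbacks` defined in the example files:
```julia
# Minimal sketch, assuming OrdinaryDiffEq.jl and the `callbacks` from the
# example files (not part of this diff).
using OrdinaryDiffEq

sol = solve(ode, RDPK3SpFSAL35(),
            abstol=1e-6, reltol=1e-4, # tolerances as in the example files
            save_everystep=false, callback=callbacks)
```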

## Run an existing example file on the GPU

The example file `examples/fluid/dam_break_2d_gpu.jl` demonstrates how to run an existing
example file on a GPU.
It first loads the variables from the example file `examples/fluid/dam_break_2d.jl`
without running the simulation by using `trixi_include(..., sol=nothing)` to overwrite
the line that starts the simulation.
Then, a GPU-compatible neighborhood search is defined, and the original example file
is included with the new neighborhood search.
This requires the assignments `neighborhood_search = ...` and `data_type = ...` in the
original example file.
Note that in `examples/fluid/dam_break_2d.jl`, we specifically set `data_type=nothing`, even though
this is the default value, so that we can use `trixi_include` to replace this value.
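
The pattern described above looks roughly as follows (a condensed sketch, not the
verbatim contents of `dam_break_2d_gpu.jl`):
```julia
# Condensed sketch of the pattern described above, not the verbatim file
using TrixiParticles

# Load all variables from the original example without running the simulation
trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
              sol=nothing)

# Define a GPU-compatible neighborhood search with a bounding box around the tank
min_corner = minimum(tank.boundary.coordinates, dims=2)
max_corner = maximum(tank.boundary.coordinates, dims=2)
cell_list = TrixiParticles.PointNeighbors.FullGridCellList(; min_corner, max_corner)

# Include the original example again, overriding the neighborhood search
# and the array type (`data_type` is set by the caller, see below)
trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
              neighborhood_search=GridNeighborhoodSearch{2}(; cell_list),
              data_type=data_type)
```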

To run this simulation on a GPU, all we have to do is change `data_type` to the
array type of the installed GPU.
For example, we can run this simulation on an Nvidia GPU as follows.
```julia
using CUDA
trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), data_type=CuArray)
```

## [Single precision simulations](@id single_precision)

All features that currently support GPUs can also be used with single precision.
This is orders of magnitude faster on most GPUs and is required to run code
on Metal (Apple GPUs).

To run a simulation with single precision, all `Float64` literals in an example file
have to be changed to `Float32` (e.g. `0.0` to `0.0f0`).
TrixiParticles provides a function to do this conveniently:
```@docs
trixi_include_changeprecision
```

All we have to do to run the previous example with single precision is the following.
```julia
using CUDA
trixi_include_changeprecision(Float32,
joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"),
data_type=CuArray)
```
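Internally, this builds on ChangePrecision.jl (the new dependency in `Project.toml`
above), which rewrites floating-point literals at parse time; a minimal sketch of the
underlying idea:
```julia
# Minimal sketch of the idea behind ChangePrecision.jl, not TrixiParticles API:
# every floating-point literal inside the block is parsed as Float32.
using ChangePrecision

@changeprecision Float32 begin
    x = 0.1      # becomes Float32(0.1)
    y = x / 3    # stays Float32
end
```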
5 changes: 3 additions & 2 deletions src/TrixiParticles.jl
@@ -4,6 +4,7 @@ using Reexport: @reexport

using Adapt: Adapt
using Base: @propagate_inbounds
using ChangePrecision: ChangePrecision
using CSV: CSV
using Dates
using DataFrames: DataFrame
@@ -28,7 +29,7 @@ using StaticArrays: @SMatrix, SMatrix, setindex
using StrideArrays: PtrArray, StaticInt
using TimerOutputs: TimerOutput, TimerOutputs, print_timer, reset_timer!
using TrixiBase: trixi_include, @trixi_timeit, timer, timeit_debug_enabled,
disable_debug_timings, enable_debug_timings
disable_debug_timings, enable_debug_timings, TrixiBase
@reexport using PointNeighbors: TrivialNeighborhoodSearch, GridNeighborhoodSearch,
PrecomputedNeighborhoodSearch, PeriodicBox,
ParallelUpdate, SemiParallelUpdate, SerialUpdate
@@ -74,7 +75,7 @@ export BoundaryModelMonaghanKajtar, BoundaryModelDummyParticles, AdamiPressureEx
BernoulliPressureExtrapolation

export BoundaryMovement
export examples_dir, validation_dir, trixi_include
export examples_dir, validation_dir, trixi_include, trixi_include_changeprecision
export trixi2vtk
export RectangularTank, RectangularShape, SphereShape, ComplexShape
export WindingNumberHormann, WindingNumberJacobson
6 changes: 3 additions & 3 deletions src/callbacks/solution_saving.jl
@@ -67,7 +67,7 @@ saving_callback = SolutionSavingCallback(dt=0.1, my_custom_quantity=kinetic_ener
"""
mutable struct SolutionSavingCallback{I, CQ}
interval :: I
save_times :: Array{Float64, 1}
save_times :: Vector{Float64}
save_initial_solution :: Bool
save_final_solution :: Bool
write_meta_data :: Bool
@@ -81,7 +81,7 @@ mutable struct SolutionSavingCallback{I, CQ}
end

function SolutionSavingCallback(; interval::Integer=0, dt=0.0,
save_times=Array{Float64, 1}([]),
save_times=Float64[],
save_initial_solution=true, save_final_solution=true,
output_directory="out", append_timestamp=false,
prefix="", verbose=false, write_meta_data=true,
@@ -99,7 +99,7 @@ function SolutionSavingCallback(; interval::Integer=0, dt=0.0,
output_directory *= string("_", Dates.format(now(), "YY-mm-ddTHHMMSS"))
end

solution_callback = SolutionSavingCallback(interval, save_times,
solution_callback = SolutionSavingCallback(interval, Float64.(save_times),
save_initial_solution, save_final_solution,
write_meta_data, verbose, output_directory,
prefix, max_coordinates, custom_quantities,
2 changes: 1 addition & 1 deletion src/general/corrections.jl
@@ -399,7 +399,7 @@ function correction_matrix_inversion_step!(corr_matrix, system)
# so `L` is singular if and only if the position vectors X_ab don't span the
# full space, i.e., particle a and all neighbors lie on the same line (in 2D)
# or plane (in 3D).
if abs(det(L)) < 1e-9
if abs(det(L)) < 1f-9
L_inv = I
else
L_inv = inv(L)
26 changes: 13 additions & 13 deletions src/schemes/boundary/dummy_particles/dummy_particles.jl
@@ -70,40 +70,40 @@ function BoundaryModelDummyParticles(initial_density, hydrodynamic_mass,
end

@doc raw"""
AdamiPressureExtrapolation(; pressure_offset=0.0)
AdamiPressureExtrapolation(; pressure_offset=0)

`density_calculator` for `BoundaryModelDummyParticles`.

# Keywords
- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure
to prevent penetration, which is possible by increasing this value.
- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure
to prevent penetration, which is possible by increasing this value.

"""
struct AdamiPressureExtrapolation{ELTYPE}
pressure_offset::ELTYPE

function AdamiPressureExtrapolation(; pressure_offset=0.0)
function AdamiPressureExtrapolation(; pressure_offset=0)
return new{eltype(pressure_offset)}(pressure_offset)
end
end

@doc raw"""
BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=1.0)
BernoulliPressureExtrapolation(; pressure_offset=0, factor=1)

`density_calculator` for `BoundaryModelDummyParticles`.

# Keywords
- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure
- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure
to prevent penetration, which is possible by increasing this value.
- `factor=1.0` : Setting `factor` allows to just increase the strength of the dynamic
- `factor=1` : Setting `factor` allows to just increase the strength of the dynamic
pressure part.

"""
struct BernoulliPressureExtrapolation{ELTYPE}
pressure_offset :: ELTYPE
factor :: ELTYPE

function BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=0.0)
function BernoulliPressureExtrapolation(; pressure_offset=0, factor=1)
return new{eltype(pressure_offset)}(pressure_offset, factor)
end
end
@@ -331,7 +331,7 @@ end
# Otherwise, `@threaded` does not work here with Julia ARM on macOS.
# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
@inline function apply_state_equation!(boundary_model, density, particle)
boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0.0)
boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0)
end

function compute_pressure!(boundary_model,
@@ -360,7 +360,7 @@ function compute_pressure!(boundary_model,
# Note: The version iterating neighbors first is not thread parallelizable.
# The factor is based on the achievable speed-up of the thread parallelizable version.
if nparticles(system) >
ceil(Int, 0.5 * Threads.nthreads()) * nparticles(neighbor_system)
ceil(Int, Threads.nthreads() / 2) * nparticles(neighbor_system)
nhs = get_neighborhood_search(neighbor_system, system, semi)

# Loop over fluid particles and then the neighboring boundary particles to extrapolate fluid pressure to the boundaries
@@ -381,7 +381,7 @@ function compute_pressure!(boundary_model,
@threaded system for particle in eachparticle(system)
# Limit pressure to be non-negative to avoid attractive forces between fluid and
# boundary particles at free surfaces (sticking artifacts).
pressure[particle] = max(pressure[particle], 0.0)
pressure[particle] = max(pressure[particle], 0)
end
end

@@ -515,8 +515,8 @@ end
current_velocity(v_neighbor_system, neighbor_system, neighbor)
normal_velocity = dot(relative_velocity, pos_diff)

return 0.5 * boundary_density_calculator.factor * density_neighbor *
normal_velocity^2 / distance
return boundary_density_calculator.factor * density_neighbor *
normal_velocity^2 / distance / 2
end
return zero(density_neighbor)
end
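The recurring literal changes in this file (`0.0` to `0`, `0.5 * x` to `x / 2`) follow
from Julia's promotion rules, which matter for single precision kernels; a quick
illustration:
```julia
# Float64 literals silently promote Float32 values to Float64, which breaks
# single precision GPU kernels; integer literals adapt to the float type.
x = 1.0f0            # Float32
typeof(0.5 * x)      # Float64, promoted by the Float64 literal
typeof(x / 2)        # Float32, the integer preserves the element type
typeof(max(x, 0.0))  # Float64
typeof(max(x, 0))    # Float32
```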
2 changes: 1 addition & 1 deletion src/schemes/solid/total_lagrangian_sph/penalty_force.jl
@@ -35,7 +35,7 @@ end
eps_sum = (J_a + J_b) * initial_pos_diff - 2 * current_pos_diff
delta_sum = dot(eps_sum, current_pos_diff) / current_distance

f = 0.5 * penalty_force.alpha * volume_particle * volume_neighbor *
f = (penalty_force.alpha / 2) * volume_particle * volume_neighbor *
kernel_weight / initial_distance^2 * young_modulus * delta_sum *
current_pos_diff / current_distance

37 changes: 9 additions & 28 deletions src/schemes/solid/total_lagrangian_sph/system.jl
@@ -105,7 +105,7 @@ function TotalLagrangianSPHSystem(initial_condition,

lame_lambda = young_modulus * poisson_ratio /
((1 + poisson_ratio) * (1 - 2 * poisson_ratio))
lame_mu = 0.5 * young_modulus / (1 + poisson_ratio)
lame_mu = (young_modulus / 2) / (1 + poisson_ratio)

return TotalLagrangianSPHSystem(initial_condition, initial_coordinates,
current_coordinates, mass, correction_matrix,
@@ -227,7 +227,7 @@ end
function update_positions!(system::TotalLagrangianSPHSystem, v, u, v_ode, u_ode, semi, t)
(; current_coordinates) = system

for particle in each_moving_particle(system)
@threaded system for particle in each_moving_particle(system)
for i in 1:ndims(system)
current_coordinates[i, particle] = u[i, particle]
end
@@ -313,7 +313,7 @@ end
(; lame_lambda, lame_mu) = system

# Compute the Green-Lagrange strain
E = 0.5 * (transpose(F) * F - I)
E = (transpose(F) * F - I) / 2

return lame_lambda * tr(E) * I + 2 * lame_mu * E
end
@@ -327,25 +327,19 @@ end
function write_u0!(u0, system::TotalLagrangianSPHSystem)
(; initial_condition) = system

for particle in each_moving_particle(system)
# Write particle coordinates
for dim in 1:ndims(system)
u0[dim, particle] = initial_condition.coordinates[dim, particle]
end
end
# This is as fast as a loop with `@inbounds`, but it's GPU-compatible
indices = CartesianIndices((ndims(system), each_moving_particle(system)))
copyto!(u0, indices, initial_condition.coordinates, indices)

return u0
end
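
A self-contained illustration of the `copyto!` pattern above (plain `Array`s here; in a
GPU run, `u0` would be e.g. a `CuArray`):
```julia
# Demo of the `copyto!`-with-`CartesianIndices` pattern: copy the first three
# columns without a scalar indexing loop, which is what makes it GPU-compatible.
u0 = zeros(2, 5)
coords = rand(2, 5)
indices = CartesianIndices((2, 1:3))  # (dimensions, moving particles)
copyto!(u0, indices, coords, indices)
@assert u0[:, 1:3] == coords[:, 1:3]
```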

function write_v0!(v0, system::TotalLagrangianSPHSystem)
(; initial_condition, boundary_model) = system

for particle in each_moving_particle(system)
# Write particle velocities
for dim in 1:ndims(system)
v0[dim, particle] = initial_condition.velocity[dim, particle]
end
end
# This is as fast as a loop with `@inbounds`, but it's GPU-compatible
indices = CartesianIndices((ndims(system), each_moving_particle(system)))
copyto!(v0, indices, initial_condition.velocity, indices)

write_v0!(v0, boundary_model, system)

@@ -356,19 +350,6 @@ function write_v0!(v0, model, system::TotalLagrangianSPHSystem)
return v0
end

function write_v0!(v0, ::BoundaryModelDummyParticles{ContinuityDensity},
system::TotalLagrangianSPHSystem)
(; cache) = system.boundary_model
(; initial_density) = cache

for particle in each_moving_particle(system)
# Set particle densities
v0[ndims(system) + 1, particle] = initial_density[particle]
end

return v0
end

function restart_with!(system::TotalLagrangianSPHSystem, v, u)
for particle in each_moving_particle(system)
system.current_coordinates[:, particle] .= u[:, particle]