diff --git a/Project.toml b/Project.toml index 398268160..f8fd398ba 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.2.4-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +ChangePrecision = "3cb15238-376d-56a3-8042-d33272777c9a" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" @@ -34,6 +35,7 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [compat] Adapt = "3, 4" CSV = "0.10" +ChangePrecision = "1.1.0" DataFrames = "1.6" DelimitedFiles = "1" DiffEqCallbacks = "2, 3, 4" @@ -44,7 +46,7 @@ GPUArraysCore = "0.1, 0.2" JSON = "0.21" KernelAbstractions = "0.9" MuladdMacro = "0.2" -PointNeighbors = "0.4.2" +PointNeighbors = "0.4.6" Polyester = "0.7.10" RecipesBase = "1" Reexport = "1" @@ -52,6 +54,6 @@ SciMLBase = "2" StaticArrays = "1" StrideArrays = "0.1" TimerOutputs = "0.5.25" -TrixiBase = "0.1.3" +TrixiBase = "0.1.5" WriteVTK = "1" julia = "1.9" diff --git a/docs/src/gpu.md b/docs/src/gpu.md index 8e8a12708..71cca0d68 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -1,24 +1,27 @@ -# GPU Support +# [GPU Support](@id gpu_support) GPU support is still an experimental feature that is actively being worked on. -As of now, the [`WeaklyCompressibleSPHSystem`](@ref) and the [`BoundarySPHSystem`](@ref) -are supported on GPUs. -We have tested this on GPUs by Nvidia and AMD. +As of now, the [`WeaklyCompressibleSPHSystem`](@ref), the [`TotalLagrangianSPHSystem`](@ref) +and the [`BoundarySPHSystem`](@ref) are supported on GPUs. +We have tested this on GPUs by Nvidia, AMD and Apple. +Note that Apple GPUs currently require the development version of +[GPUCompiler.jl](https://github.com/JuliaGPU/GPUCompiler.jl) and most of them +do not support `Float64`. +See [below on how to run single precision simulations](@ref single_precision). To run a simulation on a GPU, we need to use the [`FullGridCellList`](@ref) as cell list for the [`GridNeighborhoodSearch`](@ref). This cell list requires a bounding box for the domain, unlike the default cell list, which uses an unbounded domain. -For simulations that are bounded by a closed tank, we can use the boundary of the tank -to obtain the bounding box as follows. +For simulations that are bounded by a closed tank, we can simply use the boundary +of the tank to obtain the bounding box as follows. ```jldoctest gpu; output=false, setup=:(using TrixiParticles; trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "hydrostatic_water_column_2d.jl"), sol=nothing)) -search_radius = TrixiParticles.compact_support(smoothing_kernel, smoothing_length) -min_corner = minimum(tank.boundary.coordinates, dims=2) .- search_radius -max_corner = maximum(tank.boundary.coordinates, dims=2) .+ search_radius +min_corner = minimum(tank.boundary.coordinates, dims=2) +max_corner = maximum(tank.boundary.coordinates, dims=2) cell_list = TrixiParticles.PointNeighbors.FullGridCellList(; min_corner, max_corner) # output -PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.24500000000000002, -0.24500000000000002], [1.245, 1.245]) +PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.12500000000000003, -0.12500000000000003], [1.125, 1.125]) ``` We then need to pass this cell list to the neighborhood search and the neighborhood search @@ -55,7 +58,50 @@ On an AMD GPU, we use: using AMDGPU ode = semidiscretize(semi, tspan, data_type=ROCArray) ``` -Then, we can run the simulation as usual. +Now, we can run the simulation as usual. All data is transferred to the GPU during initialization and all loops over particles and their neighbors will be executed on the GPU as kernels generated by KernelAbstractions.jl. Data is only copied to the CPU for saving VTK files via the [`SolutionSavingCallback`](@ref). + +## Run an existing example file on the GPU + +The example file `examples/fluid/dam_break_2d_gpu.jl` demonstrates how to run an existing +example file on a GPU. +It first loads the variables from the example file `examples/fluid/dam_break_2d.jl` +without running the simulation by overwriting the line that starts the simulation +with `trixi_include(..., sol=nothing)`. +Then, a GPU-compatible neighborhood search is defined, and the original example file +is included with the new neighborhood search. +This requires the assignments `neighborhood_search = ...` and `data_type = ...` in the +original example file. +Note that in `examples/fluid/dam_break_2d.jl`, we specifically set `data_type=nothing`, even though +this is the default value, so that we can use `trixi_include` to replace this value. + +To now run this simulation on a GPU, all we have to do is change `data_type` to the +array type of the installed GPU. +For example, we can run this simulation on an Nvidia GPU as follows. +```julia +using CUDA +trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), data_type=CuArray) +``` + +## [Single precision simulations](@id single_precision) + +All features that currently support GPUs can also be used with single precision. +This is orders of magnitude faster on most GPUs and required to run code +on Metal (Apple GPUs). + +To run a simulation with single precision, all `Float64` literals in an example file +have to be changed to `Float32` (e.g. `0.0` to `0.0f0`). +TrixiParticles provides a function to do this conveniently: +```@docs + trixi_include_changeprecision +``` + +All we have to do to run the previous example with single precision is the following. +```julia +using CUDA +trixi_include_changeprecision(Float32, + joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), + data_type=CuArray) +``` diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index d85096860..dd5f05396 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -4,6 +4,7 @@ using Reexport: @reexport using Adapt: Adapt using Base: @propagate_inbounds +using ChangePrecision: ChangePrecision using CSV: CSV using Dates using DataFrames: DataFrame @@ -28,7 +29,7 @@ using StaticArrays: @SMatrix, SMatrix, setindex using StrideArrays: PtrArray, StaticInt using TimerOutputs: TimerOutput, TimerOutputs, print_timer, reset_timer! using TrixiBase: trixi_include, @trixi_timeit, timer, timeit_debug_enabled, - disable_debug_timings, enable_debug_timings + disable_debug_timings, enable_debug_timings, TrixiBase @reexport using PointNeighbors: TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch, PeriodicBox, ParallelUpdate, SemiParallelUpdate, SerialUpdate @@ -74,7 +75,7 @@ export BoundaryModelMonaghanKajtar, BoundaryModelDummyParticles, AdamiPressureEx BernoulliPressureExtrapolation export BoundaryMovement -export examples_dir, validation_dir, trixi_include +export examples_dir, validation_dir, trixi_include, trixi_include_changeprecision export trixi2vtk export RectangularTank, RectangularShape, SphereShape, ComplexShape export WindingNumberHormann, WindingNumberJacobson diff --git a/src/callbacks/solution_saving.jl b/src/callbacks/solution_saving.jl index 91dc03ff7..95ee8f5c0 100644 --- a/src/callbacks/solution_saving.jl +++ b/src/callbacks/solution_saving.jl @@ -67,7 +67,7 @@ saving_callback = SolutionSavingCallback(dt=0.1, my_custom_quantity=kinetic_ener """ mutable struct SolutionSavingCallback{I, CQ} interval :: I - save_times :: Array{Float64, 1} + save_times :: Vector{Float64} save_initial_solution :: Bool save_final_solution :: Bool write_meta_data :: Bool @@ -81,7 +81,7 @@ mutable struct SolutionSavingCallback{I, CQ} end function SolutionSavingCallback(; interval::Integer=0, dt=0.0, - save_times=Array{Float64, 1}([]), + save_times=Float64[], save_initial_solution=true, save_final_solution=true, output_directory="out", append_timestamp=false, prefix="", verbose=false, write_meta_data=true, @@ -99,7 +99,7 @@ function SolutionSavingCallback(; interval::Integer=0, dt=0.0, output_directory *= string("_", Dates.format(now(), "YY-mm-ddTHHMMSS")) end - solution_callback = SolutionSavingCallback(interval, save_times, + solution_callback = SolutionSavingCallback(interval, Float64.(save_times), save_initial_solution, save_final_solution, write_meta_data, verbose, output_directory, prefix, max_coordinates, custom_quantities, diff --git a/src/general/corrections.jl b/src/general/corrections.jl index b1c66a06a..7072ecef1 100644 --- a/src/general/corrections.jl +++ b/src/general/corrections.jl @@ -399,7 +399,7 @@ function correction_matrix_inversion_step!(corr_matrix, system) # so `L` is singular if and only if the position vectors X_ab don't span the # full space, i.e., particle a and all neighbors lie on the same line (in 2D) # or plane (in 3D). - if abs(det(L)) < 1e-9 + if abs(det(L)) < 1f-9 L_inv = I else L_inv = inv(L) diff --git a/src/schemes/boundary/dummy_particles/dummy_particles.jl b/src/schemes/boundary/dummy_particles/dummy_particles.jl index 8b7e507ed..e89c093de 100644 --- a/src/schemes/boundary/dummy_particles/dummy_particles.jl +++ b/src/schemes/boundary/dummy_particles/dummy_particles.jl @@ -70,32 +70,32 @@ function BoundaryModelDummyParticles(initial_density, hydrodynamic_mass, end @doc raw""" - AdamiPressureExtrapolation(; pressure_offset=0.0) + AdamiPressureExtrapolation(; pressure_offset=0) `density_calculator` for `BoundaryModelDummyParticles`. # Keywords -- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure - to prevent penetration, which is possible by increasing this value. +- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure + to prevent penetration, which is possible by increasing this value. """ struct AdamiPressureExtrapolation{ELTYPE} pressure_offset::ELTYPE - function AdamiPressureExtrapolation(; pressure_offset=0.0) + function AdamiPressureExtrapolation(; pressure_offset=0) return new{eltype(pressure_offset)}(pressure_offset) end end @doc raw""" - BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=1.0) + BernoulliPressureExtrapolation(; pressure_offset=0, factor=1) `density_calculator` for `BoundaryModelDummyParticles`. # Keywords -- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure +- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure to prevent penetration, which is possible by increasing this value. -- `factor=1.0` : Setting `factor` allows to just increase the strength of the dynamic +- `factor=1` : Setting `factor` allows to just increase the strength of the dynamic pressure part. """ @@ -103,7 +103,7 @@ struct BernoulliPressureExtrapolation{ELTYPE} pressure_offset :: ELTYPE factor :: ELTYPE - function BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=0.0) + function BernoulliPressureExtrapolation(; pressure_offset=0, factor=1) return new{eltype(pressure_offset)}(pressure_offset, factor) end end @@ -331,7 +331,7 @@ end # Otherwise, `@threaded` does not work here with Julia ARM on macOS. # See https://github.com/JuliaSIMD/Polyester.jl/issues/88. @inline function apply_state_equation!(boundary_model, density, particle) - boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0.0) + boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0) end function compute_pressure!(boundary_model, @@ -360,7 +360,7 @@ function compute_pressure!(boundary_model, # Note: The version iterating neighbors first is not thread parallelizable. # The factor is based on the achievable speed-up of the thread parallelizable version. if nparticles(system) > - ceil(Int, 0.5 * Threads.nthreads()) * nparticles(neighbor_system) + ceil(Int, Threads.nthreads() / 2) * nparticles(neighbor_system) nhs = get_neighborhood_search(neighbor_system, system, semi) # Loop over fluid particles and then the neighboring boundary particles to extrapolate fluid pressure to the boundaries @@ -381,7 +381,7 @@ function compute_pressure!(boundary_model, @threaded system for particle in eachparticle(system) # Limit pressure to be non-negative to avoid attractive forces between fluid and # boundary particles at free surfaces (sticking artifacts). - pressure[particle] = max(pressure[particle], 0.0) + pressure[particle] = max(pressure[particle], 0) end end @@ -515,8 +515,8 @@ end current_velocity(v_neighbor_system, neighbor_system, neighbor) normal_velocity = dot(relative_velocity, pos_diff) - return 0.5 * boundary_density_calculator.factor * density_neighbor * - normal_velocity^2 / distance + return boundary_density_calculator.factor * density_neighbor * + normal_velocity^2 / distance / 2 end return zero(density_neighbor) end diff --git a/src/schemes/solid/total_lagrangian_sph/penalty_force.jl b/src/schemes/solid/total_lagrangian_sph/penalty_force.jl index 04f3d9890..f8bf670c3 100644 --- a/src/schemes/solid/total_lagrangian_sph/penalty_force.jl +++ b/src/schemes/solid/total_lagrangian_sph/penalty_force.jl @@ -35,7 +35,7 @@ end eps_sum = (J_a + J_b) * initial_pos_diff - 2 * current_pos_diff delta_sum = dot(eps_sum, current_pos_diff) / current_distance - f = 0.5 * penalty_force.alpha * volume_particle * volume_neighbor * + f = (penalty_force.alpha / 2) * volume_particle * volume_neighbor * kernel_weight / initial_distance^2 * young_modulus * delta_sum * current_pos_diff / current_distance diff --git a/src/schemes/solid/total_lagrangian_sph/system.jl b/src/schemes/solid/total_lagrangian_sph/system.jl index dfd71a2d6..e39cd9a9c 100644 --- a/src/schemes/solid/total_lagrangian_sph/system.jl +++ b/src/schemes/solid/total_lagrangian_sph/system.jl @@ -105,7 +105,7 @@ function TotalLagrangianSPHSystem(initial_condition, lame_lambda = young_modulus * poisson_ratio / ((1 + poisson_ratio) * (1 - 2 * poisson_ratio)) - lame_mu = 0.5 * young_modulus / (1 + poisson_ratio) + lame_mu = (young_modulus / 2) / (1 + poisson_ratio) return TotalLagrangianSPHSystem(initial_condition, initial_coordinates, current_coordinates, mass, correction_matrix, @@ -227,7 +227,7 @@ end function update_positions!(system::TotalLagrangianSPHSystem, v, u, v_ode, u_ode, semi, t) (; current_coordinates) = system - for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) for i in 1:ndims(system) current_coordinates[i, particle] = u[i, particle] end @@ -313,7 +313,7 @@ end (; lame_lambda, lame_mu) = system # Compute the Green-Lagrange strain - E = 0.5 * (transpose(F) * F - I) + E = (transpose(F) * F - I) / 2 return lame_lambda * tr(E) * I + 2 * lame_mu * E end @@ -327,12 +327,9 @@ end function write_u0!(u0, system::TotalLagrangianSPHSystem) (; initial_condition) = system - for particle in each_moving_particle(system) - # Write particle coordinates - for dim in 1:ndims(system) - u0[dim, particle] = initial_condition.coordinates[dim, particle] - end - end + # This is as fast as a loop with `@inbounds`, but it's GPU-compatible + indices = CartesianIndices((ndims(system), each_moving_particle(system))) + copyto!(u0, indices, initial_condition.coordinates, indices) return u0 end @@ -340,12 +337,9 @@ end function write_v0!(v0, system::TotalLagrangianSPHSystem) (; initial_condition, boundary_model) = system - for particle in each_moving_particle(system) - # Write particle velocities - for dim in 1:ndims(system) - v0[dim, particle] = initial_condition.velocity[dim, particle] - end - end + # This is as fast as a loop with `@inbounds`, but it's GPU-compatible + indices = CartesianIndices((ndims(system), each_moving_particle(system))) + copyto!(v0, indices, initial_condition.velocity, indices) write_v0!(v0, boundary_model, system) @@ -356,19 +350,6 @@ function write_v0!(v0, model, system::TotalLagrangianSPHSystem) return v0 end -function write_v0!(v0, ::BoundaryModelDummyParticles{ContinuityDensity}, - system::TotalLagrangianSPHSystem) - (; cache) = system.boundary_model - (; initial_density) = cache - - for particle in each_moving_particle(system) - # Set particle densities - v0[ndims(system) + 1, particle] = initial_density[particle] - end - - return v0 -end - function restart_with!(system::TotalLagrangianSPHSystem, v, u) for particle in each_moving_particle(system) system.current_coordinates[:, particle] .= u[:, particle] diff --git a/src/util.jl b/src/util.jl index 077654a06..11e0b10a3 100644 --- a/src/util.jl +++ b/src/util.jl @@ -134,3 +134,48 @@ function compute_git_hash() return "UnknownVersion" end end + +""" + trixi_include_changeprecision(T, [mod::Module=Main,] file::AbstractString; kwargs...) + +`include` the file `file` and evaluate its content in the global scope of module `mod`. +You can override specific assignments in `elixir` by supplying keyword arguments, +similar to [`trixi_include`](@ref). + +The only difference to [`trixi_include`](@ref) is that the precision of floating-point +numbers in the included file is changed to `T`. +More precisely, the package [ChangePrecision.jl](https://github.com/JuliaMath/ChangePrecision.jl) +is used to convert all `Float64` literals, operations like `/` that produce `Float64` results, +and functions like `ones` that return `Float64` arrays by default, to the desired type `T`. +See the documentation of ChangePrecision.jl for more details. + +The purpose of this function is to conveniently run a full simulation with `Float32`, +which is orders of magnitude faster on most GPUs than `Float64`, by just including +the simulation file with `trixi_include_changeprecision(Float32, file)`. +TrixiParticles code is written in a way that changing all floating-point numbers in the +example file to `Float32` manually will run the full simulation with single precision. + +See [the docs on GPU support](@ref gpu_support) for more information. +""" +function trixi_include_changeprecision(T, mod::Module, filename::AbstractString; kwargs...) + trixi_include(expr -> ChangePrecision.changeprecision(T, replace_trixi_include(T, expr)), + mod, filename; kwargs...) +end + +function trixi_include_changeprecision(T, filename::AbstractString; kwargs...) + trixi_include_changeprecision(T, Main, filename; kwargs...) +end + +function replace_trixi_include(T, expr) + expr = TrixiBase.walkexpr(expr) do x + if x isa Expr + if x.head === :call && x.args[1] === :trixi_include + x.args[1] = :trixi_include_changeprecision + insert!(x.args, 2, :($T)) + end + end + return x + end + + return expr +end diff --git a/src/visualization/recipes_plots.jl b/src/visualization/recipes_plots.jl index a14de2948..ef02ea636 100644 --- a/src/visualization/recipes_plots.jl +++ b/src/visualization/recipes_plots.jl @@ -5,20 +5,45 @@ const TrixiParticlesODESolution = ODESolution{<:Any, <:Any, <:Any, <:Any, <:Any, <:Semidiscretization}} RecipesBase.@recipe function f(sol::TrixiParticlesODESolution) - # Redirect everything to the recipe + # Redirect everything to the next recipe return sol.u[end].x..., sol.prob.p end +# GPU version +RecipesBase.@recipe function f(sol::TrixiParticlesODESolution, semi::Semidiscretization) + # Move GPU data to the CPU + v_ode = Array(sol.u[end].x[1]) + u_ode = Array(sol.u[end].x[2]) + semi_ = Adapt.adapt(Array, sol.prob.p) + + # Redirect everything to the next recipe + return v_ode, u_ode, semi_, particle_spacings(semi) +end + +RecipesBase.@recipe function f(v_ode::AbstractGPUArray, u_ode, semi::Semidiscretization; + particle_spacings=nothing, + size=(600, 400), # Default size + xlims=(-Inf, Inf), ylims=(-Inf, Inf)) + throw(ArgumentError("to plot GPU data, use `plot(sol, semi)`")) +end + RecipesBase.@recipe function f(v_ode, u_ode, semi::Semidiscretization; + particle_spacings=TrixiParticles.particle_spacings(semi), size=(600, 400), # Default size xlims=(-Inf, Inf), ylims=(-Inf, Inf)) - systems_data = map(semi.systems) do system + return v_ode, u_ode, semi, particle_spacings +end + +RecipesBase.@recipe function f(v_ode, u_ode, semi::Semidiscretization, particle_spacings; + size=(600, 400), # Default size + xlims=(-Inf, Inf), ylims=(-Inf, Inf)) + systems_data = map(enumerate(semi.systems)) do (i, system) u = wrap_u(u_ode, system, semi) coordinates = active_coordinates(u, system) x = collect(coordinates[1, :]) y = collect(coordinates[2, :]) - particle_spacing = system.initial_condition.particle_spacing + particle_spacing = particle_spacings[i] if particle_spacing < 0 particle_spacing = 0.0 end @@ -41,6 +66,10 @@ RecipesBase.@recipe function f(v_ode, u_ode, semi::Semidiscretization; return (semi, systems_data...) end +function particle_spacings(semi::Semidiscretization) + return [system.initial_condition.particle_spacing for system in semi.systems] +end + RecipesBase.@recipe function f((initial_conditions::InitialCondition)...; xlims=(Inf, Inf), ylims=(Inf, Inf)) idx = 0 diff --git a/test/examples/examples.jl b/test/examples/examples.jl index b80e18353..20f9e81a6 100644 --- a/test/examples/examples.jl +++ b/test/examples/examples.jl @@ -189,6 +189,18 @@ @test count_rhs_allocations(sol, semi) == 0 end end + + @testset "Float32" begin + @test_nowarn_mod trixi_include_changeprecision(Float32, + @__MODULE__, + joinpath(examples_dir(), + "fluid", + "dam_break_2d.jl"), + tspan=(0, 0.1)) + @test sol.retcode == ReturnCode.Success + @test count_rhs_allocations(sol, semi) == 0 + @test eltype(sol) == Float32 + end end @trixi_testset "fluid/dam_break_oil_film_2d.jl" begin