diff --git a/doc/sphinx/lb.rst b/doc/sphinx/lb.rst index 9f1f018969..a1d4e7388e 100644 --- a/doc/sphinx/lb.rst +++ b/doc/sphinx/lb.rst @@ -397,6 +397,39 @@ of the first LB GPU instance:: system.cuda_init_handle.call_method("set_device_id_per_rank") +Due to padding, the memory footprint of the GPU fields is not a linear function +of the grid size. Instead, it is a step function of the size along the x-direction +of the rank-local LB domain. +For illustration, a local LB domain with dimensions 64x256x256 will take as +much VRAM as a domain with size 127x256x256 in single-precision mode. +As a rule of thumb, the VRAM in GiB per rank-local LB domain will be: + +.. math:: + + \label{eq:lb_vram} + f(n_x, n_y, n_z) = + \begin{cases} + \left\lceil n_x / 64 \right\rceil \cdot 64 \cdot n_y \cdot n_z \cdot 204 / 1024^3 + & \text{(in single-precision)}\\ + \left\lceil n_x / 32 \right\rceil \cdot 32 \cdot n_y \cdot n_z \cdot 410 / 1024^3 + & \text{(in double-precision)} + \end{cases} + +with :math:`n_x`, :math:`n_y`, :math:`n_z` the LB domain size in agrid units, including the ghost layer. + +Regarding communication between GPUs, for optimal performance the MPI topology +should divide the z-direction first, the y-direction second, and the x-direction +last, i.e. ascending order of the prime factors. Please note the default MPI +Cartesian grid in |es| is sorted in descending order of the prime factors, +and leads to poor performance. For illustration, a Cartesian grid with +shape ``[1, 1, 8]`` yields 94% weak scaling efficiency, +shape ``[8, 1, 1]`` yields 90%, +shape ``[1, 2, 4]`` yields 88%, +shape ``[4, 2, 1]`` yields 86%, +shape ``[2, 2, 2]`` yields 81%. +This is assuming 1 GPU per CPU. Using more than 1 CPU per GPU or more +than 1 GPU per CPU can degrade weak scaling efficiency further. + .. 
_Electrohydrodynamics: Electrohydrodynamics diff --git a/maintainer/walberla_kernels/generate_lb_kernels.py b/maintainer/walberla_kernels/generate_lb_kernels.py index bef9badc54..f9e93f16a1 100644 --- a/maintainer/walberla_kernels/generate_lb_kernels.py +++ b/maintainer/walberla_kernels/generate_lb_kernels.py @@ -221,6 +221,7 @@ def patch_file(class_name, extension, target_suffix, patch): def patch_packinfo_header(content, target_suffix): if target_suffix in ["", "AVX"]: + # fix MPI buffer memory alignment token = "\n //TODO: optimize by generating kernel for this case\n" assert token in content content = content.replace(token, "\n") @@ -232,6 +233,7 @@ def patch_packinfo_header(content, target_suffix): assert token in content content = content.replace(token, f"{token[:-1]} + sizeof({ft}))") elif target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas token = "#define FUNC_PREFIX __global__" assert token in content content = content.replace(token, "") @@ -240,16 +242,18 @@ def patch_packinfo_kernel(content, target_suffix): def patch_packinfo_kernel(content, target_suffix): if target_suffix in ["", "AVX"]: - # fix MPI buffer + # fix MPI buffer memory alignment m = re.search("(float|double) *\* *buffer = reinterpret_cast<(?:float|double) *\*>\(byte_buffer\);\n", content) # nopep8 assert m is not None content = content.replace(m.group(0), f"byte_buffer += sizeof({m.group(1)}) - (reinterpret_cast<std::size_t>(byte_buffer) - (reinterpret_cast<std::size_t>(byte_buffer) / sizeof({m.group(1)})) * sizeof({m.group(1)}));\n {m.group(0)}") # nopep8 if target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas token = "#define FUNC_PREFIX __global__" assert token in content push, _ = custom_additional_extensions.generate_device_preprocessor( "packinfo", defines=("RESTRICT",)) content = content.replace(token, f"{token}\n{push}") + # add missing includes token = '#include "PackInfo' assert token in content content = content.replace(token, f'#include "core/DataTypes.h"\n#include 
"core/cell/CellInterval.h"\n#include "domain_decomposition/IBlock.h"\n#include "stencil/Directions.h"\n\n{token}') # nopep8 @@ -276,10 +280,12 @@ def patch_packinfo_kernel(content, target_suffix): # pylint: disable=unused-argument def patch_boundary_header(content, target_suffix): + # replace real_t by actual floating-point type return content.replace("real_t", config.data_type.default_factory().c_name) # nopep8 def patch_boundary_kernel(content, target_suffix): if target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas push, pop = custom_additional_extensions.generate_device_preprocessor( "ubb_boundary", defines=("RESTRICT",)) content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif(?=\n\n|\n//)", "", content) # nopep8