diff --git a/doc/sphinx/lb.rst b/doc/sphinx/lb.rst index 9f1f018969..a1d4e7388e 100644 --- a/doc/sphinx/lb.rst +++ b/doc/sphinx/lb.rst @@ -397,6 +397,39 @@ of the first LB GPU instance:: system.cuda_init_handle.call_method("set_device_id_per_rank") +Due to padding, the memory footprint of the GPU fields is not a linear function +of the grid size. Instead, it is a step function of the size along the x-direction +of the rank-local LB domain. +For illustration, a local LB domain with dimensions 64x256x256 will take as +much VRAM as a domain with size 127x256x256 in single-precision mode. +As a rule of thumb, the VRAM in GiB per rank-local LB domain will be: + +.. math:: + + \label{eq:lb_vram} + f(n_x, n_y, n_z) = + \begin{cases} + \left\lceil n_x / 64 \right\rceil \cdot 64 \cdot n_y \cdot n_z \cdot 204 / 1024^3 + & \text{(in single-precision)}\\ + \left\lceil n_x / 32 \right\rceil \cdot 32 \cdot n_y \cdot n_z \cdot 410 / 1024^3 + & \text{(in double-precision)} + \end{cases} + +with :math:`n_x`, :math:`n_y`, :math:`n_z` the LB domain size in agrid units, including the ghost layer. + +Regarding communication between GPUs, for optimal performance the MPI topology +should divide the z-direction first, the y-direction second, and the x-direction +last, i.e. ascending order of the prime factors. Please note the default MPI +Cartesian grid in |es| is sorted in descending order of the prime factors, +and leads to poor performance. For illustration, a Cartesian grid with +shape ``[1, 1, 8]`` yields 94% weak scaling efficiency, +shape ``[8, 1, 1]`` yields 90%, +shape ``[1, 2, 4]`` yields 88%, +shape ``[4, 2, 1]`` yields 86%, +shape ``[2, 2, 2]`` yields 81%. +This is assuming 1 GPU per CPU. Using more than 1 CPU per GPU or more +than 1 GPU per CPU can degrade weak scaling efficiency further. + .. 
_Electrohydrodynamics: Electrohydrodynamics diff --git a/maintainer/walberla_kernels/generate_lb_kernels.py b/maintainer/walberla_kernels/generate_lb_kernels.py index bef9badc54..f9e93f16a1 100644 --- a/maintainer/walberla_kernels/generate_lb_kernels.py +++ b/maintainer/walberla_kernels/generate_lb_kernels.py @@ -221,6 +221,7 @@ def patch_file(class_name, extension, target_suffix, patch): def patch_packinfo_header(content, target_suffix): if target_suffix in ["", "AVX"]: + # fix MPI buffer memory alignment token = "\n //TODO: optimize by generating kernel for this case\n" assert token in content content = content.replace(token, "\n") @@ -232,6 +233,7 @@ def patch_packinfo_header(content, target_suffix): assert token in content content = content.replace(token, f"{token[:-1]} + sizeof({ft}))") elif target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas token = "#define FUNC_PREFIX __global__" assert token in content content = content.replace(token, "") @@ -240,16 +242,18 @@ def patch_packinfo_kernel(content, target_suffix): def patch_packinfo_kernel(content, target_suffix): if target_suffix in ["", "AVX"]: - # fix MPI buffer + # fix MPI buffer memory alignment m = re.search("(float|double) *\* *buffer = reinterpret_cast<(?:float|double) *\*>\(byte_buffer\);\n", content) # nopep8 assert m is not None content = content.replace(m.group(0), f"byte_buffer += sizeof({m.group(1)}) - (reinterpret_cast<std::size_t>(byte_buffer) - (reinterpret_cast<std::size_t>(byte_buffer) / sizeof({m.group(1)})) * sizeof({m.group(1)}));\n {m.group(0)}") # nopep8 if target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas token = "#define FUNC_PREFIX __global__" assert token in content push, _ = custom_additional_extensions.generate_device_preprocessor( "packinfo", defines=("RESTRICT",)) content = content.replace(token, f"{token}\n{push}") + # add missing includes token = '#include "PackInfo' assert token in content content = content.replace(token, f'#include "core/DataTypes.h"\n#include 
"core/cell/CellInterval.h"\n#include "domain_decomposition/IBlock.h"\n#include "stencil/Directions.h"\n\n{token}') # nopep8 @@ -276,10 +280,12 @@ def patch_packinfo_kernel(content, target_suffix): # pylint: disable=unused-argument def patch_boundary_header(content, target_suffix): + # replace real_t by actual floating-point type return content.replace("real_t", config.data_type.default_factory().c_name) # nopep8 def patch_boundary_kernel(content, target_suffix): if target_suffix in ["CUDA"]: + # replace preprocessor macros and pragmas push, pop = custom_additional_extensions.generate_device_preprocessor( "ubb_boundary", defines=("RESTRICT",)) content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif(?=\n\n|\n//)", "", content) # nopep8