Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement multi-GPU LB #5007

Merged
merged 2 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions doc/sphinx/lb.rst
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,39 @@ of the first LB GPU instance::

system.cuda_init_handle.call_method("set_device_id_per_rank")

Due to padding, the memory footprint of the GPU fields is not a linear function
of the grid size. Instead, it is a step function of the size along the x-direction
of the rank-local LB domain.
For illustration, a local LB domain with dimensions 65x256x256 will take as
much VRAM as a domain with size 128x256x256 in single-precision mode.
As a rule of thumb, the VRAM in GiB per rank-local LB domain will be:

.. math::

\label{eq:lb_vram}
f(n_x, n_y, n_z) =
\begin{cases}
\left\lceil n_x / 64 \right\rceil \cdot 64 \cdot n_y \cdot n_z \cdot 204 / 1024^3
& \text{(in single-precision)}\\
\left\lceil n_x / 32 \right\rceil \cdot 32 \cdot n_y \cdot n_z \cdot 410 / 1024^3
& \text{(in double-precision)}
\end{cases}

with :math:`n_x`, :math:`n_y`, :math:`n_z` the LB domain size in agrid units, including the ghost layer.

Regarding communication between GPUs, for optimal performance the MPI topology
should divide the z-direction first, the y-direction second, and the x-direction
last, i.e. ascending order of the prime factors. Please note the default MPI
Cartesian grid in |es| is sorted in descending order of the prime factors,
and leads to poor performance. For illustration, a Cartesian grid with
shape ``[1, 1, 8]`` yields 94% weak scaling efficiency,
shape ``[8, 1, 1]`` yields 90%,
shape ``[1, 2, 4]`` yields 88%,
shape ``[4, 2, 1]`` yields 86%,
shape ``[2, 2, 2]`` yields 81%.
This is assuming 1 GPU per CPU. Using more than 1 CPU per GPU or more
than 1 GPU per CPU can degrade weak scaling efficiency further.

.. _Electrohydrodynamics:

Electrohydrodynamics
Expand Down
19 changes: 13 additions & 6 deletions maintainer/benchmarks/lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
parser.add_argument("--particles_per_core", metavar="N", action="store",
type=int, default=125, required=False,
help="Number of particles per core")
parser.add_argument("--box_l", action="store",
parser.add_argument("--box_l", action="store", nargs="+",
type=int, default=argparse.SUPPRESS, required=False,
help="Box length (cubic box)")
parser.add_argument("--lb_sites_per_particle", metavar="N_LB", action="store",
Expand All @@ -45,6 +45,8 @@
help="Using single-precision floating point accuracy")
parser.add_argument("--gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use GPU implementation")
parser.add_argument("--multi-gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use multi-GPU implementation")
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
Expand Down Expand Up @@ -83,9 +85,9 @@
n_proc = system.cell_system.get_state()["n_nodes"]
n_part = n_proc * args.particles_per_core
if n_part == 0:
box_l = args.box_l
box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
agrid = 1.
lb_grid = args.box_l
lb_grid = box_l
measurement_steps = 80
else:
# volume of N spheres with radius r: N * (4/3*pi*r^3)
Expand All @@ -96,13 +98,16 @@
agrid = box_l / lb_grid
measurement_steps = max(50, int(120**3 / lb_grid**3))
measurement_steps = 40
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

print(f"LB shape: [{lb_grid}, {lb_grid}, {lb_grid}]")
print(f"box length: {box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

# System
#############################################################
system.box_l = 3 * (box_l,)
system.box_l = box_l

# Integration parameters
#############################################################
Expand Down Expand Up @@ -135,8 +140,10 @@
# LB fluid setup
#############################################################
lb_class = espressomd.lb.LBFluidWalberla
if args.gpu:
if args.gpu or args.multi_gpu:
lb_class = espressomd.lb.LBFluidWalberlaGPU
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
system.lb = lbf
Expand Down
1 change: 0 additions & 1 deletion maintainer/walberla_kernels/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ generate_lb_kernels --single-precision
generate_lb_kernels --gpu
generate_lb_kernels --gpu --single-precision
format_lb_kernels
git diff src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_*CUDA*.cu # verify pragmas

# EK kernels
cd $(git rev-parse --show-toplevel)/src/walberla_bridge/src/electrokinetics/generated_kernels/
Expand Down
45 changes: 45 additions & 0 deletions maintainer/walberla_kernels/custom_additional_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,3 +349,48 @@ def generate_kernel_selector(
"templates/ReactionKernelSelector.tmpl.h").render(**context)

generation_context.write_file(f"{class_name}_all.h", header)


def generate_device_preprocessor(kernel, defines=()):
    """
    Generate device preprocessor directives.

    Renders ``templates/preprocessor.tmpl.cuh`` with the warning-suppression
    pragmas of the requested ``kernel`` and a per-compiler macro table, and
    returns the rendered sections (the text between ``/* section */``
    separators, skipping everything before the first separator).
    """
    # NVCC suppression shared by both kernels.
    nvcc_unused = ["diag_suppress 177 // unused variable"]
    # Host/device warning lists per kernel family.
    packinfo_warnings = ["-Wunused-variable"]
    ubb_clang_warnings = ["-Wstrict-aliasing", "-Wunused-variable", "-Wconversion", "-Wsign-compare"]  # nopep8
    ubb_gcc_warnings = ["-Wstrict-aliasing", "-Wunused-variable", "-Wconversion"]  # nopep8

    pragmas = {
        "packinfo": {
            "nvcc": list(nvcc_unused),
            "clang_host": list(packinfo_warnings),
            "clang_dev": list(packinfo_warnings),
            "gcc": list(packinfo_warnings),
        },
        "ubb_boundary": {
            "nvcc": list(nvcc_unused),
            "clang_host": list(ubb_clang_warnings),
            "clang_dev": list(ubb_clang_warnings),
            "gcc": list(ubb_gcc_warnings),
        },
    }

    # Per-compiler expansions of the RESTRICT/FUNC_PREFIX placeholders.
    gnu_restrict = "__restrict__"
    defines_table = {
        "nvcc": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": "__global__"},
        "msvc": {"RESTRICT": "__restrict", "FUNC_PREFIX": ""},
        "clang_host": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": ""},
        "clang_dev": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": "__global__"},
        "gcc": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": ""},
        "other": {"RESTRICT": "", "FUNC_PREFIX": ""},
    }

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(pathlib.Path(__file__).parent),
        undefined=jinja2.StrictUndefined,
    )
    template = env.get_template("templates/preprocessor.tmpl.cuh")
    rendered = template.render(
        pragmas=pragmas[kernel],
        defines_table=defines_table,
        defines=defines,
    )

    # Drop the preamble before the first separator, keep the sections.
    _, *sections = rendered.split("\n/* section */\n")
    return sections
122 changes: 95 additions & 27 deletions maintainer/walberla_kernels/generate_lb_kernels.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2020-2023 The ESPResSo project
# Copyright (C) 2020-2024 The ESPResSo project
#
# This file is part of ESPResSo.
#
Expand All @@ -17,6 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import re
import argparse
import packaging.specifiers

Expand All @@ -40,6 +41,7 @@
import relaxation_rates
import walberla_lbm_generation
import code_generation_context
import custom_additional_extensions

parser = argparse.ArgumentParser(description="Generate the waLBerla kernels.")
parser.add_argument("--single-precision", action="store_true", required=False,
Expand All @@ -65,6 +67,24 @@ def paramlist(parameters, keys):
yield parameters[key]


def get_ext_header(target_suffix):
    """Return the header file extension for a code generation target.

    Headers use ``.h`` for every target, CUDA included.
    """
    return "h"


def get_ext_source(target_suffix):
    """Return the source file extension for a code generation target.

    CUDA sources use ``.cu``; all CPU targets use ``.cpp``.
    """
    return "cu" if target_suffix == "CUDA" else "cpp"


def patch_file(class_name, extension, target_suffix, patch):
    """Rewrite ``<class_name>.<extension>`` in place through ``patch``.

    ``patch`` is called with the file contents and ``target_suffix`` and must
    return the new contents; the file is only rewritten when the patch
    actually changed something.
    """
    filename = f"{class_name}.{extension}"
    with open(filename, "r+") as handle:
        original = handle.read()
        patched = patch(original, target_suffix)
        if patched == original:
            return
        handle.seek(0)
        handle.truncate()
        handle.write(patched)


with code_generation_context.CodeGeneration() as ctx:
ctx.double_accuracy = not args.single_precision
if target == ps.Target.GPU:
Expand Down Expand Up @@ -196,44 +216,92 @@ def paramlist(parameters, keys):
# generate PackInfo
assignments = pystencils_espresso.generate_pack_info_pdfs_field_assignments(
fields, streaming_pattern="pull")
spec = pystencils_espresso.generate_pack_info_vector_field_specifications(
spec = pystencils_espresso.generate_pack_info_field_specifications(
config, stencil, force_field.layout)
for params, target_suffix in paramlist(parameters, ["CPU"]):

def patch_packinfo_header(content, target_suffix):
    """
    Patch a generated PackInfo header (pure string transform).

    CPU targets ("" and "AVX"): drop the generated TODO comment and enlarge
    the MPI pack/unpack buffer requests by one floating-point element; the
    spare element leaves room for the pointer-alignment shift that
    ``patch_packinfo_kernel`` inserts into the matching kernel source.
    CUDA target: blank out the ``FUNC_PREFIX`` macro definition and remove
    the ``__GNUC__``-guarded preprocessor block.
    Asserts fire if the expected tokens are absent, i.e. if the code
    generator output changed shape.
    """
    if target_suffix in ["", "AVX"]:
        # fix MPI buffer memory alignment
        token = "\n //TODO: optimize by generating kernel for this case\n"
        assert token in content
        content = content.replace(token, "\n")
        # infer the floating-point type from the generated class name
        ft = "float" if "SinglePrecision" in content else "double"
        # reserve one extra element: forward(dataSize) -> forward(dataSize + sizeof(ft))
        token = " pack(dir, outBuffer.forward(dataSize)"
        assert token in content
        content = content.replace(token, f"{token[:-1]} + sizeof({ft}))")
        # same for the receive side: skip(dataSize) -> skip(dataSize + sizeof(ft))
        token = " unpack(dir, buffer.skip(dataSize)"
        assert token in content
        content = content.replace(token, f"{token[:-1]} + sizeof({ft}))")
    elif target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        token = "#define FUNC_PREFIX __global__"
        assert token in content
        content = content.replace(token, "")
        # strip the whole "#ifdef __GNUC__ ... #endif" block (non-greedy)
        content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif\n\n", "", content)
    return content

def patch_packinfo_kernel(content, target_suffix):
    """
    Patch a generated PackInfo kernel source (pure string transform).

    CPU targets ("" and "AVX"): insert an alignment shift before the
    ``reinterpret_cast`` of the raw MPI byte buffer. The shift advances
    ``byte_buffer`` by ``sizeof(T) - (addr mod sizeof(T))`` bytes, i.e.
    between 1 and sizeof(T) bytes, which is why the header patch reserves one
    extra element in the buffer (see ``patch_packinfo_header``).
    CUDA target: append generated warning-suppression pragmas after the
    ``FUNC_PREFIX`` macro and prepend the waLBerla includes the generator
    omits. Asserts fire if the expected tokens are absent.
    """
    if target_suffix in ["", "AVX"]:
        # fix MPI buffer memory alignment
        # NOTE(review): non-raw pattern relies on "\*" and "\(" surviving
        # as-is in a plain string literal — consider an r-string.
        m = re.search("(float|double) *\* *buffer = reinterpret_cast<(?:float|double) *\*>\(byte_buffer\);\n", content) # nopep8
        assert m is not None
        content = content.replace(m.group(0), f"byte_buffer += sizeof({m.group(1)}) - (reinterpret_cast<std::size_t>(byte_buffer) - (reinterpret_cast<std::size_t>(byte_buffer) / sizeof({m.group(1)})) * sizeof({m.group(1)}));\n {m.group(0)}") # nopep8
    if target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        token = "#define FUNC_PREFIX __global__"
        assert token in content
        # keep the macro, append the generated push pragmas right after it
        push, _ = custom_additional_extensions.generate_device_preprocessor(
            "packinfo", defines=("RESTRICT",))
        content = content.replace(token, f"{token}\n{push}")
        # add missing includes
        token = '#include "PackInfo'
        assert token in content
        content = content.replace(token, f'#include "core/DataTypes.h"\n#include "core/cell/CellInterval.h"\n#include "domain_decomposition/IBlock.h"\n#include "stencil/Directions.h"\n\n{token}') # nopep8
    return content

for params, target_suffix in paramlist(parameters, ["CPU", "GPU"]):
pystencils_walberla.generate_pack_info_from_kernel(
ctx, f"PackInfoPdf{precision_prefix}{target_suffix}", assignments,
kind="pull", **params)
pystencils_walberla.generate_pack_info(
ctx, f"PackInfoVec{precision_prefix}{target_suffix}", spec, **params)
if target_suffix == "CUDA":
continue
token = "\n //TODO: optimize by generating kernel for this case\n"
for field_suffix in ["Pdf", "Vec"]:
class_name = f"PackInfo{field_suffix}{precision_prefix}{target_suffix}" # nopep8
with open(f"{class_name}.h", "r+") as f:
content = f.read()
assert token in content
content = content.replace(token, "\n")
f.seek(0)
f.truncate()
f.write(content)
for suffix in ["Pdf", "Vec"]:
class_name = f"PackInfo{suffix}{precision_prefix}{target_suffix}"
patch_file(class_name, get_ext_header(target_suffix),
target_suffix, patch_packinfo_header)
patch_file(class_name, get_ext_source(target_suffix),
target_suffix, patch_packinfo_kernel)

# boundary conditions
ubb_dynamic = lbmpy_espresso.UBB(
lambda *args: None, dim=3, data_type=config.data_type.default_factory())
ubb_data_handler = lbmpy_espresso.BounceBackSlipVelocityUBB(
method.stencil, ubb_dynamic)

for _, target_suffix in paramlist(parameters, ("GPU", "CPU")):
# pylint: disable=unused-argument
def patch_boundary_header(content, target_suffix):
    """Substitute the generic ``real_t`` placeholder with the concrete
    floating-point type selected for this precision build."""
    fp_type = config.data_type.default_factory().c_name
    return content.replace("real_t", fp_type)

def patch_boundary_kernel(content, target_suffix):
    """
    Patch a generated Dynamic_UBB kernel source (pure string transform).

    For the CUDA target only: remove the ``__GNUC__``-guarded block, then
    replace the first two ``__CUDACC__``-guarded blocks with the generated
    push/pop preprocessor sections (warning suppressions and RESTRICT macro
    handling for device compilers). Order matters: each ``re.sub`` has
    ``count=1`` so push lands in the first guard and pop in the second.
    The trailing asserts verify both substitutions actually matched.
    """
    if target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        push, pop = custom_additional_extensions.generate_device_preprocessor(
            "ubb_boundary", defines=("RESTRICT",))
        # NOTE(review): push/pop are used as re.sub replacement strings, so
        # backslashes in them would be expanded as escapes — the generated
        # pragmas currently contain none; verify if the template changes.
        content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif(?=\n\n|\n//)", "", content) # nopep8
        content = re.sub(r"#ifdef __CUDACC__[\s\S]+?#endif(?=\n\n|\n//)", push, content, 1) # nopep8
        content = re.sub(r"#ifdef __CUDACC__[\s\S]+?#endif(?=\n\n|\n//)", pop, content, 1) # nopep8
        assert push in content
        assert pop in content
    return content

for _, target_suffix in paramlist(parameters, ("CPU", "GPU")):
class_name = f"Dynamic_UBB_{precision_suffix}{target_suffix}"
lbmpy_walberla.generate_boundary(
ctx, f"Dynamic_UBB_{precision_suffix}{target_suffix}", ubb_dynamic,
method, additional_data_handler=ubb_data_handler,
ctx, class_name, ubb_dynamic, method,
additional_data_handler=ubb_data_handler,
streaming_pattern=streaming_pattern, target=target)

with open(f"Dynamic_UBB_{precision_suffix}{target_suffix}.h", "r+") as f:
content = f.read()
f.seek(0)
f.truncate(0)
# patch for floating point accuracy
content = content.replace("real_t",
config.data_type.default_factory().c_name)
f.write(content)
patch_file(class_name, get_ext_header(target_suffix),
target_suffix, patch_boundary_header)
patch_file(class_name, get_ext_source(target_suffix),
target_suffix, patch_boundary_kernel)
7 changes: 4 additions & 3 deletions maintainer/walberla_kernels/pystencils_espresso.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ def generate_pack_info_pdfs_field_assignments(fields, streaming_pattern):
return lbm_update_rule.all_assignments


def generate_pack_info_vector_field_specifications(config, stencil, layout):
def generate_pack_info_field_specifications(
config, stencil, layout, vec_len=3):
import collections
import itertools
field = ps.Field.create_generic(
Expand All @@ -248,7 +249,7 @@ def generate_pack_info_vector_field_specifications(config, stencil, layout):
data_type_np[config.data_type.default_factory().c_name],
index_dimensions=1,
layout=layout,
index_shape=(3,)
index_shape=(vec_len,)
)
q = len(stencil)
coord = itertools.product(*[(-1, 0, 1)] * 3)
Expand All @@ -257,7 +258,7 @@ def generate_pack_info_vector_field_specifications(config, stencil, layout):
else:
dirs = tuple((i, j, k) for i, j, k in coord)
spec = collections.defaultdict(set)
spec[dirs] = {field[0, 0, 0](i) for i in range(3)}
spec[dirs] = {field[0, 0, 0](i) for i in range(vec_len)}
return spec


Expand Down
Loading