Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement multi-GPU LB #5007

Merged
merged 2 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions doc/sphinx/lb.rst
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,39 @@ of the first LB GPU instance::

system.cuda_init_handle.call_method("set_device_id_per_rank")

Due to padding, the memory footprint of the GPU fields is not a linear function
of the grid size. Instead, it is a step function of the size along the x-direction
of the rank-local LB domain.
For illustration, a local LB domain with dimensions 65x256x256 will take as
much VRAM as a domain with size 128x256x256 in single-precision mode.
As a rule of thumb, the VRAM in GiB per rank-local LB domain will be:

.. math::

\label{eq:lb_vram}
f(n_x, n_y, n_z) =
\begin{cases}
\left\lceil n_x / 64 \right\rceil \cdot 64 \cdot n_y \cdot n_z \cdot 204 / 1024^3
& \text{(in single-precision)}\\
\left\lceil n_x / 32 \right\rceil \cdot 32 \cdot n_y \cdot n_z \cdot 410 / 1024^3
& \text{(in double-precision)}
\end{cases}

with :math:`n_x`, :math:`n_y`, :math:`n_z` the LB domain size in agrid units, including the ghost layer.

Regarding communication between GPUs, for optimal performance the MPI topology
should divide the z-direction first, the y-direction second, and the x-direction
last, i.e. ascending order of the prime factors. Please note the default MPI
Cartesian grid in |es| is sorted in descending order of the prime factors,
and leads to poor performance. For illustration, a Cartesian grid with
shape ``[1, 1, 8]`` yields 94% weak scaling efficiency,
shape ``[8, 1, 1]`` yields 90%,
shape ``[1, 2, 4]`` yields 88%,
shape ``[4, 2, 1]`` yields 86%,
shape ``[2, 2, 2]`` yields 81%.
This is assuming 1 GPU per CPU. Using more than 1 CPU per GPU or more
than 1 GPU per CPU can degrade weak scaling efficiency further.

.. _Electrohydrodynamics:

Electrohydrodynamics
Expand Down
19 changes: 13 additions & 6 deletions maintainer/benchmarks/lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
parser.add_argument("--particles_per_core", metavar="N", action="store",
type=int, default=125, required=False,
help="Number of particles per core")
parser.add_argument("--box_l", action="store",
parser.add_argument("--box_l", action="store", nargs="+",
type=int, default=argparse.SUPPRESS, required=False,
help="Box length (cubic box)")
parser.add_argument("--lb_sites_per_particle", metavar="N_LB", action="store",
Expand All @@ -45,6 +45,8 @@
help="Using single-precision floating point accuracy")
parser.add_argument("--gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use GPU implementation")
parser.add_argument("--multi-gpu", action=argparse.BooleanOptionalAction,
default=False, required=False, help="Use multi-GPU implementation")
parser.add_argument("--output", metavar="FILEPATH", action="store",
type=str, required=False, default="benchmarks.csv",
help="Output file (default: benchmarks.csv)")
Expand Down Expand Up @@ -83,9 +85,9 @@
n_proc = system.cell_system.get_state()["n_nodes"]
n_part = n_proc * args.particles_per_core
if n_part == 0:
box_l = args.box_l
box_l = 3 * args.box_l if len(args.box_l) == 1 else args.box_l
agrid = 1.
lb_grid = args.box_l
lb_grid = box_l
measurement_steps = 80
else:
# volume of N spheres with radius r: N * (4/3*pi*r^3)
Expand All @@ -96,13 +98,16 @@
agrid = box_l / lb_grid
measurement_steps = max(50, int(120**3 / lb_grid**3))
measurement_steps = 40
lb_grid = 3 * [lb_grid]
box_l = 3 * [box_l]

print(f"LB shape: [{lb_grid}, {lb_grid}, {lb_grid}]")
print(f"box length: {box_l}")
print(f"LB shape: {lb_grid}")
print(f"LB agrid: {agrid:.3f}")

# System
#############################################################
system.box_l = 3 * (box_l,)
system.box_l = box_l

# Integration parameters
#############################################################
Expand Down Expand Up @@ -135,8 +140,10 @@
# LB fluid setup
#############################################################
lb_class = espressomd.lb.LBFluidWalberla
if args.gpu:
if args.gpu or args.multi_gpu:
lb_class = espressomd.lb.LBFluidWalberlaGPU
if args.multi_gpu:
system.cuda_init_handle.call_method("set_device_id_per_rank")
lbf = lb_class(agrid=agrid, tau=system.time_step, kinematic_viscosity=1.,
density=1., single_precision=args.single_precision)
system.lb = lbf
Expand Down
1 change: 0 additions & 1 deletion maintainer/walberla_kernels/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ generate_lb_kernels --single-precision
generate_lb_kernels --gpu
generate_lb_kernels --gpu --single-precision
format_lb_kernels
git diff src/walberla_bridge/src/lattice_boltzmann/generated_kernels/Dynamic_UBB_*CUDA*.cu # verify pragmas

# EK kernels
cd $(git rev-parse --show-toplevel)/src/walberla_bridge/src/electrokinetics/generated_kernels/
Expand Down
45 changes: 45 additions & 0 deletions maintainer/walberla_kernels/custom_additional_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,3 +349,48 @@ def generate_kernel_selector(
"templates/ReactionKernelSelector.tmpl.h").render(**context)

generation_context.write_file(f"{class_name}_all.h", header)


def generate_device_preprocessor(kernel, defines=()):
    """
    Generate device preprocessor directives.

    Renders ``templates/preprocessor.tmpl.cuh`` with the warning-suppression
    pragmas of the requested ``kernel`` and a per-compiler macro table, and
    returns the rendered sections (the text between ``/* section */``
    separators, skipping everything before the first separator).
    """
    # NVCC suppression shared by both kernels.
    nvcc_unused = ["diag_suppress 177 // unused variable"]
    # Host/device warning lists per kernel family.
    packinfo_warnings = ["-Wunused-variable"]
    ubb_clang_warnings = ["-Wstrict-aliasing", "-Wunused-variable", "-Wconversion", "-Wsign-compare"]  # nopep8
    ubb_gcc_warnings = ["-Wstrict-aliasing", "-Wunused-variable", "-Wconversion"]  # nopep8

    pragmas = {
        "packinfo": {
            "nvcc": list(nvcc_unused),
            "clang_host": list(packinfo_warnings),
            "clang_dev": list(packinfo_warnings),
            "gcc": list(packinfo_warnings),
        },
        "ubb_boundary": {
            "nvcc": list(nvcc_unused),
            "clang_host": list(ubb_clang_warnings),
            "clang_dev": list(ubb_clang_warnings),
            "gcc": list(ubb_gcc_warnings),
        },
    }

    # Per-compiler expansions of the RESTRICT/FUNC_PREFIX placeholders.
    gnu_restrict = "__restrict__"
    defines_table = {
        "nvcc": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": "__global__"},
        "msvc": {"RESTRICT": "__restrict", "FUNC_PREFIX": ""},
        "clang_host": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": ""},
        "clang_dev": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": "__global__"},
        "gcc": {"RESTRICT": gnu_restrict, "FUNC_PREFIX": ""},
        "other": {"RESTRICT": "", "FUNC_PREFIX": ""},
    }

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(pathlib.Path(__file__).parent),
        undefined=jinja2.StrictUndefined,
    )
    template = env.get_template("templates/preprocessor.tmpl.cuh")
    rendered = template.render(
        pragmas=pragmas[kernel],
        defines_table=defines_table,
        defines=defines,
    )

    # Drop the preamble before the first separator, keep the sections.
    _, *sections = rendered.split("\n/* section */\n")
    return sections
122 changes: 95 additions & 27 deletions maintainer/walberla_kernels/generate_lb_kernels.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2020-2023 The ESPResSo project
# Copyright (C) 2020-2024 The ESPResSo project
#
# This file is part of ESPResSo.
#
Expand All @@ -17,6 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import re
import argparse
import packaging.specifiers

Expand All @@ -40,6 +41,7 @@
import relaxation_rates
import walberla_lbm_generation
import code_generation_context
import custom_additional_extensions

parser = argparse.ArgumentParser(description="Generate the waLBerla kernels.")
parser.add_argument("--single-precision", action="store_true", required=False,
Expand All @@ -65,6 +67,24 @@ def paramlist(parameters, keys):
yield parameters[key]


def get_ext_header(target_suffix):
    """Return the header file extension for a code generation target.

    Headers use ``.h`` for every target, CUDA included.
    """
    return "h"


def get_ext_source(target_suffix):
    """Return the source file extension for a code generation target.

    CUDA sources use ``.cu``; all CPU targets use ``.cpp``.
    """
    return "cu" if target_suffix == "CUDA" else "cpp"


def patch_file(class_name, extension, target_suffix, patch):
    """Rewrite ``<class_name>.<extension>`` in place through ``patch``.

    ``patch`` is called with the file contents and ``target_suffix`` and must
    return the new contents; the file is only rewritten when the patch
    actually changed something.
    """
    filename = f"{class_name}.{extension}"
    with open(filename, "r+") as handle:
        original = handle.read()
        patched = patch(original, target_suffix)
        if patched == original:
            return
        handle.seek(0)
        handle.truncate()
        handle.write(patched)


with code_generation_context.CodeGeneration() as ctx:
ctx.double_accuracy = not args.single_precision
if target == ps.Target.GPU:
Expand Down Expand Up @@ -196,44 +216,92 @@ def paramlist(parameters, keys):
# generate PackInfo
assignments = pystencils_espresso.generate_pack_info_pdfs_field_assignments(
fields, streaming_pattern="pull")
spec = pystencils_espresso.generate_pack_info_vector_field_specifications(
spec = pystencils_espresso.generate_pack_info_field_specifications(
config, stencil, force_field.layout)
for params, target_suffix in paramlist(parameters, ["CPU"]):

def patch_packinfo_header(content, target_suffix):
    """
    Patch a generated PackInfo header (pure string transform).

    CPU targets ("" and "AVX"): drop the generated TODO comment and enlarge
    the MPI pack/unpack buffer requests by one floating-point element; the
    spare element leaves room for the pointer-alignment shift that
    ``patch_packinfo_kernel`` inserts into the matching kernel source.
    CUDA target: blank out the ``FUNC_PREFIX`` macro definition and remove
    the ``__GNUC__``-guarded preprocessor block.
    Asserts fire if the expected tokens are absent, i.e. if the code
    generator output changed shape.
    """
    if target_suffix in ["", "AVX"]:
        # fix MPI buffer memory alignment
        token = "\n //TODO: optimize by generating kernel for this case\n"
        assert token in content
        content = content.replace(token, "\n")
        # infer the floating-point type from the generated class name
        ft = "float" if "SinglePrecision" in content else "double"
        # reserve one extra element: forward(dataSize) -> forward(dataSize + sizeof(ft))
        token = " pack(dir, outBuffer.forward(dataSize)"
        assert token in content
        content = content.replace(token, f"{token[:-1]} + sizeof({ft}))")
        # same for the receive side: skip(dataSize) -> skip(dataSize + sizeof(ft))
        token = " unpack(dir, buffer.skip(dataSize)"
        assert token in content
        content = content.replace(token, f"{token[:-1]} + sizeof({ft}))")
    elif target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        token = "#define FUNC_PREFIX __global__"
        assert token in content
        content = content.replace(token, "")
        # strip the whole "#ifdef __GNUC__ ... #endif" block (non-greedy)
        content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif\n\n", "", content)
    return content

def patch_packinfo_kernel(content, target_suffix):
    """
    Patch a generated PackInfo kernel source (pure string transform).

    CPU targets ("" and "AVX"): insert an alignment shift before the
    ``reinterpret_cast`` of the raw MPI byte buffer. The shift advances
    ``byte_buffer`` by ``sizeof(T) - (addr mod sizeof(T))`` bytes, i.e.
    between 1 and sizeof(T) bytes, which is why the header patch reserves one
    extra element in the buffer (see ``patch_packinfo_header``).
    CUDA target: append generated warning-suppression pragmas after the
    ``FUNC_PREFIX`` macro and prepend the waLBerla includes the generator
    omits. Asserts fire if the expected tokens are absent.
    """
    if target_suffix in ["", "AVX"]:
        # fix MPI buffer memory alignment
        # NOTE(review): non-raw pattern relies on "\*" and "\(" surviving
        # as-is in a plain string literal — consider an r-string.
        m = re.search("(float|double) *\* *buffer = reinterpret_cast<(?:float|double) *\*>\(byte_buffer\);\n", content) # nopep8
        assert m is not None
        content = content.replace(m.group(0), f"byte_buffer += sizeof({m.group(1)}) - (reinterpret_cast<std::size_t>(byte_buffer) - (reinterpret_cast<std::size_t>(byte_buffer) / sizeof({m.group(1)})) * sizeof({m.group(1)}));\n {m.group(0)}") # nopep8
    if target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        token = "#define FUNC_PREFIX __global__"
        assert token in content
        # keep the macro, append the generated push pragmas right after it
        push, _ = custom_additional_extensions.generate_device_preprocessor(
            "packinfo", defines=("RESTRICT",))
        content = content.replace(token, f"{token}\n{push}")
        # add missing includes
        token = '#include "PackInfo'
        assert token in content
        content = content.replace(token, f'#include "core/DataTypes.h"\n#include "core/cell/CellInterval.h"\n#include "domain_decomposition/IBlock.h"\n#include "stencil/Directions.h"\n\n{token}') # nopep8
    return content

for params, target_suffix in paramlist(parameters, ["CPU", "GPU"]):
pystencils_walberla.generate_pack_info_from_kernel(
ctx, f"PackInfoPdf{precision_prefix}{target_suffix}", assignments,
kind="pull", **params)
pystencils_walberla.generate_pack_info(
ctx, f"PackInfoVec{precision_prefix}{target_suffix}", spec, **params)
if target_suffix == "CUDA":
continue
token = "\n //TODO: optimize by generating kernel for this case\n"
for field_suffix in ["Pdf", "Vec"]:
class_name = f"PackInfo{field_suffix}{precision_prefix}{target_suffix}" # nopep8
with open(f"{class_name}.h", "r+") as f:
content = f.read()
assert token in content
content = content.replace(token, "\n")
f.seek(0)
f.truncate()
f.write(content)
for suffix in ["Pdf", "Vec"]:
class_name = f"PackInfo{suffix}{precision_prefix}{target_suffix}"
patch_file(class_name, get_ext_header(target_suffix),
target_suffix, patch_packinfo_header)
patch_file(class_name, get_ext_source(target_suffix),
target_suffix, patch_packinfo_kernel)

# boundary conditions
ubb_dynamic = lbmpy_espresso.UBB(
lambda *args: None, dim=3, data_type=config.data_type.default_factory())
ubb_data_handler = lbmpy_espresso.BounceBackSlipVelocityUBB(
method.stencil, ubb_dynamic)

for _, target_suffix in paramlist(parameters, ("GPU", "CPU")):
# pylint: disable=unused-argument
def patch_boundary_header(content, target_suffix):
    """Substitute the generic ``real_t`` placeholder with the concrete
    floating-point type selected for this precision build."""
    fp_type = config.data_type.default_factory().c_name
    return content.replace("real_t", fp_type)

def patch_boundary_kernel(content, target_suffix):
    """
    Patch a generated Dynamic_UBB kernel source (pure string transform).

    For the CUDA target only: remove the ``__GNUC__``-guarded block, then
    replace the first two ``__CUDACC__``-guarded blocks with the generated
    push/pop preprocessor sections (warning suppressions and RESTRICT macro
    handling for device compilers). Order matters: each ``re.sub`` has
    ``count=1`` so push lands in the first guard and pop in the second.
    The trailing asserts verify both substitutions actually matched.
    """
    if target_suffix in ["CUDA"]:
        # replace preprocessor macros and pragmas
        push, pop = custom_additional_extensions.generate_device_preprocessor(
            "ubb_boundary", defines=("RESTRICT",))
        # NOTE(review): push/pop are used as re.sub replacement strings, so
        # backslashes in them would be expanded as escapes — the generated
        # pragmas currently contain none; verify if the template changes.
        content = re.sub(r"#ifdef __GNUC__[\s\S]+?#endif(?=\n\n|\n//)", "", content) # nopep8
        content = re.sub(r"#ifdef __CUDACC__[\s\S]+?#endif(?=\n\n|\n//)", push, content, 1) # nopep8
        content = re.sub(r"#ifdef __CUDACC__[\s\S]+?#endif(?=\n\n|\n//)", pop, content, 1) # nopep8
        assert push in content
        assert pop in content
    return content

for _, target_suffix in paramlist(parameters, ("CPU", "GPU")):
class_name = f"Dynamic_UBB_{precision_suffix}{target_suffix}"
lbmpy_walberla.generate_boundary(
ctx, f"Dynamic_UBB_{precision_suffix}{target_suffix}", ubb_dynamic,
method, additional_data_handler=ubb_data_handler,
ctx, class_name, ubb_dynamic, method,
additional_data_handler=ubb_data_handler,
streaming_pattern=streaming_pattern, target=target)

with open(f"Dynamic_UBB_{precision_suffix}{target_suffix}.h", "r+") as f:
content = f.read()
f.seek(0)
f.truncate(0)
# patch for floating point accuracy
content = content.replace("real_t",
config.data_type.default_factory().c_name)
f.write(content)
patch_file(class_name, get_ext_header(target_suffix),
target_suffix, patch_boundary_header)
patch_file(class_name, get_ext_source(target_suffix),
target_suffix, patch_boundary_kernel)
7 changes: 4 additions & 3 deletions maintainer/walberla_kernels/pystencils_espresso.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ def generate_pack_info_pdfs_field_assignments(fields, streaming_pattern):
return lbm_update_rule.all_assignments


def generate_pack_info_vector_field_specifications(config, stencil, layout):
def generate_pack_info_field_specifications(
config, stencil, layout, vec_len=3):
import collections
import itertools
field = ps.Field.create_generic(
Expand All @@ -248,7 +249,7 @@ def generate_pack_info_vector_field_specifications(config, stencil, layout):
data_type_np[config.data_type.default_factory().c_name],
index_dimensions=1,
layout=layout,
index_shape=(3,)
index_shape=(vec_len,)
)
q = len(stencil)
coord = itertools.product(*[(-1, 0, 1)] * 3)
Expand All @@ -257,7 +258,7 @@ def generate_pack_info_vector_field_specifications(config, stencil, layout):
else:
dirs = tuple((i, j, k) for i, j, k in coord)
spec = collections.defaultdict(set)
spec[dirs] = {field[0, 0, 0](i) for i in range(3)}
spec[dirs] = {field[0, 0, 0](i) for i in range(vec_len)}
return spec


Expand Down
Loading