From 716a3771ba0358ad38e2671ff66b7f18f253199c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 10 Apr 2024 10:27:20 +0000 Subject: [PATCH 01/58] compile: Reuse BlockDimensions when it makes sense --- devito/passes/clusters/blocking.py | 74 ++++++++++++++++++++++-------- tests/test_dimension.py | 10 ++-- tests/test_dle.py | 41 +++++++++++++++-- tests/test_dse.py | 2 +- 4 files changed, 98 insertions(+), 29 deletions(-) diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 3a47dbd2b5..b7ec5a2408 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -7,7 +7,7 @@ IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device from devito.symbolics import search, uxreplace, xreplace_indices -from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_tuple, +from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_mapper, as_tuple, filter_ordered, flatten, is_integer, prod) from devito.types import BlockDimension @@ -299,6 +299,11 @@ def __init__(self, sregistry, options): self.levels = options['blocklevels'] self.par_tile = options['par-tile'] + # Track the BlockDimensions created so far so that we can reuse them + # in case of Clusters that are different but share the same number of + # stencil points + self.mapper = {} + super().__init__() def process(self, clusters): @@ -310,29 +315,25 @@ def process(self, clusters): return self._process_fdta(clusters, 1, blk_size_gen=blk_size_gen) - def callback(self, clusters, prefix, blk_size_gen=None): - if not prefix: - return clusters - - d = prefix[-1].dim - - if not any(c.properties.is_blockable(d) for c in clusters): - return clusters - - # Create the block Dimensions (in total `self.levels` Dimensions) - base = self.sregistry.make_name(prefix=d.name) - + def _derive_block_dims(self, clusters, prefix, d, blk_size_gen): if blk_size_gen is not None: # By passing a suitable key to `next` we ensure that we pull the # next par-tile entry iff we're now blocking an unseen TILABLE nest - try: - step = sympify(blk_size_gen.next(prefix, d, clusters)) - except StopIteration: - return clusters + step = sympify(blk_size_gen.next(prefix, d, clusters)) else: # This will result in a parametric step, e.g. `x0_blk0_size` step = None + # Can I reuse existing BlockDimensions to avoid a proliferation of steps? 
+ k = stencil_footprint(clusters, d) + if step is None: + try: + return self.mapper[k] + except KeyError: + pass + + base = self.sregistry.make_name(prefix=d.name) + name = self.sregistry.make_name(prefix="%s_blk" % base) bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max, step) step = bd.step @@ -346,6 +347,26 @@ def callback(self, clusters, prefix, blk_size_gen=None): bd = BlockDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=step) block_dims.append(bd) + retval = self.mapper[k] = tuple(block_dims), bd + + return retval + + def callback(self, clusters, prefix, blk_size_gen=None): + if not prefix: + return clusters + + d = prefix[-1].dim + + if not any(c.properties.is_blockable(d) for c in clusters): + return clusters + + try: + block_dims, bd = self._derive_block_dims( + clusters, prefix, d, blk_size_gen + ) + except StopIteration: + return clusters + processed = [] for c in clusters: if c.properties.is_blockable(d): @@ -368,6 +389,23 @@ def callback(self, clusters, prefix, blk_size_gen=None): return processed +def stencil_footprint(clusters, d): + """ + Compute the number of stencil points in the given Dimension `d` across the + provided Clusters. + """ + # The following would be an approximation in the case of irregular Clusters, + # but if we're here it means the Clusters are likely regular, so it's fine + indexeds = set().union(*[c.scope.indexeds for c in clusters]) + indexeds = [i for i in indexeds if d._defines & set(i.dimensions)] + + # Distinguish between footprints pertaining to different Functions + mapper = as_mapper(indexeds, lambda i: i.function) + n = tuple(sorted(len(v) for v in mapper.values())) + + return (d, n) + + def decompose(ispace, d, block_dims): """ Create a new IterationSpace in which the `d` Interval is decomposed @@ -384,7 +422,7 @@ def decompose(ispace, d, block_dims): # Create the intervals relations # 1: `bbd > bd` - relations = [tuple(block_dims)] + relations = [block_dims] # 2: Suitably replace `d` with all `bd`'s for r in ispace.relations: diff --git a/tests/test_dimension.py b/tests/test_dimension.py index dd8b9f94e3..8fc01837ce 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -1481,7 +1481,7 @@ def test_no_fusion_simple(self): eqns = [Eq(f.forward, f + 1), Eq(h, f + 1), - Eq(g, f.dx + 1, implicit_dims=[ctime])] + Eq(g, f.dx + h + 1, implicit_dims=[ctime])] op = Operator(eqns) @@ -1511,10 +1511,10 @@ def test_no_fusion_convoluted(self): eqns = [Eq(f.forward, f + 1), Eq(h, f + 1), - Eq(g, f + 1, implicit_dims=[ctime]), - Eq(f.forward, f + 1, implicit_dims=[ctime]), - Eq(f.forward, f + 1), - Eq(g, f + 1)] + Eq(g, f + h + 1, implicit_dims=[ctime]), + Eq(f.forward, f + h + 1, implicit_dims=[ctime]), + Eq(f.forward, f.dx + h + 1), + Eq(g, f.dx + h + 1)] op = Operator(eqns) diff --git a/tests/test_dle.py b/tests/test_dle.py index 0a44da9fbe..e2af0ed7e3 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -1,6 +1,7 @@ from functools import reduce from operator import mul +import sympy import numpy as np import pytest @@ -14,7 +15,7 @@ retrieve_iteration_tree, Expression) from devito.passes.iet.languages.openmp import Ompizer, OmpRegion from devito.tools import as_tuple -from devito.types import Scalar, Symbol +from devito.types import Barrier, Scalar, Symbol def get_blocksizes(op, opt, grid, blockshape, level=0): @@ -253,8 +254,7 @@ def test_leftright_subdims(self): op = Operator(eqns, opt=('fission', 'blocking', {'blockrelax': 'device-aware'})) - bns, _ = assert_blocking(op, {'x0_blk0', 'xl0_blk0', 'xr0_blk0', - 
'x1_blk0', 'x2_blk0'}) + bns, _ = assert_blocking(op, {'x0_blk0', 'xl0_blk0', 'xr0_blk0'}) assert all(IsPerfectIteration().visit(i) for i in bns.values()) assert all(len(FindNodes(Iteration).visit(i)) == 4 for i in bns.values()) @@ -385,8 +385,8 @@ def test_custom_rule0(self): # Check generated code. By having specified "1" as rule, we expect the # given par-tile to be applied to the kernel with id 1 - bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'z2_blk0'}) - root = bns['x1_blk0'] + bns, _ = assert_blocking(op, {'z0_blk0', 'x0_blk0', 'z2_blk0'}) + root = bns['x0_blk0'] iters = FindNodes(Iteration).visit(root) iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] assert len(iters) == 3 @@ -600,6 +600,37 @@ def test_cache_blocking_imperfect_nest_v2(blockinner): assert np.allclose(u.data, u2.data, rtol=1e-07) +def test_cache_blocking_reuse_blk_dims(): + grid = Grid(shape=(16, 16, 16)) + time = grid.time_dim + + u = TimeFunction(name='u', grid=grid, space_order=4) + v = TimeFunction(name='v', grid=grid, space_order=4) + w = TimeFunction(name='w', grid=grid, space_order=4) + r = TimeFunction(name='r', grid=grid, space_order=4) + + # Use barriers to prevent fusion of otherwise fusible expressions; I could + # have created data dependencies to achieve the same effect, but that would + # have made the test more complex + class DummyBarrier(sympy.Function, Barrier): + pass + + eqns = [Eq(u.forward, u.dx + v.dy), + Eq(Symbol('dummy0'), DummyBarrier(time)), + Eq(v.forward, v.dx), + Eq(Symbol('dummy1'), DummyBarrier(time)), + Eq(w.forward, w.dx), + Eq(Symbol('dummy2'), DummyBarrier(time)), + Eq(r.forward, r.dy + 1)] + + op = Operator(eqns, openmp=False) + + unique = 't,x0_blk0,y0_blk0,x,y,z' + reused = 't,x1_blk0,y1_blk0,x,y,z' + assert_structure(op, [unique, 't', reused, reused, reused], + unique+reused[1:]+reused[1:]+reused[1:]) + + class TestNodeParallelism: def test_nthreads_generation(self): diff --git a/tests/test_dse.py b/tests/test_dse.py index 4a71e86c0b..2a52b443a7 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -1686,7 +1686,7 @@ def test_full_shape_big_temporaries(self): op1 = Operator(eqn, opt=('advanced-fsg', {'openmp': True, 'cire-mingain': 0})) # Check code generation - bns, _ = assert_blocking(op1, {'x0_blk0', 'x1_blk0'}) + bns, _ = assert_blocking(op1, {'x0_blk0'}) xs, ys, zs = get_params(op1, 'x_size', 'y_size', 'z_size') arrays = [i for i in FindSymbols().visit(bns['x0_blk0']) if i.is_Array] assert len(arrays) == 1 From 953590a1fdad93ab15d2afff9ef97a1d3af89b33 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 10 Apr 2024 12:43:04 +0000 Subject: [PATCH 02/58] api: Accept frozendict for opt-options --- devito/operator/operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index f2fad8244c..43cc0f1055 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -1153,7 +1153,7 @@ def parse_kwargs(**kwargs): elif isinstance(opt, tuple): if len(opt) == 0: mode, options = 'noop', {} - elif isinstance(opt[-1], dict): + elif isinstance(opt[-1], (dict, frozendict)): if len(opt) == 2: mode, options = opt else: From ce8673ce0f6d2290e6ef213b2b799e337de6f27d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 10 Apr 2024 15:49:22 +0000 Subject: [PATCH 03/58] compiler: Tweak profiling of subsections --- devito/ir/stree/algorithms.py | 5 +++-- devito/ir/stree/tree.py | 6 ++++++ devito/operator/profiling.py | 35 ++++++++------------------------- 
devito/passes/iet/instrument.py | 24 +++++++++++++--------- tests/test_gpu_common.py | 18 ++++++++--------- 5 files changed, 41 insertions(+), 47 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 73ac219166..31b47ea869 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -114,8 +114,9 @@ def stree_build(clusters, profiler=None, **kwargs): if i.is_Halo: candidate = i elif i.is_Sync: - attach_section(i) - section = None + if profiler._verbosity > 0 or not i.is_async: + attach_section(i) + section = None break elif i.is_Iteration: if (i.dim.is_Time and SEQUENTIAL in i.properties): diff --git a/devito/ir/stree/tree.py b/devito/ir/stree/tree.py index f461c91367..252349d9e6 100644 --- a/devito/ir/stree/tree.py +++ b/devito/ir/stree/tree.py @@ -1,5 +1,7 @@ from anytree import NodeMixin, PostOrderIter, RenderTree, ContStyle +from devito.ir.support import WithLock, PrefetchUpdate + __all__ = ["ScheduleTree", "NodeSection", "NodeIteration", "NodeConditional", "NodeSync", "NodeExprs", "NodeHalo"] @@ -98,6 +100,10 @@ def __init__(self, sync_ops, parent=None): def __repr_render__(self): return "Sync[%s]" % ",".join(i.__class__.__name__ for i in self.sync_ops) + @property + def is_async(self): + return any(isinstance(i, (WithLock, PrefetchUpdate)) for i in self.sync_ops) + class NodeExprs(ScheduleTree): diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py index e2d576d0f3..e4befb7484 100644 --- a/devito/operator/profiling.py +++ b/devito/operator/profiling.py @@ -11,12 +11,11 @@ import numpy as np from sympy import S -from devito.ir.iet import (BusyWait, ExpressionBundle, List, TimedList, Section, +from devito.ir.iet import (ExpressionBundle, List, TimedList, Section, Iteration, FindNodes, Transformer) from devito.ir.support import IntervalGroup from devito.logger import warning, error from devito.mpi import MPI -from devito.mpi.routines import MPICall, MPIList, RemainderCall, ComputeCall from devito.parameters import configuration from devito.symbolics import subs_op_args from devito.tools import DefaultOrderedDict, flatten @@ -38,6 +37,8 @@ class Profiler: _supports_async_sections = False + _verbosity = 0 + def __init__(self, name): self.name = name @@ -175,10 +176,6 @@ def record_ops_variation(self, initial, final): def all_sections(self): return list(self._sections) + flatten(self._subsections.values()) - @property - def trackable_subsections(self): - return () - def summary(self, args, dtype, reduce_over=None): """ Return a PerformanceSummary of the profiled sections. 
@@ -213,17 +210,11 @@ def summary(self, args, dtype, reduce_over=None): class ProfilerVerbose1(Profiler): - - @property - def trackable_subsections(self): - return (MPIList, RemainderCall, BusyWait) + _verbosity = 1 class ProfilerVerbose2(Profiler): - - @property - def trackable_subsections(self): - return (MPICall, BusyWait) + _verbosity = 2 class AdvancedProfiler(Profiler): @@ -352,17 +343,11 @@ class AdvancedProfilerVerbose(AdvancedProfiler): class AdvancedProfilerVerbose1(AdvancedProfilerVerbose): - - @property - def trackable_subsections(self): - return (MPIList, RemainderCall, BusyWait) + _verbosity = 1 class AdvancedProfilerVerbose2(AdvancedProfilerVerbose): - - @property - def trackable_subsections(self): - return (MPICall, BusyWait, ComputeCall) + _verbosity = 2 class AdvisorProfiler(AdvancedProfiler): @@ -432,14 +417,10 @@ def add(self, name, rank, time, # Do not show unexecuted Sections (i.e., loop trip count was 0) if traffic == 0: return - # Do not show dynamic Sections (i.e., loop trip counts varies dynamically) - if traffic is not None and np.isnan(traffic): - assert np.isnan(points) - return k = PerfKey(name, rank) - if not ops: + if not ops or any(np.isnan(i) for i in [ops, points, traffic]): self[k] = PerfEntry(time, 0.0, 0.0, 0.0, 0, []) else: gflops = float(ops)/10**9 diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index b844fd9a08..214f4c6fa1 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -1,7 +1,7 @@ from itertools import groupby -from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Iteration, - Section, TimedList, Transformer) +from devito.ir.iet import (BusyWait, Iteration, Section, TimedList, + FindNodes, FindSymbols, MapNodes, Transformer) from devito.mpi.routines import (HaloUpdateCall, HaloWaitCall, MPICall, MPIList, HaloUpdateList, HaloWaitList, RemainderCall, ComputeCall) @@ -12,12 +12,13 @@ def instrument(graph, **kwargs): - track_subsections(graph, **kwargs) - - # Construct a fresh Timer object profiler = kwargs['profiler'] if profiler is None: return + + track_subsections(graph, **kwargs) + + # Construct a fresh Timer object timer = Timer(profiler.name, list(profiler.all_sections)) instrument_sections(graph, timer=timer, **kwargs) @@ -47,14 +48,19 @@ def track_subsections(iet, **kwargs): BusyWait: 'busywait' } + verbosity_mapper = { + 0: (), + 1: (MPIList, RemainderCall, BusyWait), + 2: (MPICall, ComputeCall, BusyWait), + } + mapper = {} - # MPI Calls, busy-waiting - for NodeType in [MPIList, MPICall, BusyWait, ComputeCall]: + # Enable/disable profiling of sub-Sections + for NodeType in verbosity_mapper[profiler._verbosity]: for k, v in MapNodes(Section, NodeType).visit(iet).items(): for i in v: - if i in mapper or not any(issubclass(i.__class__, n) - for n in profiler.trackable_subsections): + if i in mapper: continue name = sregistry.make_name(prefix=name_mapper[i.__class__]) mapper[i] = Section(name, body=i, is_subsection=True) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 8da55da384..d791511d3f 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -185,9 +185,8 @@ def test_tasking_in_isolation(self, opt): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 + assert len(sections) == 2 assert str(sections[0].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0);' - body = 
sections[2].body[0].body[0] body = op._func_table['release_lock0'].root.body assert str(body.body[0].condition) == 'Ne(lock0[0], 2)' assert str(body.body[1]) == 'lock0[0] = 0;' @@ -257,13 +256,14 @@ def test_tasking_unfused_two_locks(self): op = Operator(eqns, opt=('tasking', 'fuse', 'orchestrate', {'linearize': False})) # Check generated code - assert len(retrieve_iteration_tree(op)) == 3 + trees = retrieve_iteration_tree(op) + assert len(trees) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 4 sections = FindNodes(Section).visit(op) - assert len(sections) == 4 + assert len(sections) == 2 assert (str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock - body = sections[2].body[0].body[0] + body = trees[-1].root.nodes[-2] assert str(body.body[0]) == 'release_lock0(lock0);' assert str(body.body[1]) == 'activate0(time,sdata0);' assert len(op._func_table) == 5 @@ -300,7 +300,7 @@ def test_tasking_forcefuse(self): assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 2 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 + assert len(sections) == 2 assert (str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock body = op._func_table['release_lock0'].root.body @@ -369,7 +369,7 @@ def test_tasking_multi_output(self): assert len(retrieve_iteration_tree(op1)) == 3 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op1) - assert len(sections) == 2 + assert len(sections) == 1 assert str(sections[0].body[0].body[0].body[0].body[0]) ==\ 'while(lock0[t2] == 0);' body = op1._func_table['release_lock0'].root.body @@ -407,7 +407,7 @@ def test_tasking_lock_placement(self): assert len(retrieve_iteration_tree(op)) == 4 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 + assert len(sections) == 2 assert sections[0].body[0].body[0].body[0].is_Iteration assert str(sections[1].body[0].body[0].body[0].body[0]) ==\ 'while(lock0[t1] == 0);' @@ -889,7 +889,7 @@ def test_tasking_over_compiler_generated(self): assert len(retrieve_iteration_tree(op)) == 4 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 4 + assert len(sections) == 3 assert 'while(lock0[t1] == 0)' in str(sections[2].body[0].body[0].body[0]) op0.apply(time_M=nt-1) From e257a09d184c189aac31bcef2ba958ea60401d49 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 11 Apr 2024 09:10:04 +0000 Subject: [PATCH 04/58] compiler: Trim down Operator apply profiling --- devito/operator/operator.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 43cc0f1055..3cbeb1360b 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -952,20 +952,28 @@ def _emit_apply_profiling(self, args): # Emit local, i.e. "per-rank" performance. 
Without MPI, this is the only # thing that will be emitted def lower_perfentry(v): + values = [] + if v.oi: + values.append("OI=%.2f" % fround(v.oi)) if v.gflopss: - oi = "OI=%.2f" % fround(v.oi) - gflopss = "%.2f GFlops/s" % fround(v.gflopss) - gpointss = "%.2f GPts/s" % fround(v.gpointss) - return "[%s]" % ", ".join([oi, gflopss, gpointss]) - elif v.gpointss: - gpointss = "%.2f GPts/s" % fround(v.gpointss) - return "[%s]" % gpointss + values.append("%.2f GFlops/s" % fround(v.gflopss)) + if v.gpointss: + values.append("%.2f GPts/s" % fround(v.gpointss)) + + if values: + return "[%s]" % ", ".join(values) else: return "" for k, v in summary.items(): rank = "[rank%d]" % k.rank if k.rank is not None else "" + if v.time <= 0.01: + # Trim down the output for very fast sections + name = "%s%s<>" % (k.name, rank) + perf("%s* %s ran in %.2f s" % (indent, name, fround(v.time))) + continue + metrics = lower_perfentry(v) itershapes = [",".join(str(i) for i in its) for its in v.itershapes] From 05a409f240fa08cbbea570f124f62ab53549a72e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 16 Apr 2024 13:54:20 +0000 Subject: [PATCH 05/58] compiler: Add Operator._sanitize_exprs --- devito/operator/operator.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 3cbeb1360b..d8400259a8 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -152,9 +152,10 @@ def __new__(cls, expressions, **kwargs): # The Operator type for the given target cls = operator_selector(**kwargs) - # Normalize input arguments for the selected Operator + # Preprocess input arguments kwargs = cls._normalize_kwargs(**kwargs) cls._check_kwargs(**kwargs) + expressions = cls._sanitize_exprs(expressions, **kwargs) # Lower to a JIT-compilable object with timed_region('op-compile') as r: @@ -174,6 +175,15 @@ def _normalize_kwargs(cls, **kwargs): def _check_kwargs(cls, **kwargs): return + @classmethod + def _sanitize_exprs(cls, expressions, **kwargs): + expressions = as_tuple(expressions) + + if any(not isinstance(i, Evaluable) for i in expressions): + raise InvalidOperator("Only `devito.Evaluable` are allowed.") + + return expressions + @classmethod def _build(cls, expressions, **kwargs): # Python- (i.e., compile-) and C-level (i.e., run-time) performance @@ -244,10 +254,6 @@ def _lower(cls, expressions, **kwargs): expressions = as_tuple(expressions) - # Input check - if any(not isinstance(i, Evaluable) for i in expressions): - raise InvalidOperator("Only `devito.Evaluable` are allowed.") - # Enable recursive lowering # This may be used by a compilation pass that constructs a new # expression for which a partial or complete lowering is desired From 1688c5aa4fe8b70cdabbb2d6c0aa9be043770681 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 12 Apr 2024 13:53:47 +0000 Subject: [PATCH 06/58] compiler: Add VirtualDimension --- devito/ir/clusters/cluster.py | 4 ++-- devito/ir/iet/algorithms.py | 8 +++++-- devito/ir/stree/algorithms.py | 5 ++-- devito/passes/clusters/misc.py | 2 +- devito/types/dimension.py | 44 ++++++++++++++++++++++++++++++++-- 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 3179cc44ae..542bf6647a 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -341,9 +341,9 @@ def dspace(self): if len(ret) != 1: continue if ret.pop().direction is Forward: - intervals = intervals.translate(d, v1=-1) + 
intervals = intervals.translate(d._defines, v1=-1) else: - intervals = intervals.translate(d, 1) + intervals = intervals.translate(d._defines, 1) for d in self.properties: if self.properties.is_inbound(d): intervals = intervals.zero(d._defines) diff --git a/devito/ir/iet/algorithms.py b/devito/ir/iet/algorithms.py index ec6381698e..0b57b876f7 100644 --- a/devito/ir/iet/algorithms.py +++ b/devito/ir/iet/algorithms.py @@ -32,8 +32,12 @@ def iet_build(stree): body = Conditional(i.guard, queues.pop(i)) elif i.is_Iteration: - body = Iteration(queues.pop(i), i.dim, i.limits, direction=i.direction, - properties=i.properties, uindices=i.sub_iterators) + if i.dim.is_Virtual: + body = List(body=queues.pop(i)) + else: + body = Iteration(queues.pop(i), i.dim, i.limits, + direction=i.direction, properties=i.properties, + uindices=i.sub_iterators) elif i.is_Section: body = Section('section%d' % nsections, body=queues.pop(i)) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 31b47ea869..f154197a97 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -76,8 +76,9 @@ def stree_build(clusters, profiler=None, **kwargs): # Add in Node{Iteration,Conditional,Sync} for it in c.itintervals[index:]: d = it.dim - tip = NodeIteration(c.ispace.project([d]), tip, c.properties.get(d, ())) - mapper[it].top = tip + ispace = c.ispace.project([d]) + properties = c.properties.get(d, ()) + mapper[it].top = tip = NodeIteration(ispace, tip, properties) tip = augment_whole_subtree(c, tip, mapper, it) # Attach NodeHalo if necessary diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 32f26ad782..c38a1bac26 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -32,8 +32,8 @@ def callback(self, clusters, prefix): if not prefix: # No iteration space to be lifted from return clusters - dim = prefix[-1].dim + hope_invariant = dim._defines outer = set().union(*[i.dim._defines for i in prefix[:-1]]) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index f9a6847a9e..0c6f36f432 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -15,8 +15,9 @@ from devito.types.constant import Constant __all__ = ['Dimension', 'SpaceDimension', 'TimeDimension', 'DefaultDimension', - 'CustomDimension', 'SteppingDimension', 'SubDimension', 'ConditionalDimension', - 'ModuloDimension', 'IncrDimension', 'BlockDimension', 'StencilDimension', + 'CustomDimension', 'SteppingDimension', 'SubDimension', + 'ConditionalDimension', 'ModuloDimension', 'IncrDimension', + 'BlockDimension', 'StencilDimension', 'VirtualDimension', 'Spacing', 'dimensions'] @@ -108,6 +109,7 @@ class Dimension(ArgProvider): is_Modulo = False is_Incr = False is_Block = False + is_Virtual = False # Prioritize self's __add__ and __sub__ to construct AffineIndexAccessFunction _op_priority = sympy.Expr._op_priority + 1. @@ -1493,6 +1495,44 @@ def _arg_values(self, *args, **kwargs): return {} +class VirtualDimension(CustomDimension): + + """ + Dimension symbol representing a mock iteration space, which as such + is eventually ditched by the compiler. + + Mock iteration spaces are used for compilation purposes only, typically + to bind objects such as Guards and Syncs to a specific point in the + program flow. + + Examples + -------- + To generate nested conditionals within the same loop nest, one may use + VirtualDimensions to represent the different branches of the conditionals. + + .. 
code-block:: C + + for (int i = i_m; i <= i_M; i += 1) + if (i < 10) + if (i < 5) + do A(i); + if (i >= 5) + do B(i); + + The above code can be obtained by using one VirtualDimension for the + `i < 5` conditional and another VirtualDimension for the `i >= 5` conditional. + """ + + is_Virtual = True + + __rkwargs__ = ('parent',) + + def __init_finalize__(self, name, parent=None): + super().__init_finalize__(name, parent=parent, + symbolic_min=sympy.S.Zero, + symbolic_max=sympy.S.One) + + # *** # The Dimensions below are created by Devito and may eventually be # accessed in user code to e.g. construct or manipulate Eqs From 5ab35e3d3691100f77ed99fdcf5d7271f3c4a075 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 15 Apr 2024 08:46:36 +0000 Subject: [PATCH 07/58] compiler: Add IterationSpace.{split,insert} --- devito/ir/support/space.py | 30 ++++++++++++++++++++++++++++++ devito/passes/clusters/implicit.py | 11 +++++++++++ 2 files changed, 41 insertions(+) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index dc7b26ec7c..689258a3d0 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -958,6 +958,36 @@ def prefix(self, key): return self[:self.index(i.dim) + 1] + def split(self, d): + """ + Split the IterationSpace into two IterationSpaces, the first + containing all Intervals up to and including the one defined + over Dimension `d`, and the second containing all Intervals starting + from the one defined over Dimension `d`. + """ + if isinstance(d, IterationInterval): + d = d.dim + + try: + n = self.index(d) + 1 + return self[:n], self[n:] + except ValueError: + return self, IterationSpace([]) + + def insert(self, d, dimensions, sub_iterators=None, directions=None): + """ + Insert new Dimensions into the IterationSpace before Dimension `d`. + """ + dimensions = as_tuple(dimensions) + + intervals = IntervalGroup([Interval(i) for i in dimensions]) + ispace = IterationSpace(intervals, sub_iterators, directions) + + ispace0, ispace1 = self.split(d) + relations = (ispace0.itdims + dimensions, dimensions + ispace1.itdims) + + return IterationSpace.union(ispace0, ispace, ispace1, relations=relations) + def reorder(self, relations=None, mode=None): relations = relations or self.relations mode = mode or self.mode diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index a85d713b4e..a9018881ae 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -100,6 +100,11 @@ def callback(self, clusters, prefix): processed.append(c) continue + #TODO: use c.ispace.split(dd) and run test suite... not doing it + # now because the state of the current branches is a bit fragile + #ALSO note that .split would require the dimension after `dd` so + #we'll have to change idx to idx+1, or simply take `dim`!! + #This will simplify this code a lot!! n = c.ispace.index(dd) ispace0 = c.ispace[:n] ispace1 = c.ispace[n:] @@ -116,6 +121,12 @@ def callback(self, clusters, prefix): # The IterationSpace induced by the MultiSubDomain if dims: + #TODO: use the new c.ispace.insert !!! + #TODO: in fact, scratch the first TODO above... just call insert directly + #here, I won't even have to call split as it will be done automatically + #by insert... but then I will have to + #TODO: actually there's an issue with this new insert as I'll be like + #constructing ispace0,ispaceN,ispace1 while below I only have ispace0,ispaceN? 
intervals = [Interval(i) for i in dims] ispaceN = IterationSpace(IntervalGroup(intervals), sub_iterators) From 1744f04fda1dbd885a89f65316180aded060468a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 17 Apr 2024 09:05:38 +0000 Subject: [PATCH 08/58] compiler: Simplify gpu callbacks --- devito/core/gpu.py | 95 +++++++++++++++++----------- devito/passes/clusters/asynchrony.py | 39 ++++-------- devito/passes/clusters/buffering.py | 9 ++- 3 files changed, 74 insertions(+), 69 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 266c198647..0af10ea6a3 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -7,7 +7,7 @@ from devito.operator.operator import rcompile from devito.passes import is_on_device from devito.passes.equations import collect_derivatives -from devito.passes.clusters import (Lift, Streaming, Tasker, blocking, buffering, +from devito.passes.clusters import (Lift, MemcpyAsync, Tasker, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_pows) from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, @@ -266,15 +266,15 @@ def _make_clusters_passes_mapper(cls, **kwargs): platform = kwargs['platform'] sregistry = kwargs['sregistry'] - # Callbacks used by `buffering`, `Tasking` and `Streaming` - callback = lambda f: on_host(f, options) - runs_on_host, reads_if_on_host = make_callbacks(options) + stream_key = stream_key_wrap(options) + task_key = task_wrap(stream_key) + memcpy_key = memcpy_wrap(stream_key, task_key) return { - 'buffering': lambda i: buffering(i, callback, sregistry, options), + 'buffering': lambda i: buffering(i, stream_key, sregistry, options), 'blocking': lambda i: blocking(i, sregistry, options), - 'tasking': Tasker(runs_on_host, sregistry).process, - 'streaming': Streaming(reads_if_on_host, sregistry).process, + 'tasking': Tasker(task_key, sregistry).process, + 'streaming': MemcpyAsync(memcpy_key, sregistry).process, 'factorize': factorize, 'fission': fission, 'fuse': lambda i: fuse(i, options=options), @@ -419,39 +419,60 @@ def _make_iet_passes_mapper(cls, **kwargs): assert not (set(_known_passes) & set(DeviceCustomOperator._known_passes_disabled)) -# Utils +# *** Utils -def on_host(f, options): - # A Dimension in `f` defining an IterationSpace that definitely - # gets executed on the host, regardless of whether it's parallel - # or sequential - if not is_on_device(f, options['gpu-fit']): - return f.time_dim - else: - return None +def stream_key_wrap(options): + def func(items): + """ + Given one or more Functions `f(d_1, ...d_n)`, return the Dimension `d_i` + that is more likely to require data streaming, since the size of + `(*, d_{i+1}, ..., d_n)` is unlikely to fit into the device memory. + If more than one such Dimension is found, an exception is raised. + """ + functions = as_tuple(items) + retval = {None if is_on_device(f, options['gpu-fit']) else f.time_dim + for f in functions} + retval.discard(None) + if len(retval) > 1: + raise ValueError("Ambiguous stream key") + try: + return retval.pop() + except KeyError: + return None + return func -def make_callbacks(options, key=None): - """ - Options-dependent callbacks used by various compiler passes. 
- """ - if key is None: - key = lambda f: on_host(f, options) - - def runs_on_host(c): - # The only situation in which a Cluster doesn't get offloaded to - # the device is when it writes to a host Function - retval = {key(f) for f in c.scope.writes} - {None} - retval = set().union(*[d._defines for d in retval]) - return retval - - def reads_if_on_host(c): - if not runs_on_host(c): - retval = {key(f) for f in c.scope.reads} - {None} - retval = set().union(*[d._defines for d in retval]) - return retval - else: +def task_wrap(stream_key): + def func(c): + """ + A Cluster `c` is turned into an "asynchronous task" if it writes to a + Function that cannot be stored in device memory. + + This function returns the Dimension(s) that is more likely to require data + streaming, or the empty set if no such Dimension is found. + """ + try: + return stream_key(c.scope.writes)._defines + except AttributeError: return set() + return func - return runs_on_host, reads_if_on_host + +def memcpy_wrap(stream_key, task_key): + def func(c): + """ + A Cluster `c` is turned into a "memcpy task" if it reads from a Function + that cannot be stored in device memory. + + This function returns the Dimension(s) requiring no memcpy as they are + not in device memory, or the empty set if no memcpy is required. + """ + if task_key(c): + # Writes would take precedence over reads + return set() + try: + return stream_key(c.scope.reads)._defines + except AttributeError: + return set() + return func diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 885787939f..3bb8cac172 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -9,35 +9,29 @@ from devito.tools import OrderedSet, is_integer, timed_pass from devito.types import CustomDimension, Lock -__all__ = ['Tasker', 'Streaming'] +__all__ = ['Tasker', 'MemcpyAsync'] class Asynchronous(Queue): - def __init__(self, key, sregistry): - assert callable(key) - self.key = key - self.sregistry = sregistry - - super().__init__() - - -class Tasker(Asynchronous): - """ Create asynchronous Clusters, or "tasks". - Parameters - ---------- - key : callable, optional - A Cluster `c` becomes an asynchronous task only if `key(c)` returns True - Notes ----- From an implementation viewpoint, an asynchronous Cluster is a Cluster with attached suitable SyncOps, such as WaitLock, WithLock, etc. """ + def __init__(self, key, sregistry): + self.key = key + self.sregistry = sregistry + + super().__init__() + + +class Tasker(Asynchronous): + @timed_pass(name='tasker') def process(self, clusters): return super().process(clusters) @@ -155,18 +149,9 @@ def callback(self, clusters, prefix): return processed -class Streaming(Asynchronous): - - """ - Tag Clusters with SyncOps to stream Functions in and out the device memory. - - Parameters - ---------- - key : callable, optional - Return the Functions that need to be streamed in a given Cluster. 
- """ +class MemcpyAsync(Asynchronous): - @timed_pass(name='streaming') + @timed_pass(name='async_memcpy') def process(self, clusters): return super().process(clusters) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 38bfeb2745..81a347323c 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -113,13 +113,12 @@ def callback(f): else: init_onwrite = lambda f: v1 - options = { - 'buf-async-degree': options['buf-async-degree'], - 'buf-fuse-tasks': options['fuse-tasks'], + options = dict(options) + options.update({ 'buf-init-onread': init_onread, 'buf-init-onwrite': init_onwrite, 'buf-callback': kwargs.get('opt_buffer'), - } + }) # Escape hatch to selectively disable buffering if options['buf-async-degree'] == 0: @@ -284,7 +283,7 @@ def callback(self, clusters, prefix, cache=None): ) # Lift {write,read}-only buffers into separate IterationSpaces - if self.options['buf-fuse-tasks']: + if self.options['fuse-tasks']: return init + processed for b in buffers: if b.is_writeonly: From 2995c6951c4f9103c1ac00dcc1f0240837059074 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 18 Apr 2024 13:49:30 +0000 Subject: [PATCH 09/58] compiler: Simplify tasking --- devito/core/gpu.py | 45 +++--- devito/ir/support/space.py | 8 +- devito/passes/clusters/asynchrony.py | 222 +++++++++++++-------------- 3 files changed, 139 insertions(+), 136 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 0af10ea6a3..4fe79044b5 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -7,7 +7,7 @@ from devito.operator.operator import rcompile from devito.passes import is_on_device from devito.passes.equations import collect_derivatives -from devito.passes.clusters import (Lift, MemcpyAsync, Tasker, blocking, buffering, +from devito.passes.clusters import (Lift, MemcpyAsync, tasking, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_pows) from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, @@ -273,7 +273,7 @@ def _make_clusters_passes_mapper(cls, **kwargs): return { 'buffering': lambda i: buffering(i, stream_key, sregistry, options), 'blocking': lambda i: blocking(i, sregistry, options), - 'tasking': Tasker(task_key, sregistry).process, + 'tasking': lambda i: tasking(i, task_key, sregistry), 'streaming': MemcpyAsync(memcpy_key, sregistry).process, 'factorize': factorize, 'fission': fission, @@ -428,14 +428,15 @@ def func(items): that is more likely to require data streaming, since the size of `(*, d_{i+1}, ..., d_n)` is unlikely to fit into the device memory. - If more than one such Dimension is found, an exception is raised. + If more than one such Dimension is found, a set is returned. """ - functions = as_tuple(items) - retval = {None if is_on_device(f, options['gpu-fit']) else f.time_dim - for f in functions} - retval.discard(None) + retval = set() + for f in as_tuple(items): + if not is_on_device(f, options['gpu-fit']): + retval.add(f.time_dim) if len(retval) > 1: - raise ValueError("Ambiguous stream key") + # E.g., `t_sub0, t_sub1, ..., time` + return retval try: return retval.pop() except KeyError: @@ -449,13 +450,16 @@ def func(c): A Cluster `c` is turned into an "asynchronous task" if it writes to a Function that cannot be stored in device memory. - This function returns the Dimension(s) that is more likely to require data - streaming, or the empty set if no such Dimension is found. 
+ This function returns a Dimension in `c`'s IterationSpace defining the + scope of the asynchronous task, or None if no task is required. """ + dims = {c.ispace[d].dim for d in as_tuple(stream_key(c.scope.writes))} + if len(dims) > 1: + raise ValueError("Cannot determine a unique task Dimension") try: - return stream_key(c.scope.writes)._defines - except AttributeError: - return set() + return dims.pop() + except KeyError: + return None return func @@ -465,14 +469,17 @@ def func(c): A Cluster `c` is turned into a "memcpy task" if it reads from a Function that cannot be stored in device memory. - This function returns the Dimension(s) requiring no memcpy as they are - not in device memory, or the empty set if no memcpy is required. + This function returns the Dimension in `c`'s IterationSpace within which + the memcpy is to be performed, or None if no memcpy is required. """ if task_key(c): # Writes would take precedence over reads - return set() + return None + dims = {c.ispace[d].dim for d in as_tuple(stream_key(c.scope.reads))} + if len(dims) > 1: + raise ValueError("Cannot determine a unique memcpy Dimension") try: - return stream_key(c.scope.reads)._defines - except AttributeError: - return set() + return dims.pop() + except KeyError: + return None return func diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 689258a3d0..87b832ea3f 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -544,11 +544,15 @@ def __getitem__(self, key): return NullInterval(key) for i in self: + # Obvious case if i.dim is key: return i + + # There are a few special cases for which we can support + # derived-Dimension-based indexing if key.is_NonlinearDerived and i.dim in key._defines: - # NonlinearDerived Dimensions cannot appear in iteration Intervals, - # but their parent can + return i + if i.dim.is_Custom and key in i.dim._defines: return i return NullInterval(key) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 3bb8cac172..edc028eb6e 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -2,14 +2,15 @@ from sympy import And +from devito.exceptions import CompilationError from devito.ir import (Forward, GuardBoundNext, Queue, Vector, WaitLock, WithLock, FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) from devito.passes.clusters.utils import bind_critical_regions, is_memcpy from devito.symbolics import IntDiv, uxreplace -from devito.tools import OrderedSet, is_integer, timed_pass +from devito.tools import OrderedSet, as_mapper, is_integer, timed_pass from devito.types import CustomDimension, Lock -__all__ = ['Tasker', 'MemcpyAsync'] +__all__ = ['tasking', 'MemcpyAsync'] class Asynchronous(Queue): @@ -30,123 +31,114 @@ def __init__(self, key, sregistry): super().__init__() -class Tasker(Asynchronous): - - @timed_pass(name='tasker') - def process(self, clusters): - return super().process(clusters) - - def callback(self, clusters, prefix): - if not prefix: - return clusters - - d = prefix[-1].dim - - locks = {} - waits = defaultdict(OrderedSet) - tasks = defaultdict(list) - for c0 in clusters: - dims = self.key(c0) - if d not in dims: - # Not a candidate asynchronous task - continue - - # Prevent future writes to interfere with a task by waiting on a lock - may_require_lock = c0.scope.reads - - # We can ignore scalars as they're passed by value - may_require_lock = {f for f in may_require_lock if f.is_AbstractFunction} - - # Sort for deterministic code generation - 
may_require_lock = sorted(may_require_lock, key=lambda i: i.name) - - protected = defaultdict(set) - for c1 in clusters: - offset = int(clusters.index(c1) <= clusters.index(c0)) - - for target in may_require_lock: +@timed_pass(name='tasking') +def tasking(clusters, key, sregistry): + candidates = as_mapper(clusters, key) + candidates.pop(None, None) + + if len(candidates) == 0: + return clusters + elif len(candidates) > 1: + raise CompilationError("Cannot handle multiple tasking dimensions") + d, candidates = candidates.popitem() + + locks = {} + waits = defaultdict(OrderedSet) + tasks = defaultdict(list) + for c0 in candidates: + # Prevent future writes to interfere with a task by waiting on a lock + may_require_lock = {f for f in c0.scope.reads if f.is_AbstractFunction} + + # Sort for deterministic code generation + may_require_lock = sorted(may_require_lock, key=lambda i: i.name) + + protected = defaultdict(set) + for c1 in clusters: + offset = int(clusters.index(c1) <= clusters.index(c0)) + + for target in may_require_lock: + try: + writes = c1.scope.writes[target] + except KeyError: + # No read-write dependency, ignore + continue + + try: + if all(w.aindices[d].is_Stepping for w in writes) or \ + all(w.aindices[d].is_Modulo for w in writes): + sz = target.shape_allocated[d] + assert is_integer(sz) + ld = CustomDimension(name='ld', symbolic_size=sz, parent=d) + elif all(w[d] == 0 for w in writes): + # Special case, degenerates to scalar lock + raise KeyError + else: + # Functions over non-stepping Dimensions need no lock + continue + except (AttributeError, KeyError): + # Would degenerate to a scalar, but we rather use a lock + # of size 1 for simplicity + ld = CustomDimension(name='ld', symbolic_size=1, parent=d) + + try: + lock = locks[target] + except KeyError: + name = sregistry.make_name(prefix='lock') + lock = locks[target] = Lock(name=name, dimensions=ld) + + for w in writes: try: - writes = c1.scope.writes[target] - except KeyError: - # No read-write dependency, ignore + index = w[d] + logical_index = index + offset + except TypeError: + assert ld.symbolic_size == 1 + index = 0 + logical_index = 0 + + if logical_index in protected[target]: continue - try: - if all(w.aindices[d].is_Stepping for w in writes) or \ - all(w.aindices[d].is_Modulo for w in writes): - size = target.shape_allocated[d] - assert is_integer(size) - ld = CustomDimension(name='ld', symbolic_size=size, parent=d) - elif all(w[d] == 0 for w in writes): - # Special case, degenerates to scalar lock - raise KeyError - else: - # Functions over non-stepping Dimensions need no lock - continue - except (AttributeError, KeyError): - # Would degenerate to a scalar, but we rather use a lock - # of size 1 for simplicity - ld = CustomDimension(name='ld', symbolic_size=1, parent=d) + waits[c1].add(WaitLock(lock[index], target)) + protected[target].add(logical_index) - try: - lock = locks[target] - except KeyError: - name = self.sregistry.make_name(prefix='lock') - lock = locks[target] = Lock(name=name, dimensions=ld) - - for w in writes: - try: - index = w[d] - logical_index = index + offset - except TypeError: - assert ld.symbolic_size == 1 - index = 0 - logical_index = 0 - - if logical_index in protected[target]: - continue - - waits[c1].add(WaitLock(lock[index], target)) - protected[target].add(logical_index) - - # Taskify `c0` - for target in protected: - lock = locks[target] - - indices = sorted({r[d] for r in c0.scope.reads[target]}) - if indices == [None]: - # `lock` is protecting a Function which isn't defined over 
`d` - # E.g., `d=time` and the protected function is `a(x, y)` - assert lock.size == 1 - indices = [0] - - if wraps_memcpy(c0): - e = c0.exprs[0] - function = e.lhs.function - findex = e.lhs.indices[d] - else: - # Only for backwards compatibility (e.g., tasking w/o buffering) - function = None - findex = None - - for i in indices: - tasks[c0].append(ReleaseLock(lock[i], target)) - tasks[c0].append(WithLock(lock[i], target, i, function, findex, d)) - - # CriticalRegions preempt WaitLocks, by definition - mapper = bind_critical_regions(clusters) - for c in clusters: - for c1 in mapper.get(c, []): - waits[c].update(waits.pop(c1, [])) + # Taskify `c0` + for target in protected: + lock = locks[target] - processed = [] - for c in clusters: - if waits[c] or tasks[c]: - processed.append(c.rebuild(syncs={d: list(waits[c]) + tasks[c]})) + indices = sorted({r[d] for r in c0.scope.reads[target]}) + if indices == [None]: + # `lock` is protecting a Function which isn't defined over `d` + # E.g., `d=time` and the protected function is `a(x, y)` + assert lock.size == 1 + indices = [0] + + if wraps_memcpy(c0): + e = c0.exprs[0] + function = e.lhs.function + findex = e.lhs.indices[d] else: - processed.append(c) + # Only for backwards compatibility (e.g., tasking w/o buffering) + function = None + findex = None + + for i in indices: + tasks[c0].append(ReleaseLock(lock[i], target)) + tasks[c0].append(WithLock(lock[i], target, i, function, findex, d)) + + # CriticalRegions preempt WaitLocks, by definition + mapper = bind_critical_regions(clusters) + for c in clusters: + for c1 in mapper.get(c, []): + waits[c].update(waits.pop(c1, [])) + + processed = [] + for c in clusters: + if waits[c] or tasks[c]: + processed.append(c.rebuild(syncs={d: list(waits[c]) + tasks[c]})) + else: + processed.append(c) - return processed + return processed class MemcpyAsync(Asynchronous): @@ -167,8 +159,8 @@ def callback(self, clusters, prefix): # Case 1 if d.is_Custom and is_integer(it.size): for c in clusters: - dims = self.key(c) - if d._defines & dims: + dim = self.key(c) + if dim in d._defines: if wraps_memcpy(c): # Case 1A (special case, leading to more efficient streaming) self._actions_from_init(c, prefix, actions) @@ -179,7 +171,7 @@ def callback(self, clusters, prefix): # Case 2 else: mapper = OrderedDict([(c, wraps_memcpy(c)) for c in clusters - if d in self.key(c)]) + if self.key(c) in d._defines]) # Case 2A (special case, leading to more efficient streaming) if all(mapper.values()): From 756f2042ec9772a090ceb3c8e1ba04ef4adffae6 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 18 Apr 2024 14:54:22 +0000 Subject: [PATCH 10/58] compiler: Simplify streaming --- devito/core/gpu.py | 6 +- devito/passes/clusters/asynchrony.py | 266 +++++++++++---------------- 2 files changed, 114 insertions(+), 158 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 4fe79044b5..28e8f14e84 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -7,8 +7,8 @@ from devito.operator.operator import rcompile from devito.passes import is_on_device from devito.passes.equations import collect_derivatives -from devito.passes.clusters import (Lift, MemcpyAsync, tasking, blocking, buffering, - cire, cse, factorize, fission, fuse, +from devito.passes.clusters import (Lift, tasking, memcpy_prefetch, blocking, + buffering, cire, cse, factorize, fission, fuse, optimize_pows) from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, mpiize, hoist_prodders, linearize, pthreadify, @@ -274,7 +274,7 @@ def 
_make_clusters_passes_mapper(cls, **kwargs): 'buffering': lambda i: buffering(i, stream_key, sregistry, options), 'blocking': lambda i: blocking(i, sregistry, options), 'tasking': lambda i: tasking(i, task_key, sregistry), - 'streaming': MemcpyAsync(memcpy_key, sregistry).process, + 'streaming': lambda i: memcpy_prefetch(i, memcpy_key, sregistry), 'factorize': factorize, 'fission': fission, 'fuse': lambda i: fuse(i, options=options), diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index edc028eb6e..25c5b08f0b 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -1,34 +1,16 @@ -from collections import OrderedDict, defaultdict +from collections import defaultdict from sympy import And from devito.exceptions import CompilationError -from devito.ir import (Forward, GuardBoundNext, Queue, Vector, WaitLock, WithLock, +from devito.ir import (Forward, GuardBoundNext, Vector, WaitLock, WithLock, FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) from devito.passes.clusters.utils import bind_critical_regions, is_memcpy from devito.symbolics import IntDiv, uxreplace from devito.tools import OrderedSet, as_mapper, is_integer, timed_pass from devito.types import CustomDimension, Lock -__all__ = ['tasking', 'MemcpyAsync'] - - -class Asynchronous(Queue): - - """ - Create asynchronous Clusters, or "tasks". - - Notes - ----- - From an implementation viewpoint, an asynchronous Cluster is a Cluster - with attached suitable SyncOps, such as WaitLock, WithLock, etc. - """ - - def __init__(self, key, sregistry): - self.key = key - self.sregistry = sregistry - - super().__init__() +__all__ = ['tasking', 'memcpy_prefetch'] @timed_pass(name='tasking') @@ -40,6 +22,7 @@ def tasking(clusters, key, sregistry): return clusters elif len(candidates) > 1: raise CompilationError("Cannot handle multiple tasking dimensions") + d, candidates = candidates.popitem() locks = {} @@ -141,160 +124,133 @@ def tasking(clusters, key, sregistry): return processed -class MemcpyAsync(Asynchronous): - - @timed_pass(name='async_memcpy') - def process(self, clusters): - return super().process(clusters) +@timed_pass(name='memcpy_prefetch') +def memcpy_prefetch(clusters, key, sregistry): + actions = defaultdict(Actions) - def callback(self, clusters, prefix): - if not prefix: - return clusters + for c in clusters: + d = key(c) + if d is None: + continue - it = prefix[-1] - d = it.dim + if d.is_Custom and is_integer(c.ispace[d].size): + if wraps_memcpy(c): + _actions_from_init(c, d, actions) + else: + raise NotImplementedError - actions = defaultdict(Actions) + elif wraps_memcpy(c): + _actions_from_update_memcpy(c, d, clusters, actions, sregistry) - # Case 1 - if d.is_Custom and is_integer(it.size): - for c in clusters: - dim = self.key(c) - if dim in d._defines: - if wraps_memcpy(c): - # Case 1A (special case, leading to more efficient streaming) - self._actions_from_init(c, prefix, actions) - else: - # Case 1B (actually, we expect to never end up here) - raise NotImplementedError + # Attach the computed Actions + processed = [] + for c in clusters: + v = actions[c] - # Case 2 + if v.drop: + assert not v.syncs + continue + elif v.syncs: + processed.append(c.rebuild(syncs=normalize_syncs(c.syncs, v.syncs))) else: - mapper = OrderedDict([(c, wraps_memcpy(c)) for c in clusters - if self.key(c) in d._defines]) - - # Case 2A (special case, leading to more efficient streaming) - if all(mapper.values()): - for c in mapper: - 
self._actions_from_update_memcpy(c, clusters, prefix, actions) - - # Case 2B - elif mapper: - # There used to be a handler for this case, but it was dropped - # because it created inefficient code and in practice never used - raise NotImplementedError + processed.append(c) - # Perform the necessary actions; this will ultimately attach SyncOps to Clusters - processed = [] - for c in clusters: - v = actions[c] + if v.insert: + processed.extend(v.insert) - if v.drop: - assert not v.syncs - continue - elif v.syncs: - processed.append(c.rebuild(syncs=normalize_syncs(c.syncs, v.syncs))) - else: - processed.append(c) + return processed - if v.insert: - processed.extend(v.insert) - return processed +def _actions_from_init(c, d, actions): + i = c.ispace.index(d) + if i > 0: + pd = c.ispace[i].dim + else: + pd = None - def _actions_from_init(self, cluster, prefix, actions): - it = prefix[-1] - d = it.dim - try: - pd = prefix[-2].dim - except IndexError: - pd = None + e = c.exprs[0] + function = e.rhs.function + target = e.lhs.function - e = cluster.exprs[0] - function = e.rhs.function - target = e.lhs.function + findex = e.rhs.indices[d] - findex = e.rhs.indices[d] + size = d.symbolic_size + assert is_integer(size) - size = d.symbolic_size - assert is_integer(size) + actions[c].syncs[pd].append( + FetchUpdate(None, target, 0, function, findex, d, size) + ) - actions[cluster].syncs[pd].append( - FetchUpdate(None, target, 0, function, findex, d, size) - ) - def _actions_from_update_memcpy(self, cluster, clusters, prefix, actions): - it = prefix[-1] - d = it.dim - direction = it.direction +def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): + direction = c.ispace[d].direction - # Prepare the data to instantiate a PrefetchUpdate SyncOp - e = cluster.exprs[0] - function = e.rhs.function - target = e.lhs.function + # Prepare the data to instantiate a PrefetchUpdate SyncOp + e = c.exprs[0] + function = e.rhs.function + target = e.lhs.function - fetch = e.rhs.indices[d] - if direction is Forward: - findex = fetch + 1 - else: - findex = fetch - 1 + fetch = e.rhs.indices[d] + if direction is Forward: + findex = fetch + 1 + else: + findex = fetch - 1 - # If fetching into e.g. `ub[sb1]` we'll need to prefetch into e.g. `ub[sb0]` - tindex0 = e.lhs.indices[d] - if is_integer(tindex0) or isinstance(tindex0, IntDiv): - tindex = tindex0 + # If fetching into e.g. `ub[sb1]` we'll need to prefetch into e.g. 
`ub[sb0]` + tindex0 = e.lhs.indices[d] + if is_integer(tindex0) or isinstance(tindex0, IntDiv): + tindex = tindex0 + else: + assert tindex0.is_Modulo + subiters = [i for i in c.sub_iterators[d] if i.parent is tindex0.parent] + osubiters = sorted(subiters, key=lambda i: Vector(i.offset)) + n = osubiters.index(tindex0) + if direction is Forward: + tindex = osubiters[(n + 1) % len(osubiters)] else: - assert tindex0.is_Modulo - subiters = [i for i in cluster.sub_iterators[d] if i.parent is tindex0.parent] - osubiters = sorted(subiters, key=lambda i: Vector(i.offset)) - n = osubiters.index(tindex0) - if direction is Forward: - tindex = osubiters[(n + 1) % len(osubiters)] - else: - tindex = osubiters[(n - 1) % len(osubiters)] - - # We need a lock to synchronize the copy-in - name = self.sregistry.make_name(prefix='lock') - ld = CustomDimension(name='ld', symbolic_size=1, parent=d) - lock = Lock(name=name, dimensions=ld) - handle = lock[0] - - # Turn `cluster` into a prefetch Cluster - expr = uxreplace(e, {tindex0: tindex, fetch: findex}) - - guards = {d: And( - cluster.guards.get(d, True), - GuardBoundNext(function.indices[d], direction), - )} - - syncs = {d: [ReleaseLock(handle, target), - PrefetchUpdate(handle, target, tindex, function, findex, d, 1, - e.rhs)]} - - pcluster = cluster.rebuild(exprs=expr, guards=guards, syncs=syncs) - - # Since we're turning `e` into a prefetch, we need to: - # 1) attach a WaitLock SyncOp to the first Cluster accessing `target` - # 2) insert the prefetch Cluster right after the last Cluster accessing `target` - # 3) drop the original Cluster performing a memcpy-like fetch - n = clusters.index(cluster) - first = None - last = None - for c in clusters[n+1:]: - if target in c.scope.reads: - if first is None: - first = c - last = c - assert first is not None - assert last is not None - actions[first].syncs[d].append(WaitLock(handle, target)) - actions[last].insert.append(pcluster) - actions[cluster].drop = True - - return last, pcluster - - -# Utilities + tindex = osubiters[(n - 1) % len(osubiters)] + + # We need a lock to synchronize the copy-in + name = sregistry.make_name(prefix='lock') + ld = CustomDimension(name='ld', symbolic_size=1, parent=d) + lock = Lock(name=name, dimensions=ld) + handle = lock[0] + + # Turn `c` into a prefetch Cluster `pc` + expr = uxreplace(e, {tindex0: tindex, fetch: findex}) + + guards = {d: And( + c.guards.get(d, True), + GuardBoundNext(function.indices[d], direction), + )} + + syncs = {d: [ + ReleaseLock(handle, target), + PrefetchUpdate(handle, target, tindex, function, findex, d, 1, e.rhs) + ]} + + pc = c.rebuild(exprs=expr, guards=guards, syncs=syncs) + + # Since we're turning `e` into a prefetch, we need to: + # 1) attach a WaitLock SyncOp to the first Cluster accessing `target` + # 2) insert the prefetch Cluster right after the last Cluster accessing `target` + # 3) drop the original Cluster performing a memcpy-like fetch + n = clusters.index(c) + first = None + last = None + for c1 in clusters[n+1:]: + if target in c1.scope.reads: + if first is None: + first = c1 + last = c1 + assert first is not None + assert last is not None + + actions[first].syncs[d].append(WaitLock(handle, target)) + actions[last].insert.append(pc) + actions[c].drop = True + + return last, pc class Actions: From 3027b00d9a00f555e91206baae7dbe5aed8e6b5d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 19 Apr 2024 08:31:31 +0000 Subject: [PATCH 11/58] compiler: Add IterationSpace.innermost --- devito/ir/support/space.py | 4 ++++ 1 file changed, 4 
insertions(+)

diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py
index 87b832ea3f..7c7dd78ddc 100644
--- a/devito/ir/support/space.py
+++ b/devito/ir/support/space.py
@@ -1032,6 +1032,10 @@ def sub_iterators(self):
     def directions(self):
         return self._directions
 
+    @property
+    def innermost(self):
+        return self[-1]
+
     @cached_property
     def itintervals(self):
         return tuple(self[d] for d in self.itdims)

From 0ac23f1acebccee9eb12f5603146241eb231ef89 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Mon, 15 Apr 2024 09:44:45 +0000
Subject: [PATCH 12/58] compiler: Revamp buffering

---
 devito/core/cpu.py                   |   4 +-
 devito/core/gpu.py                   |  77 +--
 devito/ir/support/properties.py      |  11 +
 devito/ir/support/space.py           |  43 +-
 devito/passes/clusters/asynchrony.py |  68 +-
 devito/passes/clusters/buffering.py  | 892 +++++++++++++--------------
 devito/passes/iet/orchestration.py   |   4 +-
 tests/test_buffering.py              |  49 +-
 8 files changed, 541 insertions(+), 607 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 508a71d99d..184d8955a1 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -240,9 +240,9 @@ def _make_clusters_passes_mapper(cls, **kwargs):
 
         # Callback used by `buffering`; it mimics `is_on_device`, which is used
         # on device backends
-        def callback(f):
+        def callback(f, *args):
             if f.is_TimeFunction and f.save is not None:
-                return f.time_dim
+                return f.space_dimensions
             else:
                 return None
 
diff --git a/devito/core/gpu.py b/devito/core/gpu.py
index 28e8f14e84..228849c67e 100644
--- a/devito/core/gpu.py
+++ b/devito/core/gpu.py
@@ -266,15 +266,14 @@ def _make_clusters_passes_mapper(cls, **kwargs):
         platform = kwargs['platform']
         sregistry = kwargs['sregistry']
 
-        stream_key = stream_key_wrap(options)
-        task_key = task_wrap(stream_key)
-        memcpy_key = memcpy_wrap(stream_key, task_key)
+        callback = lambda f: not is_on_device(f, options['gpu-fit'])
+        stream_key = stream_wrap(callback)
 
         return {
-            'buffering': lambda i: buffering(i, stream_key, sregistry, options),
             'blocking': lambda i: blocking(i, sregistry, options),
-            'tasking': lambda i: tasking(i, task_key, sregistry),
-            'streaming': lambda i: memcpy_prefetch(i, memcpy_key, sregistry),
+            'buffering': lambda i: buffering(i, stream_key, sregistry, options),
+            'tasking': lambda i: tasking(i, stream_key, sregistry),
+            'streaming': lambda i: memcpy_prefetch(i, stream_key, sregistry),
             'factorize': factorize,
             'fission': fission,
             'fuse': lambda i: fuse(i, options=options),
@@ -421,65 +420,19 @@ def _make_iet_passes_mapper(cls, **kwargs):
 
 # *** Utils
 
-def stream_key_wrap(options):
-    def func(items):
+def stream_wrap(callback):
+    def stream_key(items, *args):
         """
-        Given one or more Functions `f(d_1, ...d_n)`, return the Dimension `d_i`
-        that is more likely to require data streaming, since the size of
-        `(*, d_{i+1}, ..., d_n)` is unlikely to fit into the device memory.
-
-        If more than one such Dimension is found, a set is returned.
+        Given one or more Functions `f(d_1, ...d_n)`, return the Dimensions
+        `(d_i, ..., d_n)` requiring data streaming.
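        For example, a minimal sketch (editorial illustration, not part of the
        patch; `usave` and the predicate below are hypothetical):

            # Flag TimeFunctions carrying `save=...` as not device-fittable
            callback = lambda f: f.is_TimeFunction and f.save is not None
            stream_key = stream_wrap(callback)

            # For `usave(time, x, y)` with `save=nt`, the key returns the
            # space Dimensions `(x, y)`, i.e. `usave.space_dimensions`
            assert stream_key([usave]) == usave.space_dimensions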
""" - retval = set() - for f in as_tuple(items): - if not is_on_device(f, options['gpu-fit']): - retval.add(f.time_dim) + found = [f for f in as_tuple(items) if callback(f)] + retval = {f.space_dimensions for f in found} if len(retval) > 1: - # E.g., `t_sub0, t_sub1, ..., time` - return retval - try: + raise ValueError("Cannot determine homogenous stream Dimensions") + elif len(retval) == 1: return retval.pop() - except KeyError: + else: return None - return func - -def task_wrap(stream_key): - def func(c): - """ - A Cluster `c` is turned into an "asynchronous task" if it writes to a - Function that cannot be stored in device memory. - - This function returns a Dimension in `c`'s IterationSpace defining the - scope of the asynchronous task, or None if no task is required. - """ - dims = {c.ispace[d].dim for d in as_tuple(stream_key(c.scope.writes))} - if len(dims) > 1: - raise ValueError("Cannot determine a unique task Dimension") - try: - return dims.pop() - except KeyError: - return None - return func - - -def memcpy_wrap(stream_key, task_key): - def func(c): - """ - A Cluster `c` is turned into a "memcpy task" if it reads from a Function - that cannot be stored in device memory. - - This function returns the Dimension in `c`'s IterationSpace within which - the memcpy is to be performed, or None if no memcpy is required. - """ - if task_key(c): - # Writes would take precedence over reads - return None - dims = {c.ispace[d].dim for d in as_tuple(stream_key(c.scope.reads))} - if len(dims) > 1: - raise ValueError("Cannot determine a unique memcpy Dimension") - try: - return dims.pop() - except KeyError: - return None - return func + return stream_key diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index 0cb8cb5231..1181fcd083 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -85,6 +85,8 @@ def __init__(self, name, val=None): is used for iteration spaces that are larger than the data space. 
""" +PREFETCHABLE = Property('prefetchable') + # Bundles PARALLELS = {PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT} @@ -181,6 +183,12 @@ def sequentialize(self, dims=None): m[d] = normalize_properties(set(self.get(d, [])), {SEQUENTIAL}) return Properties(m) + def prefetchable(self, dims): + m = dict(self) + for d in as_tuple(dims): + m[d] = self.get(d, set()) | {PREFETCHABLE} + return Properties(m) + def block(self, dims, kind='default'): if kind == 'default': p = TILABLE @@ -224,6 +232,9 @@ def is_blockable(self, d): def is_blockable_small(self, d): return TILABLE_SMALL in self.get(d, set()) + def is_prefetchable(self, d): + return PREFETCHABLE in self.get(d, set()) + @property def nblockable(self): return sum([self.is_blockable(d) for d in self]) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 7c7dd78ddc..14611664ea 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -455,9 +455,9 @@ def relaxed(self, d=None): intervals = [i.relaxed if i.dim in d else i for i in self] return IntervalGroup(intervals, relations=self.relations, mode=self.mode) - def promote(self, cond): + def promote(self, cond, mode='unordered'): intervals = [i.promote(cond) for i in self] - intervals = IntervalGroup(intervals, relations=None, mode='unordered') + intervals = IntervalGroup(intervals, relations=None, mode=mode) # There could be duplicate Dimensions at this point, so we sum up the # Intervals defined over the same Dimension to produce a well-defined @@ -466,13 +466,17 @@ def promote(self, cond): return intervals - def drop(self, d, strict=False): + def drop(self, maybe_callable, strict=False): + if callable(maybe_callable): + dims = {i.dim for i in self if maybe_callable(i.dim)} + else: + dims = set(as_tuple(maybe_callable)) + # Dropping if strict: - dims = set(as_tuple(d)) intervals = [i._rebuild() for i in self if i.dim not in dims] else: - dims = set().union(*[i._defines for i in as_tuple(d)]) + dims = set().union(*[i._defines for i in dims]) intervals = [i._rebuild() for i in self if not i.dim._defines & dims] # Clean up relations @@ -593,7 +597,7 @@ class IterationInterval(Interval): An Interval associated with metadata. 
""" - def __init__(self, interval, sub_iterators, direction): + def __init__(self, interval, sub_iterators=(), direction=Forward): super().__init__(interval.dim, *interval.offsets, stamp=interval.stamp) self.sub_iterators = sub_iterators self.direction = direction @@ -693,8 +697,15 @@ class DataSpace(Space): def __init__(self, intervals, parts=None): super().__init__(intervals) + # Normalize parts parts = {k: v.expand() for k, v in (parts or {}).items()} - self._parts = frozendict(parts) + #TODO: POLISH THIS + a = {} + for k, v in parts.items(): + dims = set().union(*[d._defines for d in k.dimensions]) + v1 = v.drop(lambda d: d not in dims) + a[k] = v1 + self._parts = frozendict(a) def __eq__(self, other): return (isinstance(other, DataSpace) and @@ -868,12 +879,16 @@ def augment(self, sub_iterators): return IterationSpace(self.intervals, items, self.directions) - def switch(self, d0, d1): + def switch(self, d0, d1, direction=None): intervals = self.intervals.switch(d0, d1) - sub_iterators = {d1 if k is d0 else k: v - for k, v in self.sub_iterators.items()} - directions = {d1 if k is d0 else k: v - for k, v in self.directions.items()} + + sub_iterators = dict(self.sub_iterators) + sub_iterators.pop(d0, None) + sub_iterators[d1] = () + + directions = dict(self.directions) + v = directions.pop(d0, None) + directions[d1] = direction or v return IterationSpace(intervals, sub_iterators, directions) @@ -941,8 +956,8 @@ def relaxed(self, cond): return IterationSpace(intervals, sub_iterators, directions) - def promote(self, cond): - intervals = self.intervals.promote(cond) + def promote(self, cond, mode='unordered'): + intervals = self.intervals.promote(cond, mode=mode) sub_iterators = {i.promote(cond).dim: self.sub_iterators[i.dim] for i in self.intervals} directions = {i.promote(cond).dim: self.directions[i.dim] diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 25c5b08f0b..eef69f3d90 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -13,8 +13,45 @@ __all__ = ['tasking', 'memcpy_prefetch'] +def async_trigger(c, dims): + """ + Return the Dimension in `c`'s IterationSpace that triggers the + asynchronous execution of `c`. + """ + if not dims: + return None + else: + key = lambda d: not d._defines.intersection(dims) + ispace = c.ispace.project(key) + return ispace.innermost.dim + + +def keys(key0): + """ + Callbacks for `tasking` and `memcpy_prefetch` given a user-defined `key`. + """ + + def task_key(c): + return async_trigger(c, key0(c.scope.writes)) + + def memcpy_key(c): + if task_key(c): + # Writes would take precedence over reads + return None + else: + return async_trigger(c, key0(c.scope.reads)) + + return task_key, memcpy_key + + @timed_pass(name='tasking') -def tasking(clusters, key, sregistry): +def tasking(clusters, key0, sregistry): + """ + Turn a Cluster `c` into an asynchronous task if `c` writes to a Function `f` + such that `key0(f) = (d_i, ..., d_n)`, where `(d_i, ..., d_n)` are the + task Dimensions. + """ + key, _ = keys(key0) candidates = as_mapper(clusters, key) candidates.pop(None, None) @@ -125,7 +162,12 @@ def tasking(clusters, key, sregistry): @timed_pass(name='memcpy_prefetch') -def memcpy_prefetch(clusters, key, sregistry): +def memcpy_prefetch(clusters, key0, sregistry): + """ + A special form of `tasking` for optimized, asynchronous memcpy-like operations. + Unlike `tasking`, `key0` is here applied to the reads of a given Cluster. 
+ """ + _, key = keys(key0) actions = defaultdict(Actions) for c in clusters: @@ -133,14 +175,14 @@ def memcpy_prefetch(clusters, key, sregistry): if d is None: continue - if d.is_Custom and is_integer(c.ispace[d].size): - if wraps_memcpy(c): - _actions_from_init(c, d, actions) - else: - raise NotImplementedError + if not wraps_memcpy(c): + continue - elif wraps_memcpy(c): + if c.properties.is_prefetchable(d): _actions_from_update_memcpy(c, d, clusters, actions, sregistry) + elif d.is_Custom and is_integer(c.ispace[d].size): + _actions_from_init(c, d, actions) + # Attach the computed Actions processed = [] @@ -162,22 +204,16 @@ def memcpy_prefetch(clusters, key, sregistry): def _actions_from_init(c, d, actions): - i = c.ispace.index(d) - if i > 0: - pd = c.ispace[i].dim - else: - pd = None - e = c.exprs[0] function = e.rhs.function target = e.lhs.function findex = e.rhs.indices[d] - size = d.symbolic_size + size = target.shape[d] assert is_integer(size) - actions[c].syncs[pd].append( + actions[c].syncs[None].append( FetchUpdate(None, target, 0, function, findex, d, size) ) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 81a347323c..eb3de7dba9 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -1,25 +1,26 @@ -from collections import OrderedDict, defaultdict, namedtuple -from itertools import combinations +from collections import defaultdict, namedtuple from functools import cached_property +from itertools import chain +from sympy import S import numpy as np -from devito.ir import (Cluster, Forward, GuardBound, Interval, IntervalGroup, - IterationSpace, AFFINE, PARALLEL, Queue, Vector, - lower_exprs, vmax, vmin) +from devito.ir import (Cluster, Backward, Forward, GuardBound, Interval, + IntervalGroup, Properties, Queue, Vector, lower_exprs, + vmax, vmin) from devito.exceptions import InvalidOperator from devito.logger import warning from devito.passes.clusters.utils import is_memcpy -from devito.symbolics import IntDiv, retrieve_function_carriers, uxreplace -from devito.tools import (Bunch, DefaultOrderedDict, Stamp, as_tuple, - filter_ordered, flatten, is_integer, timed_pass) -from devito.types import Array, CustomDimension, Dimension, Eq, ModuloDimension +from devito.symbolics import IntDiv, retrieve_functions, uxreplace +from devito.tools import (Stamp, as_mapper, as_tuple, filter_ordered, frozendict, + flatten, is_integer, timed_pass) +from devito.types import Array, CustomDimension, Eq, ModuloDimension __all__ = ['buffering'] @timed_pass() -def buffering(clusters, callback, sregistry, options, **kwargs): +def buffering(clusters, key, sregistry, options, **kwargs): """ Replace written Functions with Arrays. This gives the compiler more control over storage layout, data movement (e.g. between host and device), etc. @@ -28,16 +29,9 @@ def buffering(clusters, callback, sregistry, options, **kwargs): ---------- cluster : list of Cluster Input Clusters, subject of the optimization pass. - callback : callable, optional - A mechanism to express what the buffering candidates are, and what - Dimensions, if any, should be replaced by ModuloDimensions, such that - the buffer has a smaller footprint than that of the Function it stems - from. The callable takes a Function as input and returns either None - or a list. If the output is None, then the input is not a buffering - candidate. Otherwise, the output is a buffering candidate and the list - contains the Dimensions to be replaced by new ModuloDimensions. 
If - unspecified, by default all DiscreteFunctions are turned into buffers, - but no Dimension replacement occurs. + key : callable, optional + A callback that takes a Function as input and returns the Dimensions + that should be buffered. sregistry : SymbolRegistry The symbol registry, to create unique names for buffers and Dimensions. options : dict @@ -52,15 +46,7 @@ def buffering(clusters, callback, sregistry, options, **kwargs): implemented by other passes). **kwargs Additional compilation options. - Accepted: ['opt_init_onread', 'opt_init_onwrite', 'opt_buffer']. - * 'opt_init_onread': By default, a read buffer always triggers the - generation of an initializing Cluster (see example below). When the - size of the buffer is 1, the step-through Cluster might suffice, however. - In such a case, and with `opt_init_onread=False`, the initalizing - Cluster is omitted. This creates an implicit contract between the - caller and the buffering pass, as the step-through Cluster cannot be - further transformed or the buffer might never be initialized with the - content of the buffered Function. + Accepted: ['opt_init_onwrite', 'opt_buffer']. * 'opt_init_onwrite': By default, a written buffer does not trigger the generation of an initializing Cluster. With `opt_init_onwrite=True`, instead, the buffer gets initialized to zero. @@ -94,19 +80,14 @@ def buffering(clusters, callback, sregistry, options, **kwargs): Eq(ub[d+1, x], ub[d, x] + ub[d-1, x] + 1) Eq(u[time+1, x], ub[d+1, x]) """ - if callback is None: - def callback(f): + if key is None: + def key(f): if f.is_DiscreteFunction: return [] else: return None - assert callable(callback) + assert callable(key) - v0 = kwargs.get('opt_init_onread', True) - if callable(v0): - init_onread = v0 - else: - init_onread = lambda f: v0 v1 = kwargs.get('opt_init_onwrite', False) if callable(v1): init_onwrite = v1 @@ -115,7 +96,6 @@ def callback(f): options = dict(options) options.update({ - 'buf-init-onread': init_onread, 'buf-init-onwrite': init_onwrite, 'buf-callback': kwargs.get('opt_buffer'), }) @@ -124,175 +104,158 @@ def callback(f): if options['buf-async-degree'] == 0: return clusters - return Buffering(callback, sregistry, options).process(clusters) + # First we generate all the necessary buffers + mapper = generate_buffers(clusters, key, sregistry, options) + + # Then we inject them into the Clusters. 
This involves creating the + # initializing Clusters, and replacing the buffered Functions with the buffers + clusters = InjectBuffers(mapper, sregistry, options).process(clusters) + + return clusters -class Buffering(Queue): +class InjectBuffers(Queue): - def __init__(self, callback0, sregistry, options): + # NOTE: We need to use a Queue because with multi-buffering we will need + # to process the `db0` and `time` Dimensions separately for `last_idx` and + # `first_idx` to work correctly + + def __init__(self, mapper, sregistry, options): super().__init__() - self.callback0 = callback0 + self.mapper = mapper self.sregistry = sregistry self.options = options def process(self, clusters): - return self._process_fatd(clusters, 1, cache={}) + return self._process_fatd(clusters, 1) - def callback(self, clusters, prefix, cache=None): + def callback(self, clusters, prefix): if not prefix: return clusters - d = prefix[-1].dim - try: - pd = prefix[-2].dim - except IndexError: - pd = None - - # Locate all Function accesses within the provided `clusters` - accessmap = AccessMapper(clusters) - - init_onread = self.options['buf-init-onread'] - init_onwrite = self.options['buf-init-onwrite'] - - # Create and initialize buffers - init = [] - buffers = [] - for f, accessv in accessmap.items(): - # Is `f` really a buffering candidate? - dim = self.callback0(f) - if dim is None or not d._defines & dim._defines: - continue - - b = Buffer(f, dim, d, accessv, cache, self.options, self.sregistry) - buffers.append(b) - - guards = {} - if b.is_read or not b.has_uniform_subdims: - # Special case: avoid initialization if not strictly necessary - # See docstring for more info about what this implies - if b.size == 1 and not init_onread(b.function): - continue - - # Special case: avoid initialization in the case of double - # (or multiple levels of) buffering because it will have been - # already performed - if b.size > 1 and b.multi_buffering: - continue - - dims = b.function.dimensions - lhs = b.indexed[dims]._subs(dim, b.firstidx.b) - rhs = b.function[dims]._subs(dim, b.firstidx.f) - - guards[b.xd] = GuardBound(0, b.firstidx.f) + key = lambda f, *args: f in self.mapper + bfmap = map_buffered_functions(clusters, key) - elif b.is_write and init_onwrite(b.function): - dims = b.buffer.dimensions - lhs = b.buffer.indexify() - rhs = 0 + # A BufferDescriptor is a simple data structure storing additional + # information about a buffer, harvested from the subset of `clusters` + # that access it + descriptors = {b: BufferDescriptor(f, b, bfmap[f]) + for f, b in self.mapper.items() + if f in bfmap} - else: - # NOTE: a buffer must be initialized only if the buffered - # Function is read in at least one place or in the case of - # non-uniform SubDimensions, to avoid uninitialized values to - # be copied-back into the buffered Function - continue + # Are we inside the right `d`? 
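        # (Editorial note, not part of the patch) E.g., for a buffer
        # `ub(db0, x, y)` replacing `usave(time, x, y)`, `v.itdims` is
        # `(db0, time)`, so the descriptor survives the filter below only
        # while the Queue is processing the `time` (or `db0`) prefix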
+ descriptors = {b: v for b, v in descriptors.items() if d in v.itdims} - expr = lower_exprs(Eq(lhs, rhs)) - ispace = b.writeto - guards[pd] = GuardBound(dim.root.symbolic_min, dim.root.symbolic_max) - properties = {d: {AFFINE, PARALLEL} for d in ispace.itdims} + if not descriptors: + return clusters - init.append(Cluster(expr, ispace, guards=guards, properties=properties)) + # Then we create the initializing Clusters when necessary + init = init_buffers(descriptors, self.options) - if not buffers: - return clusters + # Create all the necessary ModuloDimensions to step through the buffer + mds = make_mds(descriptors, self.sregistry) + subiters = as_mapper(mds.values(), lambda i: i.root) # Substitution rules to replace buffered Functions with buffers + # E.g., `usave[time+1, x+1, y+1] -> ub0[sb1, x+1, y+1]` subs = {} - for b in buffers: - for a in b.accessv.accesses: - subs[a] = b.indexed[[b.index_mapper.get(i, i) for i in a.indices]] + for b, v in descriptors.items(): + accesses = chain(*[c.scope[v.f] for c in v.clusters]) + index_mapper = {i: mds[(v.xd, i)] for i in v.indices} + for a in accesses: + subs[a.access] = b.indexed[[index_mapper.get(i, i) for i in a]] processed = [] for c in clusters: # If a buffer is read but never written, then we need to add # an Eq to step through the next slot # E.g., `ub[0, x] = usave[time+2, x]` - for b in buffers: - if not b.is_readonly: + for b, v in descriptors.items(): + if not v.is_readonly: continue - try: - c.exprs.index(b.firstread) - except ValueError: + if c is not v.firstread: continue - dims = b.function.dimensions - lhs = b.indexed[dims]._subs(b.dim, b.lastidx.b) - rhs = b.function[dims]._subs(b.dim, b.lastidx.f) + idxf = v.last_idx + idxb = mds[(v.xd, idxf)] + + lhs = v.b.indexify()._subs(v.xd, idxb) + rhs = v.f.indexify()._subs(v.dim, idxf) - expr = lower_exprs(Eq(lhs, rhs)) - ispace = b.readfrom - try: - guards = c.guards.xandg(b.xd, GuardBound(0, b.firstidx.f)) - except KeyError: + expr = Eq(lhs, rhs) + expr = lower_exprs(expr) + + ispace = v.step_to + + if v.xd in ispace.itdims: + guards = c.guards.xandg(v.xd, GuardBound(0, v.first_idx.f)) + else: guards = c.guards + properties = c.properties.sequentialize(d) + if not isinstance(d, BufferDimension): + properties = properties.prefetchable(d) + + syncs = c.syncs - processed.append( - c.rebuild(exprs=expr, ispace=ispace, - guards=guards, properties=properties) - ) + processed.append(Cluster(expr, ispace, guards, properties, syncs)) - # Substitute buffered Functions with the newly created buffers + # Substitute the buffered Functions with the buffers exprs = [uxreplace(e, subs) for e in c.exprs] - ispace = c.ispace - for b in buffers: - ispace = ispace.augment(b.sub_iterators) + ispace = c.ispace.augment(subiters) properties = c.properties.sequentialize(d) processed.append( c.rebuild(exprs=exprs, ispace=ispace, properties=properties) ) - # Also append the copy-back if `e` is the last-write of some buffers - # E.g., `usave[time + 1, x] = ub[sb1, x]` - for b in buffers: - if b.is_readonly: + # Append the copy-back if `c` is the last-write of some buffers + # E.g., `usave[time+1, x] = ub[sb1, x]` + for b, v in descriptors.items(): + if v.is_readonly: continue - try: - c.exprs.index(b.lastwrite) - except ValueError: + if c is not v.lastwrite: continue - dims = b.function.dimensions - lhs = b.function[dims]._subs(b.dim, b.lastidx.f) - rhs = b.indexed[dims]._subs(b.dim, b.lastidx.b) + idxf = v.last_idx + idxb = mds[(v.xd, idxf)] - expr = lower_exprs(uxreplace(Eq(lhs, rhs), b.subdims_mapper)) - 
ispace = b.written - try: - guards = c.guards.xandg(b.xd, GuardBound(0, b.firstidx.f)) - except KeyError: + lhs = v.f.indexify()._subs(v.dim, idxf) + rhs = v.b.indexify()._subs(v.xd, idxb) + + expr = Eq(lhs, rhs) + expr = lower_exprs(uxreplace(expr, v.subdims_mapper)) + + ispace = v.written + + if v.xd in ispace.itdims: + guards = c.guards.xandg(v.xd, GuardBound(0, v.first_idx.f)) + else: guards = c.guards + properties = c.properties.sequentialize(d) - processed.append( - c.rebuild(exprs=expr, ispace=ispace, - guards=guards, properties=properties) - ) + syncs = c.syncs + + processed.append(Cluster(expr, ispace, guards, properties, syncs)) # Lift {write,read}-only buffers into separate IterationSpaces if self.options['fuse-tasks']: return init + processed - for b in buffers: - if b.is_writeonly: - # `b` might be written by multiple, potentially mutually-exclusive, - # equations. For example, two equations that have or will have - # complementary guards, hence only one will be executed. In such a - # case, we can split the equations over separate IterationSpaces + else: + return init + self._optimize(processed, descriptors) + + def _optimize(self, clusters, descriptors): + for b, v in descriptors.items(): + if v.is_writeonly: + # `b` might be written by multiple, potentially mutually + # exclusive, equations. For example, two equations that have or + # will have complementary guards, hence only one will be + # executed. In such a case, we can split the equations over + # separate IterationSpaces key0 = lambda: Stamp() - elif b.is_readonly: + elif v.is_readonly: # `b` is read multiple times -- this could just be the case of # coupled equations, so we more cautiously perform a # "buffer-wise" splitting of the IterationSpaces (i.e., only @@ -302,224 +265,173 @@ def callback(self, clusters, prefix, cache=None): else: continue - processed1 = [] - for c in processed: - if b.buffer in c.functions: - key1 = lambda d: d not in b.dim._defines - dims = c.ispace.project(key1).itdims - ispace = c.ispace.lift(dims, key0()) - processed1.append(c.rebuild(ispace=ispace)) - else: - processed1.append(c) - processed = processed1 + processed = [] + for c in clusters: + if b not in c.functions: + processed.append(c) + continue - return init + processed + key1 = lambda d: not d._defines & v.dim._defines + dims = c.ispace.project(key1).itdims + ispace = c.ispace.lift(dims, key0()) + processed.append(c.rebuild(ispace=ispace)) + clusters = processed -class Buffer: + return clusters - """ - A buffer with metadata attached. - Parameters - ---------- - function : DiscreteFunction - The object for which the buffer is created. - dim : Dimension - The Dimension in `function` to be replaced with a ModuloDimension. - d : Dimension - The iteration Dimension from which `function` was extracted. - accessv : AccessValue - All accesses involving `function`. - cache : dict - Mapper between buffered Functions and previously created Buffers. - options : dict, optional - The compilation options. See `buffering.__doc__`. - sregistry : SymbolRegistry - The symbol registry, to create unique names for buffers and Dimensions. 
- """ +Map = namedtuple('Map', 'b f') - def __init__(self, function, dim, d, accessv, cache, options, sregistry): - # Parse compilation options - async_degree = options['buf-async-degree'] - callback = options['buf-callback'] - - self.function = function - self.dim = dim - self.d = d - self.accessv = accessv - - self.index_mapper = {} - self.sub_iterators = defaultdict(list) - self.subdims_mapper = DefaultOrderedDict(set) - - # Initialize the buffer metadata. Depending on whether it's multi-level - # buffering (e.g., double buffering) or first-level, we need to perform - # different actions. Multi-level is trivial, because it essentially - # inherits metadata from the previous buffering level - if self.multi_buffering: - self.__init_multi_buffering__() - else: - self.__init_firstlevel_buffering__(async_degree, sregistry) - - # Track the SubDimensions used to index into `function` - for e in accessv.mapper: - m = {i.root: i for i in e.free_symbols - if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)} - for d0, d1 in m.items(): - self.subdims_mapper[d0].add(d1) - if any(len(v) > 1 for v in self.subdims_mapper.values()): - # Non-uniform SubDimensions. At this point we're going to raise - # an exception. It's either illegal or still unsupported - for v in self.subdims_mapper.values(): - for d0, d1 in combinations(v, 2): - if d0.overlap(d1): - raise InvalidOperator( - "Cannot apply `buffering` to `%s` as it is accessed " - "over the overlapping SubDimensions `<%s, %s>`" % - (function, d0, d1)) - raise NotImplementedError("`buffering` does not support multiple " - "non-overlapping SubDimensions yet.") + +class BufferDimension(CustomDimension): + pass + + +def generate_buffers(clusters, key, sregistry, options, **kwargs): + async_degree = options['buf-async-degree'] + callback = options['buf-callback'] + + # {candidate buffered Function -> [Clusters that access it]} + bfmap = map_buffered_functions(clusters, key) + + # {buffered Function -> Buffer} + xds = {} + mapper = {} + for f, clusters in bfmap.items(): + exprs = flatten(c.exprs for c in clusters) + + bdims = key(f, exprs) + + dims = [d for d in f.dimensions if d not in bdims] + if len(dims) != 1: + raise InvalidOperator("Unsupported multi-dimensional `buffering` " + "required by `%s`" % f) + dim = dims.pop() + + if is_buffering(exprs): + # Multi-level buffering + # NOTE: a bit rudimentary (we could go through the exprs one by one + # instead), but it's much shorter this way + buffers = [f for f in retrieve_functions(exprs) if f.is_Array] + assert len(buffers) == 1, "Unexpected form of multi-level buffering" + buffer, = buffers + xd = buffer.indices[dim] else: - self.subdims_mapper = {d0: v.pop() - for d0, v in self.subdims_mapper.items()} - - # Build and sanity-check the buffer IterationIntervals - self.itintervals_mapper = {} - for e in accessv.mapper: - for i in e.ispace.itintervals: - v = self.itintervals_mapper.setdefault(i.dim, i.args) - if v != self.itintervals_mapper[i.dim]: - raise NotImplementedError( - "Cannot apply `buffering` as the buffered function `%s` is " - "accessed over multiple, non-compatible iteration spaces " - "along the Dimension `%s`" % (function.name, i.dim)) - # Also add IterationIntervals for initialization along `x`, should `xi` be - # the only written Dimension in the `x` hierarchy - for d0, (interval, _, _) in list(self.itintervals_mapper.items()): - for i in d0._defines: - self.itintervals_mapper.setdefault(i, (interval.relaxed, (), Forward)) + size = infer_buffer_size(f, dim, clusters) + + if 
async_degree is not None: + if async_degree < size: + warning("Ignoring provided asynchronous degree as it'd be " + "too small for the required buffer (provided %d, " + "but need at least %d for `%s`)" + % (async_degree, size, f.name)) + else: + size = async_degree + + # A special CustomDimension to use in place of `dim` in the buffer + try: + xd = xds[(dim, size)] + except KeyError: + name = sregistry.make_name(prefix='db') + xd = xds[(dim, size)] = BufferDimension(name, 0, size-1, size, dim) # The buffer dimensions - dims = list(function.dimensions) - assert dim in function.dimensions - dims[dims.index(dim)] = self.xd + dimensions = list(f.dimensions) + assert dim in f.dimensions + dimensions[dimensions.index(dim)] = xd # Finally create the actual buffer - if function in cache: - self.buffer = cache[function] - else: - kwargs = { - 'name': sregistry.make_name(prefix='%sb' % function.name), - 'dimensions': dims, - 'dtype': function.dtype, - 'grid': function.grid, - 'halo': function.halo, - 'space': 'mapped', - 'mapped': function - } - try: - self.buffer = cache[function] = callback(function, **kwargs) - except TypeError: - self.buffer = cache[function] = Array(**kwargs) + cls = callback or Array + name = sregistry.make_name(prefix='%sb' % f.name) + mapper[f] = cls(name=name, dimensions=dimensions, dtype=f.dtype, + grid=f.grid, halo=f.halo, space='mapped', mapped=f, f=f) - def __init_multi_buffering__(self): - try: - expr, = self.accessv.exprs - except ValueError: - assert False + return mapper - lhs, rhs = expr.args - maybe_xd = lhs.function.indices[self.dim] - if not isinstance(maybe_xd, CustomDimension): - maybe_xd = rhs.function.indices[self.dim] - assert isinstance(maybe_xd, CustomDimension) - self.xd = maybe_xd +def map_buffered_functions(clusters, key): + """ + Map each candidate Function to the Clusters that access it. 
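    For example, a sketch with two hypothetical candidates `usave` and `vsave`:

        bfmap == {usave: [c0, c2], vsave: [c1]}

    where `c0`, `c1`, `c2` are the Clusters whose expressions access them.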
+ """ + bfmap = defaultdict(list) + for c in clusters: + for f in c.functions: + if key(f, c.exprs): + bfmap[f].append(c) - idx0 = lhs.indices[self.dim] - idx1 = rhs.indices[self.dim] + return frozendict(bfmap) - if self.is_read: - if is_integer(idx0) or isinstance(idx0, ModuloDimension): - # This is just for aesthetics of the generated code - self.index_mapper[idx1] = 0 - else: - self.index_mapper[idx1] = idx1 - else: - self.index_mapper[idx0] = idx1 - def __init_firstlevel_buffering__(self, async_degree, sregistry): - d = self.d - dim = self.dim - function = self.function +class BufferDescriptor: - indices = filter_ordered(i.indices[dim] for i in self.accessv.accesses) - slots = [i.subs({dim: 0, dim.spacing: 1}) for i in indices] + def __init__(self, f, b, clusters): + self.f = f + self.b = b + self.clusters = clusters - try: - size = max(slots) - min(slots) + 1 - except TypeError: - # E.g., special case `slots=[-1 + time/factor, 2 + time/factor]` - # Resort to the fast vector-based comparison machinery (rather than - # the slower sympy.simplify) - slots = [Vector(i) for i in slots] - size = int((vmax(*slots) - vmin(*slots) + 1)[0]) - - if async_degree is not None: - if async_degree < size: - warning("Ignoring provided asynchronous degree as it'd be " - "too small for the required buffer (provided %d, " - "but need at least %d for `%s`)" - % (async_degree, size, function.name)) - else: - size = async_degree + self.xd, = b.find(BufferDimension) + self.bdims = tuple(d for d in b.dimensions if d is not self.xd) - # Create `xd` -- a contraction Dimension for `dim` - try: - xd = sregistry.get('xds', (dim, size)) - except KeyError: - name = sregistry.make_name(prefix='db') - v = CustomDimension(name, 0, size-1, size, dim) - xd = sregistry.setdefault('xds', (dim, size), v) - self.xd = xd - - # Create the ModuloDimensions to step through the buffer - if size > 1: - # Note: indices are sorted so that the semantic order (sb0, sb1, sb2) - # follows SymPy's index ordering (time, time-1, time+1) after modulo - # replacement, so that associativity errors are consistent. 
This very - # same strategy is also applied in clusters/algorithms/Stepper - p, _ = offset_from_centre(d, indices) - indices = sorted(indices, - key=lambda i: -np.inf if i - p == 0 else (i - p)) - for i in indices: - try: - md = sregistry.get('mds', (xd, i)) - except KeyError: - name = sregistry.make_name(prefix='sb') - v = ModuloDimension(name, xd, i, size) - md = sregistry.setdefault('mds', (xd, i), v) - self.index_mapper[i] = md - self.sub_iterators[d.root].append(md) - else: - assert len(indices) == 1 - self.index_mapper[indices[0]] = 0 + self.dim = f.indices[self.xd] + + self.indices = extract_indices(f, self.dim, clusters) + + # The IterationSpace within which the buffer will be accessed + # NOTE: The `key` is to avoid Clusters including `f` but not directly + # using it in an expression, such as HaloTouch Clusters + key = lambda c: any(i in c.ispace.dimensions for i in self.bdims) + ispaces = {c.ispace for c in clusters if key(c)} + + if len(ispaces) > 1: + # Best effort to make buffering work in the presence of multiple + # IterationSpaces + stamp = Stamp() + ispaces = {i.lift(self.bdims, v=stamp) for i in ispaces} + + if len(ispaces) > 1: + raise InvalidOperator("Unsupported `buffering` over different " + "IterationSpaces") + + assert len(ispaces) == 1, "Unexpected form of `buffering`" + self.ispace = ispaces.pop() def __repr__(self): - return "Buffer[%s,<%s>]" % (self.buffer.name, self.xd) + return "Descriptor[%s -> %s]" % (self.f, self.b) @property def size(self): return self.xd.symbolic_size @property - def firstread(self): - return self.accessv.firstread + def exprs(self): + return flatten(c.exprs for c in self.clusters) @property + def itdims(self): + """ + The Dimensions defining an IterationSpace in which the buffer + may be accessed. + """ + return (self.xd, self.dim.root) + + @cached_property + def subdims_mapper(self): + return {d.root: d for d in self.ispace.itdims if d.is_AbstractSub} + + @cached_property + def firstread(self): + for c in self.clusters: + if c.scope.reads.get(self.f): + return c + return None + + @cached_property def lastwrite(self): - return self.accessv.lastwrite + for c in reversed(self.clusters): + if c.scope.writes.get(self.f): + return c + return None @property def is_read(self): @@ -538,46 +450,49 @@ def is_writeonly(self): return self.lastwrite is not None and self.firstread is None @property - def has_uniform_subdims(self): - return self.subdims_mapper is not None + def is_forward_buffering(self): + return self.ispace[self.dim].direction is Forward @property - def multi_buffering(self): + def is_double_buffering(self): + return is_buffering(self.exprs) + + @cached_property + def write_to(self): """ - True if double-buffering or more, False otherwise. + The `write_to` IterationSpace, that is the iteration space that must be + iterated over in order to initialize the buffer. 
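    For example (sketch): for a buffer `ub(db0, x, y)` with a one-point halo
    on each side, `write_to` spans all of `db0` plus the halo-extended `x`
    and `y` Intervals, since in principle any halo point may be read through
    a stencil.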
""" - return all(is_memcpy(e) for e in self.accessv.exprs) + ispace = self.ispace.switch(self.dim.root, self.xd, Forward) - @cached_property - def indexed(self): - return self.buffer.indexed + # We need everything, not just the SubDomain, as all points in principle + # might be accessed through a stencil + ispace = ispace.promote(lambda d: d.is_AbstractSub, mode='total') + + # Analogous to the above, we need to include the halo region as well + ihalo = IntervalGroup([ + Interval(d, -h.left, h.right) + for d, h in zip(self.b.dimensions, self.b._size_halo) + ]) + ispace = ispace.add(ihalo) + + return ispace @cached_property - def writeto(self): + def step_to(self): """ - The `writeto` IterationSpace, that is the iteration space that must be - iterated over in order to initialize the buffer. + The `step_to` IterationSpace, that is the iteration space that must be + iterated over to update the buffer with the buffered Function values. """ - intervals = [] - sub_iterators = {} - directions = {} - for d, h in zip(self.buffer.dimensions, self.buffer._size_halo): - try: - i, si, direction = self.itintervals_mapper[d] - # The initialization must comprise the halo region as well, since - # in principle this could be accessed through a stencil - interval = Interval(i.dim, -h.left, h.right, i.stamp) - except KeyError: - assert d in self.xd._defines - interval, si, direction = Interval(d), (), Forward - intervals.append(interval) - sub_iterators[d] = si - directions[d] = direction + # May be `db0` (e.g., for double buffering) or `time` + dim = self.ispace[self.dim].dim - relations = (self.buffer.dimensions,) - intervals = IntervalGroup(intervals, relations=relations) + if self.is_forward_buffering: + direction = Forward + else: + direction = Backward - return IterationSpace(intervals, sub_iterators, directions) + return self.write_to.switch(self.xd, dim, direction) @cached_property def written(self): @@ -585,142 +500,173 @@ def written(self): The `written` IterationSpace, that is the iteration space that must be iterated over in order to read all of the written buffer values. """ - intervals = [] - sub_iterators = {} - directions = {} - for dd in self.buffer.dimensions: - d = dd.xreplace(self.subdims_mapper) - try: - interval, si, direction = self.itintervals_mapper[d] - except KeyError: - # E.g., d=time_sub - assert d.is_NonlinearDerived or d.is_Custom - d = d.root - interval, si, direction = self.itintervals_mapper[d] - intervals.append(interval) - sub_iterators[d] = si + as_tuple(self.sub_iterators[d]) - directions[d] = direction - - directions[d.root] = direction - - relations = (tuple(i.dim for i in intervals),) - intervals = IntervalGroup(intervals, relations=relations) - - return IterationSpace(intervals, sub_iterators, directions) + return self.ispace @cached_property - def readfrom(self): - """ - The `readfrom` IterationSpace, that is the iteration space that must be - iterated over to update the buffer with the buffered Function values. + def last_idx(self): """ - ispace0 = self.written.project(lambda d: d in self.xd._defines) - ispace1 = self.writeto.project(lambda d: d not in self.xd._defines) + The "last index" is the index that corresponds to the last slot that + must be read from the buffered Function. 
- extra = (ispace0.itdims + ispace1.itdims,) - ispace = IterationSpace.union(ispace0, ispace1, relations=extra) + Examples + -------- - return ispace + * `time+1` in the case of `foo(u[time-1], u[time], u[time+1])` + with a forward-propagating `time` Dimension; + * `time-1` in the case of `foo(u[time-1], u[time], u[time+1])` + with a backwards-propagating `time` Dimension. + """ + func = vmax if self.is_forward_buffering else vmin + return func(*[Vector(i) for i in self.indices])[0] @cached_property - def lastidx(self): - """ - A 2-tuple of indices representing, respectively, the *last* write to the - buffer and the *last* read from the buffered Function. For example, - `(sb1, time+1)` in the case of a forward-propagating `time` Dimension. + def first_idx(self): """ - try: - func = max if self.written[self.d].direction is Forward else min - v = func(self.index_mapper) - except TypeError: - func = vmax if self.written[self.d].direction is Forward else vmin - v = func(*[Vector(i) for i in self.index_mapper])[0] + The two "first indices", respectively for the buffer and the buffered + Function. - return Map(self.index_mapper[v], v) + A "first index" is the index that corresponds to the first slot in the + buffer that must be initialized, or the first slot that must be read from + the buffered Function. - @cached_property - def firstidx(self): - """ - A 2-tuple of indices representing the min points for buffer initialization. - For example, in the case of a forward-propagating `time` Dimension, we - could have `((time_m + db0) % 2, (time_m + db0))`; likewise, for - backwards, `((time_M - 2 + db0) % 4, time_M - 2 + db0)`. + Examples + -------- + + * `((time_m + db0) % 2, time_m + db0)` in the case of a + forward-propagating `time` Dimension; + * `((time_M - 2 + db0) % 4, time_M - 2 + db0)` in the case of a + backwards-propagating `time` Dimension. """ - # The buffer is initialized at `d_m(d_M) - offset`. E.g., a buffer with - # six slots, used to replace a buffered Function accessed at `d-3`, `d` - # and `d + 2`, will have `offset = 3` - p, offset = offset_from_centre(self.dim, list(self.index_mapper)) + d = self.dim - if self.written[self.dim].direction is Forward: - v = p._subs(self.dim.root, self.dim.root.symbolic_min) - offset + self.xd + if self.is_double_buffering: + assert len(self.indices) == 1 + v, = self.indices else: - v = p._subs(self.dim.root, self.dim.root.symbolic_max) - offset + self.xd + # The buffer is initialized at `d_m(d_M) - offset`. E.g., a buffer with + # six slots, used to replace a buffered Function accessed at `d-3`, `d` + # and `d + 2`, will have `offset = 3` + p, offset = offset_from_centre(d, self.indices) - return Map(v % self.xd.symbolic_size, v) + if self.is_forward_buffering: + v = p._subs(d.root, d.root.symbolic_min) - offset + self.xd + else: + v = p._subs(d.root, d.root.symbolic_max) - offset + self.xd + return Map(v % self.xd.symbolic_size, v) -class AccessValue: +def make_mds(descriptors, sregistry): + """ + Create the ModuloDimensions to step through the buffers. This is done + inspecting all buffers so that ModuloDimensions are reused when possible. """ - A simple data structure tracking the accesses performed by a given Function - in a sequence of Clusters. + mds = defaultdict(int) + for v in descriptors.values(): + size = v.xd.symbolic_size - Parameters - ---------- - function : Function - The target Function. - mapper : dict - A mapper from expressions to Indexeds, representing all accesses to - `function` in a sequence of expressions. 
+ if size == 1: + continue + + p, _ = offset_from_centre(v.dim, v.indices) + + # NOTE: indices are sorted so that the semantic order (sb0, sb1, sb2) + # follows SymPy's index ordering (time, time-1, time+1) after modulo + # replacement, so that associativity errors are consistent. This very + # same strategy is also applied in clusters/algorithms/Stepper + key = lambda i: -np.inf if i - p == 0 else (i - p) + indices = sorted(v.indices, key=key) + + for i in indices: + k = (v.xd, i) + if k not in mds: + name = sregistry.make_name(prefix='sb') + mds[k] = ModuloDimension(name, v.xd, i, size) + + return mds + + +def init_buffers(descriptors, options): + """ + Create the initializing Clusters for the given buffers. """ + init_onwrite = options['buf-init-onwrite'] - def __init__(self, function, mapper): - self.function = function - self.mapper = mapper + init = [] + for b, v in descriptors.items(): + f = v.f - @cached_property - def exprs(self): - return tuple(self.mapper) + if v.is_read: + # Special case: avoid initialization in the case of double (or + # multiple) buffering because it's completely unnecessary + if v.is_double_buffering: + continue - @cached_property - def accesses(self): - return tuple(flatten(as_tuple(i.reads) + as_tuple(i.write) - for i in self.mapper.values())) + lhs = b.indexify()._subs(v.xd, v.first_idx.b) + rhs = f.indexify()._subs(v.dim, v.first_idx.f) - @cached_property - def is_read(self): - return any(av.reads for av in self.mapper.values()) + elif v.is_write and init_onwrite(f): + lhs = b.indexify() + rhs = S.Zero - @cached_property - def firstread(self): - for e, av in self.mapper.items(): - if av.reads: - return e - return None + else: + continue - @cached_property - def lastwrite(self): - for e, av in reversed(self.mapper.items()): - if av.write is not None: - return e - return None + expr = Eq(lhs, rhs) + expr = lower_exprs(expr) + ispace = v.write_to -AccessTuple = lambda: Bunch(reads=[], write=None) -Map = namedtuple('Map', 'b f') + guards = {} + guards[None] = GuardBound(v.dim.root.symbolic_min, v.dim.root.symbolic_max) + if v.is_read: + guards[v.xd] = GuardBound(0, v.first_idx.f) + properties = Properties() + properties = properties.affine(ispace.itdims) + properties = properties.parallelize(ispace.itdims) -class AccessMapper(OrderedDict): + init.append(Cluster(expr, ispace, guards=guards, properties=properties)) - def __init__(self, clusters): - mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(AccessTuple)) - for c in clusters: - for e in c.exprs: - for i in retrieve_function_carriers(e.rhs): - mapper[i.function][e].reads.append(i) - mapper[e.lhs.function][e].write = e.lhs + return init + + +def is_buffering(exprs): + """ + True if the given N exprs represent N levels of buffering, False otherwise. + """ + return all(is_memcpy(e) for e in as_tuple(exprs)) + + +def extract_indices(f, dim, clusters): + """ + Extract the indices along `dim` in the given Clusters that access `f`. + """ + accesses = chain(*[c.scope[f] for c in clusters]) + indices = filter_ordered(i[dim] for i in accesses) + + return indices + + +def infer_buffer_size(f, dim, clusters): + """ + Infer the buffer size for the buffered Function `f` by analyzing the + accesses along `dim` within the given Clusters. 
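    For example (sketch): accesses at `time-1`, `time` and `time+1` give
    `slots = [-1, 0, 1]`, hence `size = max(slots) - min(slots) + 1 = 3`,
    i.e. a three-slot buffer covers the stencil along `time`.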
+ """ + indices = extract_indices(f, dim, clusters) + + slots = [i.subs({dim: 0, dim.spacing: 1}) for i in indices] + + try: + size = max(slots) - min(slots) + 1 + except TypeError: + # E.g., special case `slots=[-1 + time/factor, 2 + time/factor]` + # Resort to the fast vector-based comparison machinery (rather than + # the slower sympy.simplify) + slots = [Vector(i) for i in slots] + size = int((vmax(*slots) - vmin(*slots) + 1)[0]) - super().__init__([(f, AccessValue(f, mapper[f])) for f in mapper]) + return size def offset_from_centre(d, indices): diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 7e7adb128e..b4308f6789 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -4,7 +4,7 @@ import cgen as c from sympy import Or -from devito.exceptions import CompilationError +from devito.exceptions import InvalidOperator from devito.ir.iet import (Call, Callable, List, SyncSpot, FindNodes, Transformer, BlankLine, BusyWait, DummyExpr, AsyncCall, AsyncCallable, derive_parameters) @@ -131,7 +131,7 @@ def process(self, iet): layers = {infer_layer(s.function) for s in sync_ops} if len(layers) != 1: - raise CompilationError("Unsupported streaming case") + raise InvalidOperator("Unsupported streaming case") layer = layers.pop() subs[n], v = callbacks[t](subs.get(n, n), sync_ops, layer) diff --git a/tests/test_buffering.py b/tests/test_buffering.py index d3c8667025..c1331d3766 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -2,8 +2,8 @@ import numpy as np from conftest import skipif -from devito import (Constant, Grid, TimeFunction, SparseTimeFunction, Operator, - Eq, ConditionalDimension, SubDimension, SubDomain, configuration) +from devito import (Constant, Grid, TimeFunction, Operator, Eq, SubDimension, + SubDomain, ConditionalDimension) from devito.ir import FindSymbols, retrieve_iteration_tree from devito.exceptions import InvalidOperator @@ -252,36 +252,6 @@ def test_two_heterogeneous_buffers(): assert np.all(v.data == v1.data) -def test_over_injection(): - nt = 10 - grid = Grid(shape=(4, 4)) - - src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=nt) - rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=nt) - u = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt) - u1 = TimeFunction(name="u", grid=grid, time_order=2, space_order=2, save=nt) - - src.data[:] = 1. 
- - eqns = ([Eq(u.forward, u + 1)] + - src.inject(field=u.forward, expr=src) + - rec.interpolate(expr=u.forward)) - - op0 = Operator(eqns, opt='noop') - op1 = Operator(eqns, opt=('buffering', {'par-collapse-work': 0})) - - # Check generated code - assert len(retrieve_iteration_tree(op1)) == \ - 7 + int(configuration['language'] != 'C') - buffers = [i for i in FindSymbols().visit(op1) if i.is_Array] - assert len(buffers) == 1 - - op0.apply(time_M=nt-2) - op1.apply(time_M=nt-2, u=u1) - - assert np.all(u.data == u1.data) - - def test_over_one_subdomain(): class sd0(SubDomain): @@ -706,18 +676,21 @@ def test_stencil_issue_1915(subdomain): nt = 5 grid = Grid(shape=(6, 6)) - u = TimeFunction(name='u', grid=grid, space_order=4, save=nt) - u1 = TimeFunction(name='u', grid=grid, space_order=4, save=nt) + u = TimeFunction(name='u', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + usave = TimeFunction(name='usave', grid=grid, space_order=4, save=nt) + usave1 = TimeFunction(name='usave', grid=grid, space_order=4, save=nt) subdomain = grid.subdomains[subdomain] - eqn = Eq(u.forward, u.dx + 1, subdomain=subdomain) + eqns = [Eq(u.forward, u.dx + 1, subdomain=subdomain), + Eq(usave, u.forward, subdomain=subdomain)] - op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt='buffering') + op0 = Operator(eqns, opt='noop') + op1 = Operator(eqns, opt='buffering') op0.apply(time_M=nt-2) - op1.apply(time_M=nt-2, u=u1) + op1.apply(time_M=nt-2, u=u1, usave=usave1) assert np.all(u.data == u1.data) From a8297dda14617ca6b6dd0c4740a8d679e42603f1 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 22 Apr 2024 15:20:33 +0000 Subject: [PATCH 13/58] compiler: Revamp ascyn_memcpy --- devito/passes/clusters/asynchrony.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index eef69f3d90..f98802cf15 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -1,6 +1,6 @@ from collections import defaultdict -from sympy import And +from sympy import And, true from devito.exceptions import CompilationError from devito.ir import (Forward, GuardBoundNext, Vector, WaitLock, WithLock, @@ -183,7 +183,6 @@ def memcpy_prefetch(clusters, key0, sregistry): elif d.is_Custom and is_integer(c.ispace[d].size): _actions_from_init(c, d, actions) - # Attach the computed Actions processed = [] for c in clusters: @@ -219,9 +218,8 @@ def _actions_from_init(c, d, actions): def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): - direction = c.ispace[d].direction + direction = c.ispace[d.root].direction - # Prepare the data to instantiate a PrefetchUpdate SyncOp e = c.exprs[0] function = e.rhs.function target = e.lhs.function @@ -238,7 +236,8 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): tindex = tindex0 else: assert tindex0.is_Modulo - subiters = [i for i in c.sub_iterators[d] if i.parent is tindex0.parent] + subiters = [i for i in c.sub_iterators[d.root] + if i.parent is tindex0.parent] osubiters = sorted(subiters, key=lambda i: Vector(i.offset)) n = osubiters.index(tindex0) if direction is Forward: @@ -255,10 +254,9 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): # Turn `c` into a prefetch Cluster `pc` expr = uxreplace(e, {tindex0: tindex, fetch: findex}) - guards = {d: And( - c.guards.get(d, True), - GuardBoundNext(function.indices[d], direction), - )} + guard0 = c.guards.get(d, true)._subs(fetch, 
findex) + guard1 = GuardBoundNext(function.indices[d], direction) + guards = c.guards.impose(d, And(guard0, guard1)) syncs = {d: [ ReleaseLock(handle, target), From 99fb63ecaf406a4dfad9e2e2d6b3ed83eb0b5aab Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 24 Apr 2024 07:46:15 +0000 Subject: [PATCH 14/58] compiler: Drop spurious generated comment --- devito/passes/iet/orchestration.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index b4308f6789..8a4195062f 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -82,10 +82,7 @@ def _make_fetchupdate(self, iet, sync_ops, layer): efunc = Callable(name, body, 'void', parameters, 'static') # Perform initial fetch by the main thread - iet = List( - header=c.Comment("Initialize data stream"), - body=Call(name, parameters) - ) + iet = Call(name, parameters) return iet, [efunc] From e745e41f9031ce77fd446c66c14c226db7aeee5e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 24 Apr 2024 13:40:21 +0000 Subject: [PATCH 15/58] compiler: Simplify and generalize tasking --- devito/passes/clusters/asynchrony.py | 44 ++++++++++------------------ devito/passes/clusters/utils.py | 30 ++++++++----------- 2 files changed, 28 insertions(+), 46 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index f98802cf15..619bc5dc8a 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -5,7 +5,7 @@ from devito.exceptions import CompilationError from devito.ir import (Forward, GuardBoundNext, Vector, WaitLock, WithLock, FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) -from devito.passes.clusters.utils import bind_critical_regions, is_memcpy +from devito.passes.clusters.utils import in_critical_region, is_memcpy from devito.symbolics import IntDiv, uxreplace from devito.tools import OrderedSet, as_mapper, is_integer, timed_pass from devito.types import CustomDimension, Lock @@ -52,20 +52,14 @@ def tasking(clusters, key0, sregistry): task Dimensions. 
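    For example (sketch): a Cluster `c0` copying a buffer `ub` out to a
    streamed `usave` is taskified along the trigger Dimension via

        ReleaseLock(lock[i], ub); WithLock(lock[i], ub, i, usave, ...)

    while any later Cluster overwriting slot `i` of `ub` first performs
    `WaitLock(lock[i], ub)`, so the asynchronous copy cannot be raced.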
""" key, _ = keys(key0) - candidates = as_mapper(clusters, key) - candidates.pop(None, None) - - if len(candidates) == 0: - return clusters - elif len(candidates) > 1: - raise CompilationError("Cannot handle multiple tasking dimensions") - - d, candidates = candidates.popitem() locks = {} - waits = defaultdict(OrderedSet) - tasks = defaultdict(list) - for c0 in candidates: + syncs = defaultdict(lambda: defaultdict(OrderedSet)) + for c0 in clusters: + d = key(c0) + if d is None: + continue + # Prevent future writes to interfere with a task by waiting on a lock may_require_lock = {f for f in c0.scope.reads if f.is_AbstractFunction} @@ -118,7 +112,10 @@ def tasking(clusters, key0, sregistry): if logical_index in protected[target]: continue - waits[c1].add(WaitLock(lock[index], target)) + # Critical regions preempt WaitLocks + c2 = in_critical_region(c1, clusters) or c1 + + syncs[c2][d].add(WaitLock(lock[index], target)) protected[target].add(logical_index) # Taskify `c0` @@ -142,21 +139,12 @@ def tasking(clusters, key0, sregistry): findex = None for i in indices: - tasks[c0].append(ReleaseLock(lock[i], target)) - tasks[c0].append(WithLock(lock[i], target, i, function, findex, d)) + syncs[c0][d].update([ + ReleaseLock(lock[i], target), + WithLock(lock[i], target, i, function, findex, d) + ]) - # CriticalRegions preempt WaitLocks, by definition - mapper = bind_critical_regions(clusters) - for c in clusters: - for c1 in mapper.get(c, []): - waits[c].update(waits.pop(c1, [])) - - processed = [] - for c in clusters: - if waits[c] or tasks[c]: - processed.append(c.rebuild(syncs={d: list(waits[c]) + tasks[c]})) - else: - processed.append(c) + processed = [c.rebuild(syncs=syncs.get(c)) for c in clusters] return processed diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index abb4cec264..74aba55a14 100644 --- a/devito/passes/clusters/utils.py +++ b/devito/passes/clusters/utils.py @@ -6,7 +6,7 @@ from devito.types import CriticalRegion, Eq, Symbol, Wildcard __all__ = ['makeit_ssa', 'is_memcpy', 'make_critical_sequence', - 'bind_critical_regions', 'in_critical_region'] + 'in_critical_region'] def makeit_ssa(exprs): @@ -74,23 +74,17 @@ def make_critical_sequence(ispace, sequence, **kwargs): return processed -def bind_critical_regions(clusters): - """ - A mapper from CriticalRegions to the critical sequences they open. - """ - critical_region = False - mapper = defaultdict(list) - for c in clusters: - if c.is_critical_region: - critical_region = not critical_region and c - elif critical_region: - mapper[critical_region].append(c) - return mapper - - def in_critical_region(cluster, clusters): """ - True if `cluster` is part of a critical sequence, False otherwise. + Return the opening Cluster of the critical sequence containing `cluster`, + or None if `cluster` is not part of a critical sequence. 
""" - mapper = bind_critical_regions(clusters) - return cluster in flatten(mapper.values()) + maybe_found = None + for c in clusters: + if c is cluster: + return maybe_found + elif c.is_critical_region and maybe_found: + maybe_found = None + elif c.is_critical_region: + maybe_found = c + return None From 1e7671eadf54913a532c55028bf5923082f0061e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 24 Apr 2024 13:55:52 +0000 Subject: [PATCH 16/58] compiler: Tweak and polish lifting --- devito/passes/clusters/misc.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index c38a1bac26..096a332255 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -72,8 +72,9 @@ def callback(self, clusters, prefix): # r = f(a[x]) for y for y # r[x] = f(a[x, y]) r[x, y] = f(a[x, y]) # - # In 1) and 2) lifting is infeasible; in 3) the statement can be lifted - # outside the `i` loop as `r`'s write-to region contains both `x` and `y` + # In 1) and 2) lifting is infeasible; in 3) the statement can + # be lifted outside the `i` loop as `r`'s write-to region contains + # both `x` and `y` xed = {d._defines for d in c.used_dimensions if d not in outer} if not all(i & set(w.dimensions) for i, w in product(xed, c.scope.writes)): processed.append(c) @@ -85,10 +86,13 @@ def callback(self, clusters, prefix): # Optimization: if not lifting from the innermost Dimension, we can # safely reset the `ispace` to expose potential fusion opportunities - if c.ispace[-1].dim not in hope_invariant: - ispace = ispace.reset() + try: + if c.ispace.innermost.dim not in hope_invariant: + ispace = ispace.reset() + except IndexError: + pass - properties = {d: v for d, v in c.properties.items() if key(d)} + properties = c.properties.filter(key) lifted.append(c.rebuild(ispace=ispace, properties=properties)) From e93e3d77b54bbdbb972da1cdded0e6494c00ae32 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 25 Apr 2024 08:18:30 +0000 Subject: [PATCH 17/58] compiler: Fix stree construction --- devito/ir/stree/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index f154197a97..6deeacbd48 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -189,7 +189,7 @@ def preprocess(clusters, options=None, **kwargs): syncs = normalize_syncs(*[c1.syncs for c1 in found]) if syncs: - ispace = c.ispace.project(syncs) + ispace = c.ispace.prefix(syncs) processed.append(c.rebuild(exprs=[], ispace=ispace, syncs=syncs)) if all(c1.ispace.is_subset(c.ispace) for c1 in found): From 2d5a025dbbb535b606eca731b96551dcbf521e5b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 25 Apr 2024 09:40:35 +0000 Subject: [PATCH 18/58] compiler: Normalize syncs at Cluster construction --- devito/ir/clusters/cluster.py | 5 ++--- devito/ir/support/syncs.py | 10 ++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 542bf6647a..b81b136abb 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -13,7 +13,7 @@ from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.mpi.reduction_scheme import DistReduce from devito.symbolics import estimate_cost -from devito.tools import as_tuple, flatten, frozendict, infer_dtype +from devito.tools import as_tuple, flatten, infer_dtype from devito.types import WeakFence, 
CriticalRegion __all__ = ["Cluster", "ClusterGroup"] @@ -49,9 +49,8 @@ def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None, self._exprs = tuple(ClusterizedEq(e, ispace=ispace) for e in as_tuple(exprs)) self._ispace = ispace self._guards = Guards(guards or {}) - self._syncs = frozendict(syncs or {}) + self._syncs = normalize_syncs(syncs or {}) - # Normalize properties properties = Properties(properties or {}) self._properties = tailor_properties(properties, ispace) diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index b39353ad12..4fac92c136 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -5,7 +5,7 @@ from collections import defaultdict from devito.data import FULL -from devito.tools import Pickable, filter_ordered +from devito.tools import Pickable, filter_ordered, frozendict from .utils import IMask __all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'FetchUpdate', 'PrefetchUpdate', @@ -126,16 +126,14 @@ class PrefetchUpdate(SyncCopyIn): def normalize_syncs(*args): if not args: - return - if len(args) == 1: - return args[0] + return {} syncs = defaultdict(list) for _dict in args: for k, v in _dict.items(): syncs[k].extend(v) - syncs = {k: filter_ordered(v) for k, v in syncs.items()} + syncs = {k: tuple(filter_ordered(v)) for k, v in syncs.items()} for v in syncs.values(): waitlocks = [s for s in v if isinstance(s, WaitLock)] @@ -145,4 +143,4 @@ def normalize_syncs(*args): # We do not allow mixing up WaitLock and WithLock ops raise ValueError("Incompatible SyncOps") - return syncs + return frozendict(syncs) From f43bfdc51b5ec35c531694afa94fcdc48e41f78a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 29 Apr 2024 14:18:07 +0000 Subject: [PATCH 19/58] compiler: Attach mode to IMask --- devito/ir/support/syncs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index 4fac92c136..8a71e40f90 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -76,7 +76,8 @@ def imask(self): return IMask(*ret, getters=self.target.dimensions, function=self.function, - findex=self.findex) + findex=self.findex, + mode='out') class SyncCopyIn(SyncOp): @@ -101,7 +102,8 @@ def imask(self): return IMask(*ret, getters=self.target.dimensions, function=self.function, - findex=self.findex) + findex=self.findex, + mode='in') class WaitLock(SyncCopyOut): From 156d75e2a6bd4f80275f89ced711a032f31bc691 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 29 Apr 2024 15:48:12 +0000 Subject: [PATCH 20/58] compiler: Add ArrayMapped.size --- devito/passes/iet/definitions.py | 14 ++++++++------ devito/types/array.py | 8 +++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index a24f31e093..ca4164d184 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -179,14 +179,16 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): nbytes_param = Symbol(name='nbytes', dtype=np.uint64, is_const=True) nbytes_arg = SizeOf(obj.indexed._C_typedata)*obj.size - ffp1 = FieldFromPointer(obj._C_field_data, obj._C_symbol) - memptr = VOID(Byref(ffp1), '**') + ffp0 = FieldFromPointer(obj._C_field_data, obj._C_symbol) + memptr = VOID(Byref(ffp0), '**') allocs.append(self.lang['host-alloc-pin'](memptr, alignment, nbytes_param)) - ffp0 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) - init = DummyExpr(ffp0, nbytes_param) + ffp1 = 
FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) + init0 = DummyExpr(ffp1, nbytes_param) + ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) + init1 = DummyExpr(ffp2, 0) - frees = [self.lang['host-free-pin'](ffp1), + frees = [self.lang['host-free-pin'](ffp0), self.lang['host-free'](obj._C_symbol)] # Not all backends require explicit allocation/deallocation of the @@ -200,7 +202,7 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): ret = Return(obj._C_symbol) name = self.sregistry.make_name(prefix='alloc') - body = (decl, *allocs, init, ret) + body = (decl, *allocs, init0, init1, ret) efunc0 = make_callable(name, body, retval=obj) args = list(efunc0.parameters) args[args.index(nbytes_param)] = nbytes_arg diff --git a/devito/types/array.py b/devito/types/array.py index 82ef432d65..32ce1393b2 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -1,4 +1,4 @@ -from ctypes import POINTER, Structure, c_void_p, c_ulong +from ctypes import POINTER, Structure, c_void_p, c_ulong, c_uint64 from functools import cached_property import numpy as np @@ -195,13 +195,15 @@ class ArrayMapped(Array): _C_structname = 'array' _C_field_data = 'data' - _C_field_nbytes = 'nbytes' _C_field_dmap = 'dmap' + _C_field_nbytes = 'nbytes' + _C_field_size = 'size' _C_ctype = POINTER(type(_C_structname, (Structure,), {'_fields_': [(_C_field_data, c_restrict_void_p), (_C_field_nbytes, c_ulong), - (_C_field_dmap, c_void_p)]})) + (_C_field_dmap, c_void_p), + (_C_field_size, c_uint64)]})) class ArrayObject(ArrayBasic): From b3856d8db6058b82facc432d0dd21f6586a8b0b7 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 30 Apr 2024 08:00:00 +0000 Subject: [PATCH 21/58] compiler: Revamp MultiSubDomain lowering --- devito/ir/support/space.py | 6 +- devito/passes/clusters/aliases.py | 14 +- devito/passes/clusters/implicit.py | 298 ++++++++++++++++++++--------- devito/passes/iet/misc.py | 3 +- devito/types/dimension.py | 20 +- devito/types/grid.py | 59 +++--- tests/test_subdomains.py | 23 --- 7 files changed, 267 insertions(+), 156 deletions(-) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 14611664ea..efeba69156 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -995,7 +995,7 @@ def split(self, d): def insert(self, d, dimensions, sub_iterators=None, directions=None): """ - Insert new Dimensions into the IterationSpace before Dimension `d`. + Insert new Dimensions into the IterationSpace after Dimension `d`. """ dimensions = as_tuple(dimensions) @@ -1047,6 +1047,10 @@ def sub_iterators(self): def directions(self): return self._directions + @property + def outermost(self): + return self[0] + @property def innermost(self): return self[-1] diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 798a3b7089..8e6fb07d5f 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -853,12 +853,14 @@ def lower_schedule(schedule, meta, sregistry, ftemps): # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ... # r[zi] = ... 
             #
-            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
-            # as much room as in `zi`'s parent to avoid going OOB
-            # Aside from ugly generated code, the reason we do not rather shift the
-            # indices is that it prevents future passes to transform the loop bounds
-            # (e.g., MPI's comp/comm overlap does that)
-            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdims]
+            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
+            # need as much room as in `zi`'s parent to avoid going OOB. Aside
+            # from ugly generated code, the reason we do not rather shift the
+            # indices is that it prevents future passes from transforming the
+            # loop bounds (e.g., MPI's comp/comm overlap does that)
+            #TODO: JUST d.root ??
+            dimensions = [d.parent if d.is_AbstractSub else d
+                          for d in writeto.itdims]
 
             # The halo must be set according to the size of `writeto`
             halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
 
diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py
index a9018881ae..61fd65b605 100644
--- a/devito/passes/clusters/implicit.py
+++ b/devito/passes/clusters/implicit.py
@@ -2,14 +2,17 @@
 Passes to gather and form implicit equations from DSL abstractions.
 """
 
+from collections import defaultdict
 from functools import singledispatch
 from math import floor
 
+import numpy as np
+
 from devito.ir import (Cluster, Interval, IntervalGroup, IterationSpace, Queue,
-                       FetchUpdate, PrefetchUpdate, SEQUENTIAL)
+                       FetchUpdate, PrefetchUpdate, SEQUENTIAL, Forward, vmax, vmin)
 from devito.symbolics import retrieve_dimensions
-from devito.tools import as_tuple, timed_pass
+from devito.tools import Bunch, as_tuple, frozendict, timed_pass
-from devito.types import Eq
+from devito.types import Eq, Symbol
 from devito.types.grid import MultiSubDimension, SubDomainSet
 
 __all__ = ['generate_implicit']
@@ -27,16 +30,27 @@ def generate_implicit(clusters, sregistry):
 
     * MultiSubDomains attached to input equations.
     """
-    clusters = LowerMultiSubDimensions(sregistry).process(clusters)
+    clusters = LowerExplicitMSD(sregistry).process(clusters)
+    clusters = LowerImplicitMSD(sregistry).process(clusters)
 
     return clusters
 
 
-class LowerMultiSubDimensions(Queue):
+class LowerMSD(Queue):
+
+    def __init__(self, sregistry):
+        super().__init__()
+        self.sregistry = sregistry
+
+
+class LowerExplicitMSD(LowerMSD):
 
     """
-    Bind the free thickness symbols defined by MultiSubDimensions to
-    suitable values.
+    An Explicit MultiSubDomain (MSD) encodes the thicknesses of N (N > 0)
+    user-defined SubDomains.
+
+    This pass augments the IterationSpace to iterate over the N SubDomains and
+    bind the free thickness symbols to their corresponding values.
 
     Examples
     --------
@@ -53,25 +67,8 @@ class LowerMultiSubDimensions(Queue):
     Cluster([Eq(f[t1, xi_n, yi_n], f[t0, xi_n, yi_n] + 1)])
     """
 
-    def __init__(self, sregistry):
-        super().__init__()
-
-        self.sregistry = sregistry
-
-    def _hook_syncs(self, cluster, level):
-        """
-        The *fetchUpdate SyncOps may require their own suitably adjusted
-        thickness assigments. This method pulls such SyncOps.
- """ - syncs = [] - for i in cluster.ispace[:level]: - for s in cluster.syncs.get(i.dim, ()): - if isinstance(s, (FetchUpdate, PrefetchUpdate)): - syncs.append(s) - return tuple(syncs) - - def _make_key_hook(self, cluster, level): - return (self._hook_syncs(cluster, level),) + _q_guards_in_key = True + _q_syncs_in_key = True def callback(self, clusters, prefix): try: @@ -86,9 +83,7 @@ def callback(self, clusters, prefix): idx = len(prefix) - seen = set() tip = None - processed = [] for c in clusters: try: @@ -100,6 +95,25 @@ def callback(self, clusters, prefix): processed.append(c) continue + # Get the dynamic thickness mapper for the given MultiSubDomain + mapper, dims = lower_msd(d.msd, c) + if not dims: + # An Implicit MSD + processed.append(c) + continue + + exprs, thickness = make_implicit_exprs(mapper, self.sregistry) + + #TODO: DO I NEED THIS ONE??? + # Make sure the "implicit expressions" aren't scheduled in + # an inner loop. E.g schedule both for `t, xi, yi` and `t, d, xi, yi` + edims = set(retrieve_dimensions(exprs, deep=True)) + if dim not in edims and any(d.dim in edims for d in prefix): + #if not dim._defines & edims and any(i.dim in edims for i in prefix): + #TODO: TRY REMOVING SECOND CONDITION?? + processed.append(c) + continue + #TODO: use c.ispace.split(dd) and run test suite... not doing it # now because the state of the current branches is a bit fragile #ALSO note that .split would require the dimension after `dd` so @@ -109,75 +123,125 @@ def callback(self, clusters, prefix): ispace0 = c.ispace[:n] ispace1 = c.ispace[n:] - # The "implicit expressions" created for the MultiSubDomain - exprs, dims, sub_iterators = make_implicit_exprs(d.msd, c) + # The IterationSpace induced by the MultiSubDomain + #TODO: use the new c.ispace.insert !!! + #TODO: in fact, scratch the first TODO above... just call insert directly + #here, I won't even have to call split as it will be done automatically + #by insert... but then I will have to + #TODO: actually there's an issue with this new insert as I'll be like + #constructing ispace0,ispaceN,ispace1 while below I only have ispace0,ispaceN? + intervals = [Interval(i) for i in dims] + ispaceN = IterationSpace(IntervalGroup(intervals)) + + relations = (ispace0.itdims + dims, dims + ispace1.itdims) + ispace = IterationSpace.union(ispace0, ispaceN, relations=relations) - # Make sure the "implicit expressions" aren't scheduled in - # an inner loop. E.g schedule both for `t, xi, yi` and `t, d, xi, yi` - edims = set(retrieve_dimensions(exprs, deep=True)) - if dim not in edims and any(d.dim in edims for d in prefix): - processed.append(c) + properties = {i.dim: {SEQUENTIAL} for i in ispace} + + nxt = ispaceN + #TODO: Use `seen` instead?? + if tip is None or tip != nxt: + processed.append( + c.rebuild(exprs=exprs, ispace=ispace, properties=properties) + ) + tip = nxt + + ispace = IterationSpace.union(c.ispace, ispaceN, relations=relations) + ispace = inject_thickness(ispace, thickness) + + processed.append(c.rebuild(ispace=ispace)) + + return processed + + +class LowerImplicitMSD(LowerMSD): + + """ + An Implicit MultiSubDomain (MSD) encodes the thicknesses of N (N > 0) + indirectly-defined SubDomains, that is SubDomains whose thicknesses are + evaluated (e.g., computed on-the-fly, fetched from a Function) along a + certain problem Dimension. 
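+
+    Unlike the explicit case, no new iteration Dimension is introduced here;
+    the thickness symbols are instead assigned right before the Clusters that
+    consume them.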
+ + Examples + -------- + Given: + + Cluster([Eq(f[t1, xi, yi], f[t0, xi, yi] + 1)]) + + where `xi_n` and `yi_n` are MultiSubDimensions, generate: + + Cluster([Eq(xi_ltkn, xi_n_m[time]) + Eq(xi_rtkn, xi_n_M[time]) + Eq(yi_ltkn, yi_n_m[time]) + Eq(yi_rtkn, yi_n_M[time])]) + Cluster([Eq(f[t1, xi, yi], f[t0, xi, yi] + 1)]) + """ + + def callback(self, clusters, prefix): + try: + dim = prefix[-1].dim + except IndexError: + return clusters + + try: + pd = prefix[-2].dim + except IndexError: + pd = None + + # There could be several MultiSubDomains around, spread over different + # Clusters. At the same time, the same MultiSubDomain might be required + # by multiple Clusters, and each Cluster may require accessing the + # MultiSubDomain at different iteration points along `dim` + found = defaultdict(lambda: Bunch(clusters=[], mapper={})) + for c in clusters: + ispace = c.ispace.project(msdim) + try: + d = msdim(ispace.outermost.dim) + except IndexError: continue - # The IterationSpace induced by the MultiSubDomain + # Get the dynamic thickness mapper for the given MultiSubDomain + mapper, dims = lower_msd(d.msd, c) if dims: - #TODO: use the new c.ispace.insert !!! - #TODO: in fact, scratch the first TODO above... just call insert directly - #here, I won't even have to call split as it will be done automatically - #by insert... but then I will have to - #TODO: actually there's an issue with this new insert as I'll be like - #constructing ispace0,ispaceN,ispace1 while below I only have ispace0,ispaceN? - intervals = [Interval(i) for i in dims] - ispaceN = IterationSpace(IntervalGroup(intervals), sub_iterators) - - relations = (ispace0.itdims + dims, dims + ispace1.itdims) - ispace = IterationSpace.union(ispace0, ispaceN, relations=relations) - else: - ispaceN = None - ispace = ispace0 + # An Explicit MSD + continue - properties = {i.dim: {SEQUENTIAL} for i in ispace} + # Make sure the "implicit expressions" are scheduled in + # the innermost loop such that the thicknesses can be computed + edims = set(retrieve_dimensions(mapper.values(), deep=True)) + if dim not in edims or not edims.issubset(prefix.dimensions): + continue - if not ispaceN: - # Special case: we can factorize the thickness assignments - # once and for all at the top of the current IterationInterval, - # and reuse them for one or more (potentially non-consecutive) - # `clusters` - if d not in seen: - # Retain the guards and the syncs along the outer Dimensions - retained = {None} | set(c.ispace[:n-1].dimensions) - - # A fetch SyncOp along `dim` binds the thickness assignments - if self._hook_syncs(c, n): - retained.add(dim) - - guards = {d: v for d, v in c.guards.items() if d in retained} - syncs = {d: v for d, v in c.syncs.items() if d in retained} - - processed.insert( - 0, Cluster(exprs, ispace, guards, properties, syncs) - ) - seen.add(ispaceN) - else: - nxt = self._make_tip(c, ispaceN) - if tip is None or tip != nxt: - processed.append( - c.rebuild(exprs=exprs, ispace=ispace, properties=properties) - ) - tip = nxt - - if ispaceN: - ispace = IterationSpace.union(c.ispace, ispaceN, relations=relations) + found[d.msd].clusters.append(c) + found[d.msd].mapper = reduce(found[d.msd].mapper, mapper, edims, prefix) + + # Turn the reduced mapper into a list of equations + mapper = {} + processed = [] + for bunch in found.values(): + exprs, thickness = make_implicit_exprs(bunch.mapper, self.sregistry) + + mapper.update({c: thickness for c in bunch.clusters}) + + # Only retain outer guards (e.g., along None) if any + key = lambda i: i is 
None or i in prefix.prefix([pd]) + guards = c.guards.filter(key) + syncs = {d: v for d, v in c.syncs.items() if key(d)} + + processed.append( + c.rebuild(exprs=exprs, ispace=prefix, guards=guards, syncs=syncs) + ) + + # Add in the dynamic thickness + for c in clusters: + try: + ispace = inject_thickness(c.ispace, mapper[c]) processed.append(c.rebuild(ispace=ispace)) - else: + except KeyError: processed.append(c) - seen.add(d) return processed - def _make_tip(self, c, ispaceN): - return (c.guards, c.syncs, ispaceN) - def msdim(d): try: @@ -190,19 +254,67 @@ def msdim(d): @singledispatch -def make_implicit_exprs(msd, cluster): - # Retval: (exprs, iteration dimensions, subiterators) - return (), (), {} +def lower_msd(msd, cluster): + # Retval: (dynamic thickness mapper, iteration dimensions) + return (), () -@make_implicit_exprs.register(SubDomainSet) +@lower_msd.register(SubDomainSet) def _(msd, *args): - ret = [] + ret = {} for j in range(len(msd._local_bounds)): index = floor(j/2) + side = j % 2 d = msd.dimensions[index] f = msd._functions[j] - ret.append(Eq(d.thickness[j % 2][0], f.indexify())) + ret[(d.root, side)] = f.indexify() + + return frozendict(ret), (msd._implicit_dimension,) + + +def make_implicit_exprs(mapper, sregistry): + exprs = [] + thickness = defaultdict(lambda: [None, None]) + for (d, side), v in mapper.items(): + tkn = 'l' if side == 0 else 'r' + name = sregistry.make_name('%s_%stkn' % (d.name, tkn)) + s = Symbol(name=name, dtype=np.int32, is_const=True, nonnegative=True) + + exprs.append(Eq(s, v)) + thickness[d][side] = s + + return exprs, frozendict(thickness) + + +def inject_thickness(ispace, thickness): + for i in ispace.itdims: + if i.is_Block and i._depth > 1: + # The thickness should be injected once only! + continue + try: + v0, v1 = thickness[i.root] + ispace = ispace.translate(i, v0, -v1) + except KeyError: + pass + return ispace + + +def reduce(m0, m1, edims, prefix): + if len(edims) != 1: + raise NotImplementedError + d, = edims + + if prefix[d].direction is Forward: + func = max + else: + func = min + + key = lambda i: i.indices[d] + + mapper = {} + for k, e in m1.items(): + candidates = {e, m0.get(k, e)} + mapper[k] = func(candidates, key=key) - return as_tuple(ret), (msd._implicit_dimension,), {} + return frozendict(mapper) diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index c4a5cb6397..580ff26579 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -12,6 +12,7 @@ from devito.ir.iet.efunc import DeviceFunction, EntryFunction from devito.symbolics import ValueLimit, evalrel, has_integer_args, limits_mapper from devito.tools import as_mapper, filter_ordered, split +from devito.types.dimension import AbstractSubDimension __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] @@ -261,7 +262,7 @@ def _rename_subdims(target, dimensions): # the expression indexeds = FindSymbols('indexeds').visit(target) dims = pull_dims(indexeds, flag=False) - dims = [d for d in dims if any([dim.is_Sub for dim in d._defines])] + dims = [d for d in dims if any(dim.is_AbstractSub for dim in d._defines)] dims = [d for d in dims if not d.is_SubIterator] names = [d.root.name for d in dims] diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 0c6f36f432..2755d0f92f 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -101,6 +101,7 @@ class Dimension(ArgProvider): is_Custom = False is_Derived = False is_NonlinearDerived = False + is_AbstractSub = 
False is_Sub = False is_Conditional = False is_Stepping = False @@ -534,7 +535,22 @@ def _arg_check(self, *args, **kwargs): # the user -class SubDimension(DerivedDimension): +class AbstractSubDimension(DerivedDimension): + + """ + Symbol defining a convex iteration sub-space derived from a ``parent`` + Dimension. + + Notes + ----- + This is an abstract class. The actual implementations are SubDimension + and MultiSubDimension. + """ + + is_AbstractSub = True + + +class SubDimension(AbstractSubDimension): """ Symbol defining a convex iteration sub-space derived from a ``parent`` @@ -556,7 +572,7 @@ class SubDimension(DerivedDimension): The thickness of the left and right regions, respectively. local : bool True if, in case of domain decomposition, the SubDimension is - guaranteed not to span more than one domains, False otherwise. + guaranteed not to span more than one domain, False otherwise. Examples -------- diff --git a/devito/types/grid.py b/devito/types/grid.py index 2d5d46b20e..523d044e2b 100644 --- a/devito/types/grid.py +++ b/devito/types/grid.py @@ -15,7 +15,8 @@ from devito.types.dense import Function from devito.types.utils import DimensionTuple from devito.types.dimension import (Dimension, SpaceDimension, TimeDimension, - Spacing, SteppingDimension, SubDimension) + Spacing, SteppingDimension, SubDimension, + AbstractSubDimension) __all__ = ['Grid', 'SubDomain', 'SubDomainSet'] @@ -546,50 +547,48 @@ def define(self, dimensions): raise NotImplementedError -class MultiSubDimension(SubDimension): +class MultiSubDimension(AbstractSubDimension): """ - A special SubDimension for graceful lowering of MultiSubDomains. + A special Dimension to be used in MultiSubDomains. """ __rargs__ = ('name', 'parent', 'msd') - __rkwargs__ = ('thickness',) - - def __init_finalize__(self, name, parent, msd, thickness=None): - # NOTE: a MultiSubDimension stashes a reference to the originating MultiSubDomain. - # This creates a circular pattern as the `msd` itself carries references to - # its MultiSubDimensions. This is currently necessary because during compilation - # we drop the MultiSubDomain early, but when the MultiSubDimensions are processed - # we still need it to create the implicit equations. Untangling this is - # definitely possible, but not straightforward - self.msd = msd - if not thickness: - lst, rst = self._symbolic_thickness(name) - else: # Used for rebuilding. Reuse thickness symbols rather than making new ones - try: - ((lst, _), (rst, _)) = thickness - except ValueError: - raise ValueError("Invalid thickness specification: %s does not match" - "expected format ((left_symbol, left_thickness)," - " (right_symbol, right_thickness))" % thickness) - left = parent.symbolic_min + lst - right = parent.symbolic_max - rst + def __init_finalize__(self, name, parent, msd): + # NOTE: a MultiSubDimension stashes a reference to the originating + # MultiSubDomain. This creates a circular pattern as the `msd` itself + # carries references to its MultiSubDimensions. This is currently + # necessary because during compilation we drop the MultiSubDomain + # early, but when the MultiSubDimensions are processed we still need it + # to create the implicit equations. 
Untangling this is definitely + # possible, but not straightforward + self.msd = msd - super().__init_finalize__(name, parent, left, right, ((lst, 0), (rst, 0)), False) + super().__init_finalize__(name, parent) def __hash__(self): - # There is no possibility for two MultiSubDimensions to ever hash the same, since - # a MultiSubDimension carries a reference to a MultiSubDomain, which is unique + # There is no possibility for two MultiSubDimensions to ever hash the + # same, since a MultiSubDimension carries a reference to a MultiSubDomain, + # which is unique return id(self) @cached_property def bound_symbols(self): - # Unlike a SubDimension, a MultiSubDimension does *not* bind its thickness, - # which is rather bound by an Operation (which can alter its value - # dynamically so as to implement a MultiSubDomain) return self.parent.bound_symbols + @cached_property + def symbolic_min(self): + return self.parent.symbolic_min + + @cached_property + def symbolic_max(self): + return self.parent.symbolic_max + + @cached_property + def symbolic_size(self): + return self.parent.symbolic_size + class MultiSubDomain(AbstractSubDomain): diff --git a/tests/test_subdomains.py b/tests/test_subdomains.py index efdea08acd..196df33348 100644 --- a/tests/test_subdomains.py +++ b/tests/test_subdomains.py @@ -672,29 +672,6 @@ class Dummy(SubDomainSet): assert z.is_Parallel -class TestMultiSubDimension: - - def test_rebuild(self): - class Dummy(SubDomainSet): - name = 'dummy' - - dummy = Dummy(N=0, bounds=[(), (), (), ()]) - grid = Grid(shape=(10, 10), subdomains=(dummy,)) - sdims = grid.subdomains['dummy'].dimensions - - # Check normal rebuilding - tkns = [d.thickness for d in sdims] - rebuilt = [d._rebuild() for d in sdims] - assert list(sdims) != rebuilt - # Should build new thickness symbols with same names and values - assert all([d.thickness is not t for d, t in zip(rebuilt, tkns)]) - assert all([d.thickness == t for d, t in zip(rebuilt, tkns)]) - - # Switch the thickness symbols between MultiSubDimensions with the rebuild - remixed = [d._rebuild(thickness=t) for d, t in zip(sdims, tkns[::-1])] - assert [d.thickness for d in remixed] == tkns[::-1] - - class TestSubDomain_w_condition: def test_condition_w_subdomain_v0(self): From e0228907657336fe8af5033e84350cdc9638d8b5 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 1 May 2024 15:19:04 +0000 Subject: [PATCH 22/58] compiler: Clean up MultiSubDomain lowering --- devito/ir/support/space.py | 3 ++ devito/passes/clusters/implicit.py | 50 ++++++------------------------ 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index efeba69156..7d688c9d86 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -987,6 +987,9 @@ def split(self, d): if isinstance(d, IterationInterval): d = d.dim + if d is None: + return IterationSpace([]), self + try: n = self.index(d) + 1 return self[:n], self[n:] diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index 61fd65b605..8bf178d6b5 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -104,51 +104,21 @@ def callback(self, clusters, prefix): exprs, thickness = make_implicit_exprs(mapper, self.sregistry) - #TODO: DO I NEED THIS ONE??? - # Make sure the "implicit expressions" aren't scheduled in - # an inner loop. 
E.g schedule both for `t, xi, yi` and `t, d, xi, yi` - edims = set(retrieve_dimensions(exprs, deep=True)) - if dim not in edims and any(d.dim in edims for d in prefix): - #if not dim._defines & edims and any(i.dim in edims for i in prefix): - #TODO: TRY REMOVING SECOND CONDITION?? - processed.append(c) - continue + ispace = c.ispace.insert(dim, dims) - #TODO: use c.ispace.split(dd) and run test suite... not doing it - # now because the state of the current branches is a bit fragile - #ALSO note that .split would require the dimension after `dd` so - #we'll have to change idx to idx+1, or simply take `dim`!! - #This will simplify this code a lot!! - n = c.ispace.index(dd) - ispace0 = c.ispace[:n] - ispace1 = c.ispace[n:] - - # The IterationSpace induced by the MultiSubDomain - #TODO: use the new c.ispace.insert !!! - #TODO: in fact, scratch the first TODO above... just call insert directly - #here, I won't even have to call split as it will be done automatically - #by insert... but then I will have to - #TODO: actually there's an issue with this new insert as I'll be like - #constructing ispace0,ispaceN,ispace1 while below I only have ispace0,ispaceN? - intervals = [Interval(i) for i in dims] - ispaceN = IterationSpace(IntervalGroup(intervals)) - - relations = (ispace0.itdims + dims, dims + ispace1.itdims) - ispace = IterationSpace.union(ispace0, ispaceN, relations=relations) - - properties = {i.dim: {SEQUENTIAL} for i in ispace} - - nxt = ispaceN - #TODO: Use `seen` instead?? - if tip is None or tip != nxt: + # The Cluster computing the thicknesses + ispaceN = ispace.prefix(dims) + + if tip is None or tip != ispaceN: + properties = {i.dim: {SEQUENTIAL} for i in ispace} processed.append( - c.rebuild(exprs=exprs, ispace=ispace, properties=properties) + c.rebuild(exprs=exprs, ispace=ispaceN, properties=properties) ) - tip = nxt + tip = ispaceN - ispace = IterationSpace.union(c.ispace, ispaceN, relations=relations) + # The Cluster performing the actual computation, enriched with + # the thicknesses ispace = inject_thickness(ispace, thickness) - processed.append(c.rebuild(ispace=ispace)) return processed From 839007830c3634732af971c67d9025286485f709 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 May 2024 12:19:24 +0000 Subject: [PATCH 23/58] compiler: Fix tasking reconstruction --- devito/passes/clusters/asynchrony.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 619bc5dc8a..3008388ad3 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -144,7 +144,7 @@ def tasking(clusters, key0, sregistry): WithLock(lock[i], target, i, function, findex, d) ]) - processed = [c.rebuild(syncs=syncs.get(c)) for c in clusters] + processed = [c.rebuild(syncs=syncs.get(c, c.syncs)) for c in clusters] return processed From b3974934f1266aeac25a2cf268c255de50a271c1 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 May 2024 08:07:09 +0000 Subject: [PATCH 24/58] compiler: Fix Guards.xandg From 83dbf4c0136ad1a8a8c662733f5cb2c1b4beef7c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 May 2024 13:44:26 +0000 Subject: [PATCH 25/58] compiler: Tweak Guards and Properties --- devito/ir/support/guards.py | 4 ++-- devito/ir/support/properties.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index 7895591941..0c5e0aa4fd 100644 --- a/devito/ir/support/guards.py +++ 
b/devito/ir/support/guards.py @@ -245,7 +245,7 @@ def xandg(self, d, guard): try: m[d] = And(m[d], guard) except KeyError: - pass + m[d] = guard return Guards(m) @@ -257,7 +257,7 @@ def impose(self, d, guard): m[d] = guard - return m + return Guards(m) def popany(self, dims): m = dict(self) diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index 1181fcd083..0bc60ab9e5 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -232,8 +232,8 @@ def is_blockable(self, d): def is_blockable_small(self, d): return TILABLE_SMALL in self.get(d, set()) - def is_prefetchable(self, d): - return PREFETCHABLE in self.get(d, set()) + def is_prefetchable(self, dims): + return any(PREFETCHABLE in self.get(d, set()) for d in as_tuple(dims)) @property def nblockable(self): From 79836923dbc43108be6262fd9404d22258a1a6c0 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 May 2024 13:44:44 +0000 Subject: [PATCH 26/58] compiler: Fix memcpy_prefetch pass --- devito/passes/clusters/asynchrony.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 3008388ad3..f7f2e8b9b9 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -166,7 +166,7 @@ def memcpy_prefetch(clusters, key0, sregistry): if not wraps_memcpy(c): continue - if c.properties.is_prefetchable(d): + if c.properties.is_prefetchable(d._defines): _actions_from_update_memcpy(c, d, clusters, actions, sregistry) elif d.is_Custom and is_integer(c.ispace[d].size): _actions_from_init(c, d, actions) @@ -206,7 +206,8 @@ def _actions_from_init(c, d, actions): def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): - direction = c.ispace[d.root].direction + pd = d.root # E.g., `vd -> time` + direction = c.ispace[pd].direction e = c.exprs[0] function = e.rhs.function @@ -224,8 +225,7 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): tindex = tindex0 else: assert tindex0.is_Modulo - subiters = [i for i in c.sub_iterators[d.root] - if i.parent is tindex0.parent] + subiters = [i for i in c.sub_iterators[pd] if i.parent is tindex0.parent] osubiters = sorted(subiters, key=lambda i: Vector(i.offset)) n = osubiters.index(tindex0) if direction is Forward: @@ -244,7 +244,7 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): guard0 = c.guards.get(d, true)._subs(fetch, findex) guard1 = GuardBoundNext(function.indices[d], direction) - guards = c.guards.impose(d, And(guard0, guard1)) + guards = c.guards.impose(d, guard0).xandg(pd, guard1) syncs = {d: [ ReleaseLock(handle, target), From 55c293cef2d6e97e53b03515129520d2d674e486 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 7 May 2024 09:17:13 +0000 Subject: [PATCH 27/58] compiler: Tweak lifting --- devito/passes/clusters/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 096a332255..0a7b454944 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -46,7 +46,7 @@ def callback(self, clusters, prefix): continue # Synchronization prevents lifting - if c.syncs.get(dim) or \ + if any(c.syncs.get(d) for d in dim._defines) or \ in_critical_region(c, clusters): processed.append(c) continue From 036c1c243684782da5b703c738c5e9d56e859b66 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 7 May 2024 13:11:47 +0000 Subject: [PATCH 28/58] compiler: 
Fix memcpy_prefetch --- devito/passes/clusters/asynchrony.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index f7f2e8b9b9..7357039c49 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -7,7 +7,7 @@ FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) from devito.passes.clusters.utils import in_critical_region, is_memcpy from devito.symbolics import IntDiv, uxreplace -from devito.tools import OrderedSet, as_mapper, is_integer, timed_pass +from devito.tools import OrderedSet, is_integer, timed_pass from devito.types import CustomDimension, Lock __all__ = ['tasking', 'memcpy_prefetch'] @@ -226,12 +226,19 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): else: assert tindex0.is_Modulo subiters = [i for i in c.sub_iterators[pd] if i.parent is tindex0.parent] - osubiters = sorted(subiters, key=lambda i: Vector(i.offset)) - n = osubiters.index(tindex0) + mapper = {i.offset: i for i in subiters} if direction is Forward: - tindex = osubiters[(n + 1) % len(osubiters)] + toffset = tindex0.offset + 1 else: - tindex = osubiters[(n - 1) % len(osubiters)] + toffset = tindex0.offset - 1 + try: + tindex = mapper[toffset] + except KeyError: + # This can happen if e.g. the underlying buffer has size K (because + # e.g. the user has sent `async_degree=K`), but the actual computation + # only uses K-N indices (e.g., indices={t-1,t} and K=5}) + name = sregistry.make_name(prefix='sb') + tindex = tindex0._rebuild(name, offset=toffset) # We need a lock to synchronize the copy-in name = sregistry.make_name(prefix='lock') @@ -242,6 +249,11 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): # Turn `c` into a prefetch Cluster `pc` expr = uxreplace(e, {tindex0: tindex, fetch: findex}) + if tindex is not tindex0: + ispace = c.ispace.augment({pd: tindex}) + else: + ispace = c.ispace + guard0 = c.guards.get(d, true)._subs(fetch, findex) guard1 = GuardBoundNext(function.indices[d], direction) guards = c.guards.impose(d, guard0).xandg(pd, guard1) @@ -251,7 +263,7 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): PrefetchUpdate(handle, target, tindex, function, findex, d, 1, e.rhs) ]} - pc = c.rebuild(exprs=expr, guards=guards, syncs=syncs) + pc = c.rebuild(exprs=expr, ispace=ispace, guards=guards, syncs=syncs) # Since we're turning `e` into a prefetch, we need to: # 1) attach a WaitLock SyncOp to the first Cluster accessing `target` From 8571c0691837615b56a8a69a3120da3d2a8c7cf7 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 8 May 2024 07:53:16 +0000 Subject: [PATCH 29/58] compiler: Move _rcompile_wrapper logic into Cpu64OperatorMixin --- devito/core/cpu.py | 17 +++++++++++++++++ devito/operator/operator.py | 6 ++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 184d8955a1..a5c852154b 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -2,6 +2,7 @@ from devito.core.operator import CoreOperator, CustomOperator, ParTile from devito.exceptions import InvalidOperator +from devito.operator.operator import rcompile from devito.passes.equations import collect_derivatives from devito.passes.clusters import (Lift, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_pows, @@ -92,6 +93,22 @@ def _normalize_kwargs(cls, **kwargs): return kwargs + @classmethod + def _rcompile_wrapper(cls, **kwargs0): + 
options0 = kwargs0.pop('options')
+
+        def wrapper(expressions, **kwargs1):
+            options = {**options0, **kwargs1.pop('options', {})}
+            kwargs = {'options': options, **kwargs0, **kwargs1}
+
+            # User-provided openmp flag has precedence over defaults
+            if not options['openmp']:
+                kwargs['language'] = 'C'
+
+            return rcompile(expressions, kwargs)
+
+        return wrapper
+
 
 # Mode level
 
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index d8400259a8..698a4f3606 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -277,10 +277,8 @@ def _lower(cls, expressions, **kwargs):
         return IRs(expressions, clusters, stree, uiet, iet), byproduct
 
     @classmethod
-    def _rcompile_wrapper(cls, **kwargs):
-        def wrapper(expressions, **options):
-            return rcompile(expressions, kwargs, options)
-        return wrapper
+    def _rcompile_wrapper(cls, **kwargs0):
+        raise NotImplementedError
 
     @classmethod
     def _initialize_state(cls, **kwargs):

From b2a04058f7f98304ac713f0fb2d35284a890ec40 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Wed, 8 May 2024 08:47:14 +0000
Subject: [PATCH 30/58] compiler: Do not ompize asynchronous ops

---
 devito/ir/iet/nodes.py         | 11 ++++++++++-
 devito/passes/iet/engine.py    | 36 ++++++++++++++++++++++++++++++++--
 devito/passes/iet/parpragma.py | 14 +++++++++----
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py
index 29447b3446..b824bc8c5c 100644
--- a/devito/ir/iet/nodes.py
+++ b/devito/ir/iet/nodes.py
@@ -13,7 +13,7 @@
 from devito.ir.equations import DummyEq, OpInc, OpMin, OpMax
 from devito.ir.support import (INBOUND, SEQUENTIAL, PARALLEL, PARALLEL_IF_ATOMIC,
                                PARALLEL_IF_PVT, VECTORIZED, AFFINE, Property,
-                               Forward, detect_io)
+                               Forward, WithLock, PrefetchUpdate, detect_io)
 from devito.symbolics import ListInitializer, CallFromPointer, ccode
 from devito.tools import (Signer, as_tuple, filter_ordered, filter_sorted,
                           flatten, ctypes_to_cstr)
@@ -1378,6 +1378,15 @@ def __init__(self, sync_ops, body=None):
     def __repr__(self):
         return "<SyncSpot (%s)>" % ",".join(str(i) for i in self.sync_ops)
 
+    @property
+    def is_async_op(self):
+        """
+        True if the SyncSpot contains an asynchronous operation, False otherwise.
+        If False, the SyncSpot may for example represent a wait on a lock.
+        """
+        return any(isinstance(s, (WithLock, PrefetchUpdate))
+                   for s in self.sync_ops)
+
 
 class CBlankLine(List):
 
diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py
index 55b63e0caa..7b93ceedfd 100644
--- a/devito/passes/iet/engine.py
+++ b/devito/passes/iet/engine.py
@@ -1,8 +1,9 @@
 from collections import OrderedDict
 from functools import partial, singledispatch, wraps
 
-from devito.ir.iet import (Call, ExprStmt, FindNodes, FindSymbols, MetaCall,
-                           Transformer, EntryFunction, ThreadCallable, Uxreplace,
+from devito.ir.iet import (Call, ExprStmt, Iteration, SyncSpot, FindNodes,
+                           FindSymbols, MapNodes, MetaCall, Transformer,
+                           EntryFunction, ThreadCallable, Uxreplace,
                            derive_parameters)
 from devito.ir.support import SymbolRegistry
 from devito.mpi.distributed import MPINeighborhood
@@ -74,6 +75,37 @@ def root(self):
     def funcs(self):
         return tuple(MetaCall(v, True) for v in self.efuncs.values())[1:]
 
+    @property
+    def sync_mapper(self):
+        """
+        A mapper {Iteration -> SyncSpot} describing the Iterations, if any,
+        living in an asynchronous region, across all Callables in the Graph.
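+
+        For example, an Iteration nested, possibly through a chain of Calls,
+        within a SyncSpot carrying a WithLock or PrefetchUpdate operation is
+        mapped to that SyncSpot.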
+        """
+        dag = create_call_graph(self.root.name, self.efuncs)
+
+        mapper = MapNodes(SyncSpot, (Iteration, Call)).visit(self.root)
+
+        found = {}
+        for k, v in mapper.items():
+            if not k.is_async_op:
+                continue
+
+            while v:
+                i = v.pop(0)
+
+                if i.is_Iteration:
+                    found[i] = k
+                    continue
+
+                for j in dag.all_predecessors(i.name):
+                    try:
+                        v.extend(FindNodes(Iteration).visit(self.efuncs[j]))
+                    except KeyError:
+                        # `j` is a foreign Callable
+                        pass
+
+        return found
+
     def apply(self, func, **kwargs):
         """
         Apply `func` to all nodes in the Graph. This changes the state of the Graph.
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index 195069d8e7..35f1ec15c8 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -373,7 +373,8 @@ def _make_nested_partree(self, partree):
 
         return partree
 
-    def _make_parallel(self, iet):
+    @iet_pass
+    def _make_parallel(self, iet, sync_mapper=None):
         mapper = {}
         parrays = {}
         for tree in retrieve_iteration_tree(iet, mode='superset'):
@@ -387,6 +388,12 @@
             if any(isinstance(n, ParallelIteration) for n in candidates):
                 continue
 
+            # Ignore if already part of an asynchronous region of code
+            # (e.g., an Iteration embedded within a SyncSpot defining an
+            # asynchronous operation)
+            if any(n in sync_mapper for n in candidates):
+                continue
+
             # Outer parallelism
             root, partree = self._make_partree(candidates)
             if partree is None or root in mapper:
@@ -413,9 +420,8 @@
 
         return iet, {'includes': [self.lang['header']]}
 
-    @iet_pass
-    def make_parallel(self, iet):
-        return self._make_parallel(iet)
+    def make_parallel(self, graph):
+        return self._make_parallel(graph, sync_mapper=graph.sync_mapper)
 
 
 class PragmaTransfer(Pragma, Transfer):

From 1c52ee2ecf87d1f3f5d8de5fb89a537e7bb7bca0 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Wed, 8 May 2024 12:24:23 +0000
Subject: [PATCH 31/58] compiler: Tweak clear_cache

---
 devito/types/caching.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/devito/types/caching.py b/devito/types/caching.py
index c4fa818dbf..b505566ef8 100644
--- a/devito/types/caching.py
+++ b/devito/types/caching.py
@@ -177,7 +177,9 @@ def clear(cls, force=True):
         cache_copied = safe_dict_copy(_SymbolCache)
 
         # Maybe trigger garbage collection
-        if force is False:
+        if force:
+            gc.collect()
+        else:
             if cls.ncalls_w_force_false + 1 == cls.force_ths:
                 # Case 1: too long since we called gc.collect, let's do it now
                 gc.collect()
@@ -189,8 +191,6 @@ def clear(cls, force=True):
             else:
                 # We won't call gc.collect() this time
                 cls.ncalls_w_force_false += 1
-        else:
-            gc.collect()
 
         for key in cache_copied:
             obj = _SymbolCache.get(key)
             if obj is None:
                 # deleted by another thread since we took the copy
                 continue
             if obj() is None:
-                # pop(x, None) does not error if already gone
                 # (key could be removed in another thread since get() above)
                 _SymbolCache.pop(key, None)
+
+        # Maybe trigger garbage collection
+        if force:
+            del cache_copied
+            gc.collect()

From 747420846e8f6e1bd00addaebc4904f974cbddc6 Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Fri, 10 May 2024 07:57:56 +0000
Subject: [PATCH 32/58] compiler: Enhance retrieve_functions

---
 devito/symbolics/search.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py
index 5ef7c70b2d..9d2194abce 100644
--- a/devito/symbolics/search.py
+++ b/devito/symbolics/search.py
@@ -145,8 +145,13 @@ def retrieve_indexed(exprs, mode='all', deep=False):
 
 
 def retrieve_functions(exprs, mode='all'):
-    """Shorthand to retrieve the DiscreteFunctions in ``exprs``."""
-    return search(exprs, q_function, mode, 'dfs')
+    """Shorthand to retrieve the DiscreteFunctions in `exprs`."""
+    indexeds = search(exprs, q_indexed, mode, 'dfs')
+
+    functions = search(exprs, q_function, mode, 'dfs')
+    functions.update({i.function for i in indexeds})
+
+    return functions
 
 
 def retrieve_symbols(exprs, mode='all'):

From ef839a4c536ef5ed0035cb4486c5add9f1f723aa Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Wed, 8 May 2024 13:05:40 +0000
Subject: [PATCH 33/58] misc: pep8 happiness

---
 devito/ir/support/space.py           | 9 +++------
 devito/passes/clusters/aliases.py    | 1 -
 devito/passes/clusters/asynchrony.py | 7 +++----
 devito/passes/clusters/implicit.py   | 5 ++---
 devito/passes/clusters/utils.py      | 4 +---
 devito/passes/iet/misc.py            | 1 -
 6 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py
index 7d688c9d86..d28f6b33ee 100644
--- a/devito/ir/support/space.py
+++ b/devito/ir/support/space.py
@@ -699,13 +699,10 @@ def __init__(self, intervals, parts=None):
 
         # Normalize parts
         parts = {k: v.expand() for k, v in (parts or {}).items()}
-        #TODO: POLISH THIS
-        a = {}
-        for k, v in parts.items():
+        for k, v in list(parts.items()):
             dims = set().union(*[d._defines for d in k.dimensions])
-            v1 = v.drop(lambda d: d not in dims)
-            a[k] = v1
-        self._parts = frozendict(a)
+            parts[k] = v.drop(lambda d: d not in dims)
+        self._parts = frozendict(parts)
 
     def __eq__(self, other):
         return (isinstance(other, DataSpace) and
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 8e6fb07d5f..ee3892f1f4 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -858,7 +858,6 @@ def lower_schedule(schedule, meta, sregistry, ftemps):
             # from ugly generated code, the reason we do not rather shift the
             # indices is that it prevents future passes from transforming the
             # loop bounds (e.g., MPI's comp/comm overlap does that)
-            #TODO: JUST d.root ??
dimensions = [d.parent if d.is_AbstractSub else d for d in writeto.itdims] diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 7357039c49..ddefb02cb3 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -1,10 +1,9 @@ from collections import defaultdict -from sympy import And, true +from sympy import true -from devito.exceptions import CompilationError -from devito.ir import (Forward, GuardBoundNext, Vector, WaitLock, WithLock, - FetchUpdate, PrefetchUpdate, ReleaseLock, normalize_syncs) +from devito.ir import (Forward, GuardBoundNext, WaitLock, WithLock, FetchUpdate, + PrefetchUpdate, ReleaseLock, normalize_syncs) from devito.passes.clusters.utils import in_critical_region, is_memcpy from devito.symbolics import IntDiv, uxreplace from devito.tools import OrderedSet, is_integer, timed_pass diff --git a/devito/passes/clusters/implicit.py b/devito/passes/clusters/implicit.py index 8bf178d6b5..560e293b5a 100644 --- a/devito/passes/clusters/implicit.py +++ b/devito/passes/clusters/implicit.py @@ -8,10 +8,9 @@ import numpy as np -from devito.ir import (Cluster, Interval, IntervalGroup, IterationSpace, Queue, - FetchUpdate, PrefetchUpdate, SEQUENTIAL, Forward, vmax, vmin) +from devito.ir import SEQUENTIAL, Queue, Forward from devito.symbolics import retrieve_dimensions -from devito.tools import Bunch, as_tuple, frozendict, timed_pass +from devito.tools import Bunch, frozendict, timed_pass from devito.types import Eq, Symbol from devito.types.grid import MultiSubDimension, SubDomainSet diff --git a/devito/passes/clusters/utils.py b/devito/passes/clusters/utils.py index 74aba55a14..7405dd984d 100644 --- a/devito/passes/clusters/utils.py +++ b/devito/passes/clusters/utils.py @@ -1,8 +1,6 @@ -from collections import defaultdict - from devito.ir import Cluster from devito.symbolics import uxreplace -from devito.tools import as_tuple, flatten +from devito.tools import as_tuple from devito.types import CriticalRegion, Eq, Symbol, Wildcard __all__ = ['makeit_ssa', 'is_memcpy', 'make_critical_sequence', diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 580ff26579..3d7d1386d9 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -12,7 +12,6 @@ from devito.ir.iet.efunc import DeviceFunction, EntryFunction from devito.symbolics import ValueLimit, evalrel, has_integer_args, limits_mapper from devito.tools import as_mapper, filter_ordered, split -from devito.types.dimension import AbstractSubDimension __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] From e5534d42eff425d095cd5b6edfa74e13735405c9 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 13 May 2024 12:49:53 +0000 Subject: [PATCH 34/58] compiler: Refactor wraps_memcpy --- devito/passes/clusters/asynchrony.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index ddefb02cb3..4026595ba5 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -295,7 +295,4 @@ def __init__(self, drop=False, syncs=None, insert=None): def wraps_memcpy(cluster): - if len(cluster.exprs) != 1: - return False - - return is_memcpy(cluster.exprs[0]) + return len(cluster.exprs) == 1 and is_memcpy(cluster.exprs[0]) From 0c018b56ab4a5e56f9c1da0c1c7407b33bc6f86a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 14 May 2024 10:36:27 +0000 Subject: 
[PATCH 35/58] compiler: Tweak double buffering --- devito/passes/clusters/buffering.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index eb3de7dba9..894fb9cccd 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -6,8 +6,8 @@ import numpy as np from devito.ir import (Cluster, Backward, Forward, GuardBound, Interval, - IntervalGroup, Properties, Queue, Vector, lower_exprs, - vmax, vmin) + IntervalGroup, IterationSpace, Properties, Queue, Vector, + lower_exprs, vmax, vmin) from devito.exceptions import InvalidOperator from devito.logger import warning from devito.passes.clusters.utils import is_memcpy @@ -471,10 +471,11 @@ def write_to(self): # Analogous to the above, we need to include the halo region as well ihalo = IntervalGroup([ - Interval(d, -h.left, h.right) - for d, h in zip(self.b.dimensions, self.b._size_halo) + Interval(i.dim, -h.left, h.right, i.stamp) + for i, h in zip(ispace, self.b._size_halo) ]) - ispace = ispace.add(ihalo) + + ispace = IterationSpace.union(ispace, IterationSpace(ihalo)) return ispace @@ -567,6 +568,8 @@ def make_mds(descriptors, sregistry): if size == 1: continue + if v.is_double_buffering: + continue p, _ = offset_from_centre(v.dim, v.indices) From 9541d8496406b46badf051dbc1b559b2c17ec40e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 14 May 2024 10:36:31 +0000 Subject: [PATCH 36/58] compiler: Revamp generation of sub-iterators --- devito/ir/clusters/algorithms.py | 7 +++++-- devito/passes/clusters/asynchrony.py | 11 +++++------ devito/passes/clusters/buffering.py | 25 ++++++++++++++++++------- devito/passes/iet/misc.py | 2 +- devito/types/dimension.py | 4 ++-- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index e3f815a2f4..774439ebcf 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -37,7 +37,7 @@ def clusterize(exprs, **kwargs): clusters = Schedule().process(clusters) # Handle SteppingDimensions - clusters = Stepper().process(clusters) + clusters = Stepper(**kwargs).process(clusters) # Handle ConditionalDimensions clusters = guard(clusters) @@ -273,6 +273,9 @@ class Stepper(Queue): sub-iterators induced by a SteppingDimension. """ + def __init__(self, sregistry=None, **kwargs): + self.sregistry = sregistry + def callback(self, clusters, prefix): if not prefix: return clusters @@ -326,7 +329,7 @@ def callback(self, clusters, prefix): siafs = sorted(iafs, key=key) for iaf in siafs: - name = '%s%d' % (si.name, len(mds)) + name = self.sregistry.make_name(prefix='t') offset = uxreplace(iaf, {si: d.root}) mds.append(ModuloDimension(name, si, offset, size, origin=iaf)) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 4026595ba5..5f827e4898 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -218,26 +218,25 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): else: findex = fetch - 1 - # If fetching into e.g. `ub[sb1]` we'll need to prefetch into e.g. `ub[sb0]` + # If fetching into e.g. `ub[t1]` we might need to prefetch into e.g. 
`ub[t0]`
     tindex0 = e.lhs.indices[d]
     if is_integer(tindex0) or isinstance(tindex0, IntDiv):
         tindex = tindex0
     else:
         assert tindex0.is_Modulo
-        subiters = [i for i in c.sub_iterators[pd] if i.parent is tindex0.parent]
-        mapper = {i.offset: i for i in subiters}
+        mapper = {(i.offset % i.modulo): i for i in c.sub_iterators[pd]}
         if direction is Forward:
             toffset = tindex0.offset + 1
         else:
             toffset = tindex0.offset - 1
         try:
-            tindex = mapper[toffset]
+            tindex = mapper[toffset % tindex0.modulo]
         except KeyError:
             # This can happen if e.g. the underlying buffer has size K (because
             # e.g. the user has passed `async_degree=K`), but the actual computation
             # only uses K-N indices (e.g., indices={t-1,t} and K=5)
-            name = sregistry.make_name(prefix='sb')
-            tindex = tindex0._rebuild(name, offset=toffset)
+            name = sregistry.make_name(prefix='t')
+            tindex = tindex0._rebuild(name, offset=toffset, origin=None)
 
     # We need a lock to synchronize the copy-in
     name = sregistry.make_name(prefix='lock')
diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py
index 894fb9cccd..e07ac18da2 100644
--- a/devito/passes/clusters/buffering.py
+++ b/devito/passes/clusters/buffering.py
@@ -155,11 +155,11 @@ def callback(self, clusters, prefix):
         init = init_buffers(descriptors, self.options)
 
         # Create all the necessary ModuloDimensions to step through the buffer
-        mds = make_mds(descriptors, self.sregistry)
+        mds = make_mds(descriptors, prefix, self.sregistry)
         subiters = as_mapper(mds.values(), lambda i: i.root)
 
         # Substitution rules to replace buffered Functions with buffers
-        # E.g., `usave[time+1, x+1, y+1] -> ub0[sb1, x+1, y+1]`
+        # E.g., `usave[time+1, x+1, y+1] -> ub0[t1, x+1, y+1]`
         subs = {}
         for b, v in descriptors.items():
             accesses = chain(*[c.scope[v.f] for c in v.clusters])
@@ -188,6 +188,7 @@ def callback(self, clusters, prefix):
                 expr = lower_exprs(expr)
 
                 ispace = v.step_to
+                ispace = ispace.augment(subiters).augment(c.sub_iterators)
 
                 if v.xd in ispace.itdims:
                     guards = c.guards.xandg(v.xd, GuardBound(0, v.first_idx.f))
@@ -211,7 +212,7 @@ def callback(self, clusters, prefix):
             )
 
             # Append the copy-back if `c` is the last-write of some buffers
-            # E.g., `usave[time+1, x] = ub[sb1, x]`
+            # E.g., `usave[time+1, x] = ub[t1, x]`
             for b, v in descriptors.items():
                 if v.is_readonly:
                     continue
@@ -228,6 +229,7 @@ def callback(self, clusters, prefix):
                 expr = lower_exprs(uxreplace(expr, v.subdims_mapper))
 
                 ispace = v.written
+                ispace = ispace.augment(subiters).augment(c.sub_iterators)
 
                 if v.xd in ispace.itdims:
                     guards = c.guards.xandg(v.xd, GuardBound(0, v.first_idx.f))
@@ -557,7 +559,7 @@ def first_idx(self):
         return Map(v % self.xd.symbolic_size, v)
 
 
-def make_mds(descriptors, sregistry):
+def make_mds(descriptors, prefix, sregistry):
     """
     Create the ModuloDimensions to step through the buffers. This is done
     inspecting all buffers so that ModuloDimensions are reused when possible.
@@ -573,7 +575,7 @@ def make_mds(descriptors, sregistry):
 
         p, _ = offset_from_centre(v.dim, v.indices)
 
-        # NOTE: indices are sorted so that the semantic order (sb0, sb1, sb2)
+        # NOTE: indices are sorted so that the semantic order (t0, t1, t2)
         # follows SymPy's index ordering (time, time-1, time+1) after modulo
        # replacement, so that associativity errors are consistent.
This very # same strategy is also applied in clusters/algorithms/Stepper @@ -582,8 +584,17 @@ def make_mds(descriptors, sregistry): for i in indices: k = (v.xd, i) - if k not in mds: - name = sregistry.make_name(prefix='sb') + if k in mds: + continue + + # Can I reuse an existing ModuloDimension or do I need to create + # a new one? + for si in prefix.sub_iterators.get(v.dim.root, []): + if si.offset == i and si.modulo == size: + mds[k] = si + break + else: + name = sregistry.make_name(prefix='t') mds[k] = ModuloDimension(name, v.xd, i, size) return mds diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 3d7d1386d9..44cec6df13 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -198,7 +198,7 @@ def remove_redundant_moddims(iet): if not mds: return iet - mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) + mapper = as_mapper(mds, key=lambda md: md.offset % md.modulo) subs = {} for k, v in mapper.items(): diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 2755d0f92f..0e4310bac3 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1077,7 +1077,7 @@ def __add__(self, other): try: if self.modulo == other.modulo: return self.origin + other.origin - except (AttributeError, TypeError): + except (AttributeError, TypeError, sympy.SympifyError): pass return super().__add__(other) @@ -1087,7 +1087,7 @@ def __sub__(self, other): try: if self.modulo == other.modulo: return self.origin - other.origin - except (AttributeError, TypeError): + except (AttributeError, TypeError, sympy.SympifyError): pass return super().__sub__(other) From f91cbf72871698a9478dd153a4a8d17a8ff7724a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 14 May 2024 08:12:24 +0000 Subject: [PATCH 37/58] compiler: Introduce InitArray and SyncArray ops --- devito/ir/support/syncs.py | 10 ++-- devito/passes/clusters/asynchrony.py | 6 +-- devito/passes/clusters/buffering.py | 6 ++- devito/passes/clusters/misc.py | 7 +-- devito/passes/iet/orchestration.py | 70 ++++++++++++---------------- 5 files changed, 48 insertions(+), 51 deletions(-) diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index 8a71e40f90..c7484e0e11 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -8,8 +8,8 @@ from devito.tools import Pickable, filter_ordered, frozendict from .utils import IMask -__all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'FetchUpdate', 'PrefetchUpdate', - 'normalize_syncs'] +__all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'InitArray', 'SyncArray', + 'PrefetchUpdate', 'normalize_syncs'] class SyncOp(Pickable): @@ -118,7 +118,11 @@ class ReleaseLock(SyncCopyOut): pass -class FetchUpdate(SyncCopyIn): +class InitArray(SyncCopyIn): + pass + + +class SyncArray(SyncCopyIn): pass diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 5f827e4898..20d8a1d530 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -2,7 +2,7 @@ from sympy import true -from devito.ir import (Forward, GuardBoundNext, WaitLock, WithLock, FetchUpdate, +from devito.ir import (Forward, GuardBoundNext, WaitLock, WithLock, SyncArray, PrefetchUpdate, ReleaseLock, normalize_syncs) from devito.passes.clusters.utils import in_critical_region, is_memcpy from devito.symbolics import IntDiv, uxreplace @@ -199,8 +199,8 @@ def _actions_from_init(c, d, actions): size = target.shape[d] assert is_integer(size) - actions[c].syncs[None].append( - FetchUpdate(None, 
target, 0, function, findex, d, size) + actions[c].syncs[d].append( + SyncArray(None, target, 0, function, findex, d, size) ) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index e07ac18da2..5cfebae5f0 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -7,7 +7,7 @@ from devito.ir import (Cluster, Backward, Forward, GuardBound, Interval, IntervalGroup, IterationSpace, Properties, Queue, Vector, - lower_exprs, vmax, vmin) + InitArray, lower_exprs, vmax, vmin) from devito.exceptions import InvalidOperator from devito.logger import warning from devito.passes.clusters.utils import is_memcpy @@ -640,7 +640,9 @@ def init_buffers(descriptors, options): properties = properties.affine(ispace.itdims) properties = properties.parallelize(ispace.itdims) - init.append(Cluster(expr, ispace, guards=guards, properties=properties)) + syncs = {None: [InitArray(None, b, 0, f, v.first_idx.f, v.dim, v.size)]} + + init.append(Cluster(expr, ispace, guards, properties, syncs)) return init diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 0a7b454944..15574beb60 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -3,8 +3,8 @@ from devito.finite_differences import IndexDerivative from devito.ir.clusters import Cluster, ClusterGroup, Queue, cluster_pass -from devito.ir.support import (SEQUENTIAL, SEPARABLE, Scope, ReleaseLock, - WaitLock, WithLock, FetchUpdate, PrefetchUpdate) +from devito.ir.support import (SEQUENTIAL, SEPARABLE, Scope, ReleaseLock, WaitLock, + WithLock, InitArray, SyncArray, PrefetchUpdate) from devito.passes.clusters.utils import in_critical_region from devito.symbolics import pow_to_mul from devito.tools import DAG, Stamp, as_tuple, flatten, frozendict, timed_pass @@ -203,8 +203,9 @@ def _key(self, c): # be fused, as in the worst case scenario the WaitLocks # get "hoisted" above the first Cluster in the sequence continue - elif (isinstance(s, (FetchUpdate, WaitLock, ReleaseLock)) or + elif (isinstance(s, (InitArray, SyncArray, WaitLock, ReleaseLock)) or (self.fusetasks and isinstance(s, WithLock))): + #TODO: turn into "not isinstance(s, PrefetchUpdate)... 
mapper[k].add(type(s)) else: mapper[k].add(s) diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 8a4195062f..483c07655c 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -5,15 +5,15 @@ from sympy import Or from devito.exceptions import InvalidOperator -from devito.ir.iet import (Call, Callable, List, SyncSpot, FindNodes, - Transformer, BlankLine, BusyWait, DummyExpr, AsyncCall, - AsyncCallable, derive_parameters) -from devito.ir.support import (WaitLock, WithLock, ReleaseLock, FetchUpdate, - PrefetchUpdate) +from devito.ir.iet import (Call, Callable, List, SyncSpot, FindNodes, Transformer, + BlankLine, BusyWait, DummyExpr, AsyncCall, AsyncCallable, + derive_parameters) +from devito.ir.support import (WaitLock, WithLock, ReleaseLock, InitArray, + SyncArray, PrefetchUpdate) from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import CondEq, CondNe -from devito.tools import as_mapper +from devito.tools import as_mapper, flatten from devito.types import HostLayer __init__ = ['Orchestrator'] @@ -72,20 +72,32 @@ def _make_withlock(self, iet, sync_ops, layer): return iet, [efunc] - def _make_fetchupdate(self, iet, sync_ops, layer): - body, prefix = fetchupdate(layer, iet, sync_ops, self.lang, self.sregistry) - + def _make_initarray(self, iet, sync_ops, layer): # Turn init IET into a Callable - name = self.sregistry.make_name(prefix=prefix) - body = List(body=body) - parameters = derive_parameters(body, ordering='canonical') - efunc = Callable(name, body, 'void', parameters, 'static') + name = self.sregistry.make_name(prefix='init_array') + parameters = derive_parameters(iet, ordering='canonical') + efunc = Callable(name, iet.body, 'void', parameters, 'static') - # Perform initial fetch by the main thread iet = Call(name, parameters) return iet, [efunc] + def _make_syncarray(self, iet, sync_ops, layer): + try: + qid = self.sregistry.queue0 + except AttributeError: + qid = None + + body = list(iet.body) + try: + body.extend([self.lang._map_update_device(s.target, s.imask, qid=qid) + for s in sync_ops]) + except NotImplementedError: + pass + iet = List(body=body) + + return iet, [] + def _make_prefetchupdate(self, iet, sync_ops, layer): body, prefix = prefetchupdate(layer, iet, sync_ops, self.lang, self.sregistry) @@ -107,15 +119,15 @@ def process(self, iet): if not sync_spots: return iet, {} + # The SyncOps are to be processed in a given order callbacks = OrderedDict([ (WaitLock, self._make_waitlock), (WithLock, self._make_withlock), - (FetchUpdate, self._make_fetchupdate), + (SyncArray, self._make_syncarray), + (InitArray, self._make_initarray), (PrefetchUpdate, self._make_prefetchupdate), (ReleaseLock, self._make_releaselock), ]) - - # The SyncOps are to be processed in a given order key = lambda s: list(callbacks).index(s) efuncs = [] @@ -135,6 +147,7 @@ def process(self, iet): efuncs.extend(v) iet = Transformer(subs).visit(iet) + efuncs = [Transformer(subs).visit(i) for i in efuncs] return iet, {'efuncs': efuncs} @@ -185,29 +198,6 @@ def _(layer, iet, sync_ops, lang, sregistry): return body, name -@singledispatch -def fetchupdate(layer, iet, sync_ops, lang, sregistry): - raise NotImplementedError - - -@fetchupdate.register(HostLayer) -def _(layer, iet, sync_ops, lang, sregistry): - try: - qid = sregistry.queue0 - except AttributeError: - qid = None - - body = list(iet.body) - try: - body.extend([lang._map_update_device(s.target, s.imask, qid=qid) - for s 
in sync_ops]) - name = 'init_from_%s' % layer.suffix - except NotImplementedError: - name = 'init_to_%s' % layer.suffix - - return body, name - - @singledispatch def prefetchupdate(layer, iet, sync_ops, lang, sregistry): raise NotImplementedError From bb6f54b337052a722a378cdb2159cd3a7b62a30b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 15 May 2024 09:26:07 +0000 Subject: [PATCH 38/58] compiler: Fix SyncArray codegen --- devito/ir/support/syncs.py | 2 +- devito/passes/clusters/asynchrony.py | 3 ++- devito/passes/clusters/buffering.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index c7484e0e11..8b6adfee0f 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -93,7 +93,7 @@ def imask(self): for d in self.target.dimensions: if d.root is self.dim.root: if self.target.is_regular: - ret.append((self.tindex, self.size)) + ret.append((self.tindex, 1)) else: ret.append((0, 1)) else: diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index 20d8a1d530..d92ae9aba6 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -194,13 +194,14 @@ def _actions_from_init(c, d, actions): function = e.rhs.function target = e.lhs.function + tindex = e.lhs.indices[d] findex = e.rhs.indices[d] size = target.shape[d] assert is_integer(size) actions[c].syncs[d].append( - SyncArray(None, target, 0, function, findex, d, size) + SyncArray(None, target, tindex, function, findex, d, size) ) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 5cfebae5f0..343682dc39 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -640,7 +640,7 @@ def init_buffers(descriptors, options): properties = properties.affine(ispace.itdims) properties = properties.parallelize(ispace.itdims) - syncs = {None: [InitArray(None, b, 0, f, v.first_idx.f, v.dim, v.size)]} + syncs = {None: [InitArray(None, b)]} init.append(Cluster(expr, ispace, guards, properties, syncs)) From 85714bed51c18420e63aa3fd96c894f9f468a52d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 15 May 2024 10:29:33 +0000 Subject: [PATCH 39/58] compiler: Switch tasking back to being a Queue --- devito/passes/clusters/asynchrony.py | 51 ++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index d92ae9aba6..f42081ad10 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -3,7 +3,7 @@ from sympy import true from devito.ir import (Forward, GuardBoundNext, WaitLock, WithLock, SyncArray, - PrefetchUpdate, ReleaseLock, normalize_syncs) + PrefetchUpdate, ReleaseLock, Queue, normalize_syncs) from devito.passes.clusters.utils import in_critical_region, is_memcpy from devito.symbolics import IntDiv, uxreplace from devito.tools import OrderedSet, is_integer, timed_pass @@ -51,17 +51,42 @@ def tasking(clusters, key0, sregistry): task Dimensions. """ key, _ = keys(key0) + return Tasking(key, sregistry).process(clusters) - locks = {} - syncs = defaultdict(lambda: defaultdict(OrderedSet)) - for c0 in clusters: - d = key(c0) - if d is None: - continue +class Tasking(Queue): + + """ + Carry out the bulk of `tasking`. 
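# [Illustration, not part of the patch] The class above follows Devito's
# Queue protocol, which this series leans on repeatedly: a subclass
# implements `callback(clusters, prefix)` and the engine invokes it once
# per iteration prefix via `process`. A minimal toy pass of the same shape;
# its name and filtering logic are invented here purely for exposition:
from devito.ir import Queue

class DropEmptyClusters(Queue):

    def callback(self, clusters, prefix):
        if not prefix:
            # Not yet inside any iteration Dimension: nothing to do
            return clusters
        # Keep only the Clusters that still carry expressions
        return [c for c in clusters if c.exprs]

# Usage: clusters = DropEmptyClusters().process(clusters)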
+ """ + + def __init__(self, key0, sregistry): + self.key0 = key0 + self.sregistry = sregistry + + def callback(self, clusters, prefix): + if not prefix: + return clusters + + dim = prefix[-1].dim + + locks = {} + syncs = defaultdict(lambda: defaultdict(OrderedSet)) + for c0 in clusters: + d = self.key0(c0) + if d is not dim: + continue + + protected = self._schedule_waitlocks(c0, d, clusters, locks, syncs) + self._schedule_withlocks(c0, d, protected, locks, syncs) + + processed = [c.rebuild(syncs=syncs.get(c, c.syncs)) for c in clusters] + + return processed + + def _schedule_waitlocks(self, c0, d, clusters, locks, syncs): # Prevent future writes to interfere with a task by waiting on a lock may_require_lock = {f for f in c0.scope.reads if f.is_AbstractFunction} - # Sort for deterministic code generation may_require_lock = sorted(may_require_lock, key=lambda i: i.name) @@ -96,7 +121,7 @@ def tasking(clusters, key0, sregistry): try: lock = locks[target] except KeyError: - name = sregistry.make_name(prefix='lock') + name = self.sregistry.make_name(prefix='lock') lock = locks[target] = Lock(name=name, dimensions=ld) for w in writes: @@ -117,7 +142,9 @@ def tasking(clusters, key0, sregistry): syncs[c2][d].add(WaitLock(lock[index], target)) protected[target].add(logical_index) - # Taskify `c0` + return protected + + def _schedule_withlocks(self, c0, d, protected, locks, syncs): for target in protected: lock = locks[target] @@ -143,10 +170,6 @@ def tasking(clusters, key0, sregistry): WithLock(lock[i], target, i, function, findex, d) ]) - processed = [c.rebuild(syncs=syncs.get(c, c.syncs)) for c in clusters] - - return processed - @timed_pass(name='memcpy_prefetch') def memcpy_prefetch(clusters, key0, sregistry): From e0d1343b45b2fc807e6be9261dd24d570a4c2e4a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 15 May 2024 14:38:27 +0000 Subject: [PATCH 40/58] compiler: Improve Section placement --- devito/ir/stree/algorithms.py | 25 ++++++++++++++++++------- tests/test_buffering.py | 2 +- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 6deeacbd48..8f6ccbb466 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -32,6 +32,10 @@ def stree_build(clusters, profiler=None, **kwargs): if reuse_whole_subtree(c, prev): tip = mapper[base].bottom maybe_reusable = prev.itintervals + elif reuse_partial_subtree(c, prev): + tip = mapper[base].middle + tip = augment_partial_subtree(c, tip, mapper, base) + maybe_reusable = [] else: # Add any guards/Syncs outside of the outermost Iteration tip = augment_whole_subtree(c, stree, mapper, base) @@ -110,10 +114,17 @@ def stree_build(clusters, profiler=None, **kwargs): any(i.is_Section for i in reversed(tip.ancestors)): continue - candidate = None - for i in reversed(tip.ancestors + (tip,)): + candidates = tuple(reversed(tip.ancestors[1:] + (tip,))) + + if not any(i.is_Iteration and i.dim.is_Time for i in candidates) and \ + not candidates[-1] is stree: + attach_section(candidates[-1]) + continue + + found = None + for i in candidates: if i.is_Halo: - candidate = i + found = i elif i.is_Sync: if profiler._verbosity > 0 or not i.is_async: attach_section(i) @@ -121,12 +132,12 @@ def stree_build(clusters, profiler=None, **kwargs): break elif i.is_Iteration: if (i.dim.is_Time and SEQUENTIAL in i.properties): - section = attach_section(candidate, section) + section = attach_section(found, section) break else: - candidate = i + found = i else: - 
attach_section(candidate, section) + attach_section(found, section) return stree @@ -229,7 +240,7 @@ def reuse_whole_subtree(c0, c1, d=None): c0.syncs.get(d) == c1.syncs.get(d)) -def augment_partial_subtree(cluster, tip, mapper, it=None): +def augment_partial_subtree(cluster, tip, mapper, it): d = it.dim if d in cluster.syncs: diff --git a/tests/test_buffering.py b/tests/test_buffering.py index c1331d3766..d4c6b54949 100644 --- a/tests/test_buffering.py +++ b/tests/test_buffering.py @@ -209,7 +209,7 @@ def test_two_homogeneous_buffers(): op2 = Operator(eqns, opt=('buffering', 'fuse')) # Check generated code - assert len(retrieve_iteration_tree(op1)) == 3 + assert len(retrieve_iteration_tree(op1)) == 5 assert len(retrieve_iteration_tree(op2)) == 3 buffers = [i for i in FindSymbols().visit(op1.body) if i.is_Array] assert len(buffers) == 2 From 5359c8db148f09061dd3a64111f7200e6e701168 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 15 May 2024 14:38:44 +0000 Subject: [PATCH 41/58] compiler: Misc cleanup --- devito/passes/clusters/misc.py | 17 ++++++++++++----- devito/passes/iet/orchestration.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 15574beb60..7ce2bd3422 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -197,20 +197,27 @@ def _key(self, c): mapper = defaultdict(set) for k, v in i.items(): for s in v: - if isinstance(s, PrefetchUpdate) or \ - (not self.fusetasks and isinstance(s, WaitLock)): + if isinstance(s, PrefetchUpdate): + continue + + if isinstance(s, WaitLock) and not self.fusetasks: # NOTE: A mix of Clusters w/ and w/o WaitLocks can safely # be fused, as in the worst case scenario the WaitLocks # get "hoisted" above the first Cluster in the sequence continue - elif (isinstance(s, (InitArray, SyncArray, WaitLock, ReleaseLock)) or - (self.fusetasks and isinstance(s, WithLock))): - #TODO: turn into "not isinstance(s, PrefetchUpdate)... 
+ + if isinstance(s, (InitArray, SyncArray, WaitLock, ReleaseLock)): + mapper[k].add(type(s)) + elif isinstance(s, WithLock) and self.fusetasks: + # NOTE: Different WithLocks aren't fused unless the user + # explicitly asks for it mapper[k].add(type(s)) else: mapper[k].add(s) + if k in mapper: mapper[k] = frozenset(mapper[k]) + strict.append(frozendict(mapper)) weak = [] diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 483c07655c..4de8806f7c 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -13,7 +13,7 @@ from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import CondEq, CondNe -from devito.tools import as_mapper, flatten +from devito.tools import as_mapper from devito.types import HostLayer __init__ = ['Orchestrator'] From 7659338b796ba47a0a4e82c0523bf6b3e718a9b4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 20 May 2024 12:27:29 +0000 Subject: [PATCH 42/58] compiler: Patch stree construction --- devito/ir/stree/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 8f6ccbb466..5622555004 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -200,7 +200,7 @@ def preprocess(clusters, options=None, **kwargs): syncs = normalize_syncs(*[c1.syncs for c1 in found]) if syncs: - ispace = c.ispace.prefix(syncs) + ispace = c.ispace.prefix(lambda d: d._defines.intersection(syncs)) processed.append(c.rebuild(exprs=[], ispace=ispace, syncs=syncs)) if all(c1.ispace.is_subset(c.ispace) for c1 in found): From 83deb160bc30e5aff5144b72e196f638cf23594b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 21 May 2024 08:03:00 +0000 Subject: [PATCH 43/58] compiler: Fix deterministic code gen --- devito/passes/clusters/buffering.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/devito/passes/clusters/buffering.py b/devito/passes/clusters/buffering.py index 343682dc39..55796bbd51 100644 --- a/devito/passes/clusters/buffering.py +++ b/devito/passes/clusters/buffering.py @@ -123,7 +123,10 @@ class InjectBuffers(Queue): def __init__(self, mapper, sregistry, options): super().__init__() - self.mapper = mapper + # Sort the mapper so that we always process the same Function in the + # same order, hence we get deterministic code generation + self.mapper = {i: mapper[i] for i in sorted(mapper, key=lambda i: i.name)} + self.sregistry = sregistry self.options = options From 951dda4a6112ae6c2d4e49860d9e3247f1dc5c73 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 17 May 2024 14:22:16 +0000 Subject: [PATCH 44/58] compiler: Revamp async codegen --- devito/core/gpu.py | 2 +- devito/ir/clusters/cluster.py | 13 +- devito/ir/iet/nodes.py | 6 + devito/ir/stree/algorithms.py | 5 +- devito/ir/support/syncs.py | 50 +++- devito/passes/clusters/asynchrony.py | 5 +- devito/passes/clusters/misc.py | 3 + devito/passes/iet/asynchrony.py | 332 +++++++++++++++------------ devito/passes/iet/engine.py | 89 ++++--- devito/passes/iet/linearization.py | 12 +- devito/passes/iet/orchestration.py | 32 +-- devito/types/basic.py | 4 + devito/types/misc.py | 17 +- tests/test_gpu_common.py | 30 +-- 14 files changed, 369 insertions(+), 231 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 228849c67e..414dbf268b 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -293,7 +293,7 @@ def _make_iet_passes_mapper(cls, 
**kwargs):
         sregistry = kwargs['sregistry']
 
         parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
-        orchestrator = cls._Target.Orchestrator(sregistry)
+        orchestrator = cls._Target.Orchestrator(**kwargs)
 
         return {
             'parallel': parizer.make_parallel,
diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
index b81b136abb..7a13fa5222 100644
--- a/devito/ir/clusters/cluster.py
+++ b/devito/ir/clusters/cluster.py
@@ -6,8 +6,8 @@
 from devito.ir.equations import ClusterizedEq
 from devito.ir.support import (PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext,
                                Forward, Interval, IntervalGroup, IterationSpace,
-                               DataSpace, Guards, Properties, Scope, WithLock,
-                               PrefetchUpdate, detect_accesses, detect_io,
+                               DataSpace, Guards, Properties, Scope, WaitLock,
+                               WithLock, PrefetchUpdate, detect_accesses, detect_io,
                                normalize_properties, normalize_syncs, minimum,
                                maximum, null_ispace)
 from devito.mpi.halo_scheme import HaloScheme, HaloTouch
@@ -278,6 +278,15 @@ def is_async(self):
         return any(isinstance(s, (WithLock, PrefetchUpdate))
                    for s in flatten(self.syncs.values()))
 
+    @property
+    def is_wait(self):
+        """
+        True if the Cluster is waiting on a lock (that is, a special
+        synchronization operation), False otherwise.
+        """
+        return any(isinstance(s, WaitLock)
+                   for s in flatten(self.syncs.values()))
+
     @cached_property
     def dtype(self):
         """
diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py
index b824bc8c5c..5b778ad7b9 100644
--- a/devito/ir/iet/nodes.py
+++ b/devito/ir/iet/nodes.py
@@ -1387,6 +1387,12 @@ def is_async_op(self):
         return any(isinstance(s, (WithLock, PrefetchUpdate))
                    for s in self.sync_ops)
 
+    @property
+    def functions(self):
+        ret = [(s.lock, s.function, s.target) for s in self.sync_ops]
+        ret = tuple(filter_ordered(f for f in flatten(ret) if f is not None))
+        return ret
+
 
 class CBlankLine(List):
 
diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py
index 5622555004..94a7f4e592 100644
--- a/devito/ir/stree/algorithms.py
+++ b/devito/ir/stree/algorithms.py
@@ -171,8 +171,9 @@ def preprocess(clusters, options=None, **kwargs):
         elif c.is_dist_reduce:
             processed.append(c)
 
-        elif c.is_critical_region and c.syncs:
-            processed.append(c.rebuild(exprs=None, guards=c.guards, syncs=c.syncs))
+        elif c.is_critical_region:
+            if c.is_wait:
+                processed.append(c.rebuild(exprs=[], syncs=c.syncs))
 
         elif c.is_wild:
             continue
diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py
index 8b6adfee0f..80dbeae8cd 100644
--- a/devito/ir/support/syncs.py
+++ b/devito/ir/support/syncs.py
@@ -5,14 +5,18 @@
 from collections import defaultdict
 
 from devito.data import FULL
-from devito.tools import Pickable, filter_ordered, frozendict
+from devito.tools import Pickable, as_tuple, filter_ordered, frozendict
 from .utils import IMask
 
 __all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'InitArray', 'SyncArray',
-           'PrefetchUpdate', 'normalize_syncs']
+           'PrefetchUpdate', 'SnapOut', 'SnapIn', 'Ops', 'normalize_syncs']
 
 
-class SyncOp(Pickable):
+class Operation(Pickable):
+    pass
+
+
+class SyncOp(Operation):
 
     __rargs__ = ('handle', 'target')
     __rkwargs__ = ('tindex', 'function', 'findex', 'dim', 'size', 'origin')
@@ -45,13 +49,16 @@ def __hash__(self):
                      self.function, self.findex, self.dim, self.size,
                      self.origin))
 
     def __repr__(self):
-        return "%s<%s>" % (self.__class__.__name__, self.handle)
+        return "%s<%s>" % (self.__class__.__name__, self.handle.name)
 
     __str__ = __repr__
 
     @property
     def lock(self):
-        return self.handle.function
+        try:
+            return self.handle.function
+        except
AttributeError: + return None # Pickling support __reduce_ex__ = Pickable.__reduce_ex__ @@ -130,6 +137,37 @@ class PrefetchUpdate(SyncCopyIn): pass +class SnapOut(SyncOp): + pass + + +class SnapIn(SyncOp): + pass + + +class Ops(frozendict): + + """ + A mapper {Dimension -> {SyncOps}}. + """ + + @property + def dimensions(self): + return tuple(self) + + def add(self, dims, ops): + m = dict(self) + for d in as_tuple(dims): + m[d] = set(self.get(d, [])) | set(as_tuple(ops)) + return Ops(m) + + def update(self, ops): + m = dict(self) + for d, v in ops.items(): + m[d] = set(self.get(d, [])) | set(v) + return Ops(m) + + def normalize_syncs(*args): if not args: return {} @@ -149,4 +187,4 @@ def normalize_syncs(*args): # We do not allow mixing up WaitLock and WithLock ops raise ValueError("Incompatible SyncOps") - return frozendict(syncs) + return Ops(syncs) diff --git a/devito/passes/clusters/asynchrony.py b/devito/passes/clusters/asynchrony.py index f42081ad10..ff62ef3ce4 100644 --- a/devito/passes/clusters/asynchrony.py +++ b/devito/passes/clusters/asynchrony.py @@ -80,7 +80,7 @@ def callback(self, clusters, prefix): protected = self._schedule_waitlocks(c0, d, clusters, locks, syncs) self._schedule_withlocks(c0, d, protected, locks, syncs) - processed = [c.rebuild(syncs=syncs.get(c, c.syncs)) for c in clusters] + processed = [c.rebuild(syncs={**c.syncs, **syncs[c]}) for c in clusters] return processed @@ -278,12 +278,13 @@ def _actions_from_update_memcpy(c, d, clusters, actions, sregistry): guard0 = c.guards.get(d, true)._subs(fetch, findex) guard1 = GuardBoundNext(function.indices[d], direction) - guards = c.guards.impose(d, guard0).xandg(pd, guard1) + guards = c.guards.impose(d, guard0 & guard1) syncs = {d: [ ReleaseLock(handle, target), PrefetchUpdate(handle, target, tindex, function, findex, d, 1, e.rhs) ]} + syncs = {**c.syncs, **syncs} pc = c.rebuild(exprs=expr, ispace=ispace, guards=guards, syncs=syncs) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 7ce2bd3422..d9bc1851e7 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -234,6 +234,9 @@ def _key(self, c): any(f._mem_shared for f in c.scope.reads) ]) + #weak.append(any(f.is_TimeFunction and f.save is not None + # for f in c.scope.writes)) + # Promoting adjacency of IndexDerivatives will maximize their reuse weak.append(any(e.find(IndexDerivative) for e in c.exprs)) diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index 29b9cce8a4..47e464fcdd 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -1,12 +1,14 @@ -from collections import OrderedDict +from collections import OrderedDict, namedtuple +from functools import singledispatch from ctypes import c_int import cgen as c from devito.ir import (AsyncCall, AsyncCallable, BlankLine, Call, Callable, - Conditional, DummyExpr, FindNodes, FindSymbols, - Iteration, List, PointerCast, Return, ThreadCallable, - Transformer, While, make_callable, maybe_alias) + Conditional, DummyEq, DummyExpr, While, Increment, Iteration, + List, PointerCast, Return, FindNodes, FindSymbols, + ThreadCallable, EntryFunction, Transformer, Uxreplace, + make_callable, maybe_alias) from devito.passes.iet.definitions import DataManager from devito.passes.iet.engine import iet_pass from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, @@ -19,20 +21,87 @@ def pthreadify(graph, **kwargs): - lower_async_callables(graph, root=graph.root, **kwargs) + if not 
any(isinstance(i, AsyncCallable) for i in graph.efuncs.values()):
+        return
 
-    track = {i.name: i.sdata for i in graph.efuncs.values()
-             if isinstance(i, ThreadCallable)}
-    lower_async_calls(graph, track=track, **kwargs)
+    # A key function to help identify all non-constant Symbols in the IET
+    defines = FindSymbols('defines').visit(graph.root.body)
+    key = lambda i: i in defines and i.is_Symbol
 
+    lower_async_objs(graph, key=key, tracker={}, **kwargs)
+
+    # We need this one here to initialize the newly introduced objects, such as
+    # SharedData and PThreadArray, as well as the packed/unpacked symbols
     DataManager(**kwargs).place_definitions(graph)
 
 
+AsyncMeta = namedtuple('AsyncMeta', 'sdata threads init shutdown')
+
+
 @iet_pass
-def lower_async_callables(iet, root=None, sregistry=None):
-    if not isinstance(iet, AsyncCallable):
-        return iet, {}
+def lower_async_objs(iet, **kwargs):
+    # Different actions depending on the Callable type
+    iet, efuncs = _lower_async_objs(iet, **kwargs)
+
+    metadata = {'includes': ['pthread.h'],
+                'efuncs': efuncs}
+
+    if not isinstance(iet, EntryFunction):
+        return iet, metadata
+
+    iet, efuncs = inject_async_tear_updown(iet, **kwargs)
+    metadata['efuncs'].extend(efuncs)
+
+    return iet, metadata
+
+
+@singledispatch
+def _lower_async_objs(iet, tracker=None, sregistry=None, **kwargs):
+    # All Callables, except for AsyncCallables, may contain one or more
+    # AsyncCalls, which we have to lower into thread-activation code
+    efuncs = []
+    subs = {}
+    for n in FindNodes(AsyncCall).visit(iet):
+        # The efuncs in a Graph are processed bottom-up, so we can safely assume
+        # that the `AsyncCallable` corresponding to `n` has already been processed
+        assert n.name in tracker
+        sdata = tracker[n.name].sdata
+
+        if sdata.size == 1:
+            d = sdata.index
+            condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1)
+            activation = [While(condition)]
+        else:
+            d = Temp(name=sregistry.make_name(prefix=sdata.index.name))
+            condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1)
+            activation = [DummyExpr(d, 0),
+                          While(condition, DummyExpr(d, (d + 1) % sdata.size))]
+
+        arguments = []
+        for i in sdata.ncfields:
+            for a in n.arguments:
+                if maybe_alias(a, i):
+                    arguments.append(a)
+                    break
+            else:
+                arguments.append(i)
+        activation.extend([DummyExpr(FieldFromComposite(i.base, sdata[d]), i)
+                           for i in arguments])
+        activation.append(
+            DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2)
+        )
+
+        name = sregistry.make_name(prefix='activate')
+        efunc = make_callable(name, activation)
+
+        efuncs.append(efunc)
+        subs[n] = Call(name, efunc.parameters)
+
+    iet = Transformer(subs).visit(iet)
+
+    return iet, efuncs
+
+
+@_lower_async_objs.register(AsyncCallable)
+def _(iet, key=None, tracker=None, sregistry=None, **kwargs):
     # Determine the max number of threads that can run this `iet` in parallel
     locks = [i for i in iet.parameters if isinstance(i, Lock)]
     npthreads = min([i.size for i in locks], default=1)
@@ -44,9 +113,7 @@
     # `ncfields` are instead the non-constant fields, that is the fields whose
     # value may or may not change across different calls to `iet`. Clearly objects
    # passed by pointer don't really matter
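# [Illustration, not part of the patch] The host/pthread handshake that the
# machinery below (together with the activation code above) generates,
# sketched in plain Python. Flag values: 0 = shut down, 1 = idle, 2 = work
# pending. `run_task` is a made-up stand-in for the AsyncCallable body, and
# the reset back to idle is our reading of the generated code, not a
# verbatim copy of it.
import threading

class SDataSketch(object):
    def __init__(self):
        self.flag = 1        # set by the init efunc
        self.payload = None  # stands in for the ncfields

def thread_body(sdata, run_task):
    while sdata.flag != 0:        # keep spinning until shutdown
        if sdata.flag == 2:       # the main thread signalled work
            run_task(sdata.payload)
            sdata.flag = 1        # presumably back to idle

def activate(sdata, payload):
    while sdata.flag != 1:        # busy-wait for an idle pthread
        pass
    sdata.payload = payload       # pass the non-constant fields down
    sdata.flag = 2                # fire

def shutdown(sdata, thread):
    while sdata.flag == 2:        # drain any pending work
        pass
    sdata.flag = 0
    thread.join()

# Usage sketch:
#   sdata = SDataSketch()
#   t = threading.Thread(target=thread_body, args=(sdata, print))
#   t.start(); activate(sdata, 42); shutdown(sdata, t)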
Clearly objects # passed by pointer don't really matter - fields = iet.parameters - defines = FindSymbols('defines').visit(root.body) - ncfields, cfields = split(fields, lambda i: i in defines and i.is_Symbol) + ncfields, cfields = split(iet.parameters, key) # Postprocess `ncfields` ncfields = sanitize_ncfields(ncfields) @@ -54,7 +121,7 @@ def lower_async_callables(iet, root=None, sregistry=None): # SharedData -- that is the data structure that will be used by the # main thread to pass information down to the child thread(s) sdata = SharedData( - name='sdata', + name=sregistry.make_name(prefix='sdata'), npthreads=npthreads, cfields=cfields, ncfields=ncfields, @@ -62,6 +129,14 @@ def lower_async_callables(iet, root=None, sregistry=None): ) sbase = sdata.indexed + # PThreadArray -- that is he pthreads that will execute the AsyncCallable + # asynchronously + threads = PThreadArray( + name=sregistry.make_name(prefix='threads'), + npthreads=npthreads + ) + tbase = threads.indexed + # Prepend the SharedData fields available upon thread activation preactions = [DummyExpr(i, FieldFromPointer(i.base, sbase)) for i in ncfields] preactions.append(BlankLine) @@ -76,7 +151,10 @@ def lower_async_callables(iet, root=None, sregistry=None): # The thread has work to do when it receives the signal that all locks have # been set to 0 by the main thread - wrap = Conditional(CondEq(FieldFromPointer(sdata.symbolic_flag, sbase), 2), wrap) + wrap = Conditional( + CondEq(FieldFromPointer(sdata.symbolic_flag, sbase), 2), + wrap + ) # The thread keeps spinning until the alive flag is set to 0 by the main thread wrap = While(CondNe(FieldFromPointer(sdata.symbolic_flag, sbase), 0), wrap) @@ -97,140 +175,94 @@ def lower_async_callables(iet, root=None, sregistry=None): body = iet.body._rebuild(body=[wrap, Return(Null)], unpacks=unpacks) iet = ThreadCallable(iet.name, body, tparameter) - return iet, {'includes': ['pthread.h']} - - -@iet_pass -def lower_async_calls(iet, track=None, sregistry=None): - # Definitely there won't be AsyncCalls within ThreadCallables - if isinstance(iet, ThreadCallable): - return iet, {} - - # Create efuncs to initialize the SharedData objects - efuncs = OrderedDict() - for n in FindNodes(AsyncCall).visit(iet): - if n.name in efuncs: - continue - - assert n.name in track - sdata = track[n.name] - sbase = sdata.indexed - name = sregistry.make_name(prefix='init_%s' % sdata.name) - body = [] - for i in sdata.cfields: - if i.is_AbstractFunction: - body.append( - DummyExpr(FieldFromPointer(i._C_symbol, sbase), i._C_symbol) - ) - else: - body.append(DummyExpr(FieldFromPointer(i.base, sbase), i.base)) - body.extend([ - BlankLine, - DummyExpr(FieldFromPointer(sdata.symbolic_flag, sbase), 1) - ]) - parameters = sdata.cfields + (sdata,) - efuncs[n.name] = Callable(name, body, 'void', parameters, 'static') - - # Transform AsyncCalls - nqueues = 1 # Number of allocated asynchronous queues so far - initialization = [] - finalization = [] - mapper = {} - for n in FindNodes(AsyncCall).visit(iet): - # Bind the abstract `sdata` to `n` - name = sregistry.make_name(prefix='sdata') - sdata = track[n.name]._rebuild(name=name) - - # The pthreads that will execute the AsyncCallable asynchronously - name = sregistry.make_name(prefix='threads') - threads = PThreadArray(name=name, npthreads=sdata.npthreads) - - # Call to `sdata` initialization Callable - sbase = sdata.indexed - d = threads.index - arguments = [] - for a in n.arguments: - if any(maybe_alias(a, i) for i in sdata.ncfields): - continue - elif isinstance(a, 
QueueID): - # Different pthreads use different queues - arguments.append(nqueues + d) - else: - arguments.append(a) - # Each pthread has its own SharedData copy - arguments.append(sbase + d) - assert len(efuncs[n.name].parameters) == len(arguments) - call0 = Call(efuncs[n.name].name, arguments) - - # Create pthreads - tbase = threads.indexed - call1 = Call('pthread_create', ( - tbase + d, Null, Call(n.name, [], is_indirect=True), sbase + d - )) - nqueues += threads.size - - # Glue together the initialization pieces - if threads.size == 1: - callback = lambda body: body + d = sdata.index + if sdata.size == 1: + callback = lambda body: body + else: + footer = [BlankLine, + Increment(DummyEq(sbase, 1))] + callback = lambda body: Iteration(list(body) + footer, d, threads.size - 1) + + # Create an efunc to initialize `sdata` and tear up the pthreads + name = 'init_%s' % sdata.name + body = [] + for i in sdata.cfields: + if i.is_AbstractFunction: + body.append( + DummyExpr(FieldFromPointer(i._C_symbol, sbase), i._C_symbol) + ) + elif isinstance(i, QueueID): + body.append(DummyExpr(FieldFromPointer(i, sbase), i + d)) else: - callback = lambda body: Iteration(body, d, threads.size - 1) - initialization.append(List( - body=callback([call0, call1]) + body.append(DummyExpr(FieldFromPointer(i.base, sbase), i.base)) + body.extend([ + BlankLine, + DummyExpr(FieldFromPointer(sdata.symbolic_flag, sbase), 1) + ]) + body.extend([ + BlankLine, + Call('pthread_create', ( + tbase + d, Null, Call(iet.name, [], is_indirect=True), sbase )) + ]) + body = callback(body) + parameters = sdata.cfields + (sdata, threads) + init = Callable(name, body, 'void', parameters, 'static') + + # Create an efunc to shutdown the pthreads + name = sregistry.make_name(prefix='shutdown') + body = List(body=callback([ + While(CondEq(FieldFromPointer(sdata.symbolic_flag, sbase), 2)), + DummyExpr(FieldFromPointer(sdata.symbolic_flag, sbase), 0), + Call('pthread_join', (threads[d], Null)) + ])) + shutdown = make_callable(name, body) + + # Track all the newly created objects + tracker[iet.name] = AsyncMeta(sdata, threads, init, shutdown) + + return iet, [] + + +def inject_async_tear_updown(iet, tracker=None, **kwargs): + # Number of allocated asynchronous queues so far + nqueues = 1 + + tearup = [] + teardown = [] + for sdata, threads, init, shutdown in tracker.values(): + # Tear-up + arguments = list(init.parameters) + for n, a in enumerate(list(arguments)): + if isinstance(a, QueueID): + # Different pthreads use different queues + arguments[n] = nqueues + tearup.append(Call(init.name, arguments)) + nqueues += threads.size # Update the next available queue ID + + # Tear-down + arguments = list(shutdown.parameters) + teardown.append(Call(shutdown.name, arguments)) + + # Inject tearup and teardown + tearup = List( + header=c.Comment("Fire up and initialize pthreads"), + body=tearup + [BlankLine] + ) + teardown = List( + header=c.Comment("Wait for completion of pthreads"), + body=teardown + ) + body = iet.body._rebuild( + body=[tearup] + list(iet.body.body) + [BlankLine, teardown] + ) + iet = iet._rebuild(body=body) - # Finalization - name = sregistry.make_name(prefix='shutdown') - body = List(body=callback([ - While(CondEq(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2)), - DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 0), - Call('pthread_join', (threads[d], Null)) - ])) - efunc = efuncs[name] = make_callable(name, body) - finalization.append(Call(name, efunc.parameters)) - - # Activation - if threads.size == 1: - d = 
threads.index - condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) - activation = [While(condition)] - else: - d = Temp(name=sregistry.make_name(prefix=threads.index.name)) - condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) - activation = [DummyExpr(d, 0), - While(condition, DummyExpr(d, (d + 1) % threads.size))] - activation.extend([DummyExpr(FieldFromComposite(i.base, sdata[d]), i) - for i in sdata.ncfields]) - activation.append( - DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2) - ) - name = sregistry.make_name(prefix='activate') - efunc = efuncs[name] = make_callable(name, activation) - mapper[n] = Call(name, efunc.parameters) - - if mapper: - # Inject activation - iet = Transformer(mapper).visit(iet) - - # Inject initialization and finalization - initialization = List( - header=c.Comment("Fire up and initialize pthreads"), - body=initialization + [BlankLine] - ) - - finalization = List( - header=c.Comment("Wait for completion of pthreads"), - body=finalization - ) - - body = iet.body._rebuild( - body=[initialization] + list(iet.body.body) + [BlankLine, finalization] - ) - iet = iet._rebuild(body=body) - else: - assert not initialization - assert not finalization + efuncs = [] + efuncs.extend(i.init for i in tracker.values()) + efuncs.extend(i.shutdown for i in tracker.values()) - return iet, {'efuncs': tuple(efuncs.values())} + return iet, efuncs # *** Utils @@ -248,3 +280,11 @@ def sanitize_ncfields(ncfields): sanitized.append(i) return sanitized + + +def retrieve_sdata(call): + #TODO: DROP? + for a in call.arguments: + if isinstance(a, SharedData): + return a + return None diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 7b93ceedfd..f6b56a776b 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -1,18 +1,18 @@ -from collections import OrderedDict +from collections import OrderedDict, defaultdict from functools import partial, singledispatch, wraps -from devito.ir.iet import (Call, ExprStmt, Iteration, SyncSpot, FindNodes, - FindSymbols, MapNodes, MetaCall, Transformer, +from devito.ir.iet import (Call, ExprStmt, Iteration, SyncSpot, AsyncCallable, + FindNodes, FindSymbols, MapNodes, MetaCall, Transformer, EntryFunction, ThreadCallable, Uxreplace, derive_parameters) from devito.ir.support import SymbolRegistry from devito.mpi.distributed import MPINeighborhood from devito.passes import needs_transfer from devito.symbolics import FieldFromComposite, FieldFromPointer -from devito.tools import (DAG, as_mapper, as_tuple, filter_ordered, - sorted_priority, timed_pass) +from devito.tools import DAG, as_tuple, filter_ordered, sorted_priority, timed_pass from devito.types import (Array, Bundle, CompositeObject, Lock, IncrDimension, - Indirection, SharedData, ThreadArray, Temp) + Indirection, Pointer, SharedData, ThreadArray, Temp, + NPThreads, NThreadsBase) from devito.types.args import ArgProvider from devito.types.dense import DiscreteFunction from devito.types.dimension import AbstractIncrDimension, BlockDimension @@ -252,23 +252,26 @@ def reuse_compounds(efuncs, sregistry=None): in this case the transformed `foo` is also syntactically identical to the input `foo`, but this isn't necessarily the case. 
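# [Illustration, not part of the patch] What reusing a compound buys us, in
# bare ctypes terms: structurally identical struct types collapse onto a
# single definition, so one `struct tsd0` is emitted rather than N clones.
# The field list below is invented for the example ('tsd' is the type-name
# prefix used by the pass, 'flag'/'time' are fields seen elsewhere in this
# series):
import ctypes

class tsd0(ctypes.Structure):
    _fields_ = [('flag', ctypes.c_int), ('time', ctypes.c_int)]

# Both SharedData-like objects can now be declared with the one type
sdata0 = tsd0(flag=1)
sdata1 = tsd0(flag=1)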
""" - mapper = {} + # Build {key -> [objs]} where [objs] can be, or are already, mapped to the + # same abstract compound type + candidates = defaultdict(list) for efunc in efuncs.values(): - local_sregistry = SymbolRegistry() for i in FindSymbols().visit(efunc): - abstract_compound(i, mapper, local_sregistry) + a = abstract_compound(i) + if a is not None: + candidates[a._C_ctype].append(i) - key = lambda i: mapper[i]._C_ctype subs = {} - for v in as_mapper(mapper, key).values(): - if len(v) == 1: + for v in candidates.values(): + # Do they actually need to be remapped to a common type or do they + # already share the same type? + ctypes = {i._C_ctype for i in v} + if len(ctypes) == 1: continue - # Recreate now using a globally unique type name - abstract_compound(v[0], subs, sregistry) - base = subs[v[0]] + base = abstract_compound(v[0], sregistry) - subs.update({i: base._rebuild(name=mapper[i].name) for i in v}) + subs.update({i: base._rebuild(name=i.name) for i in v}) # Replace all occurrences in the form of FieldFrom{Composite,Pointer} mapper = {} @@ -291,7 +294,7 @@ def reuse_compounds(efuncs, sregistry=None): @singledispatch -def abstract_compound(i, mapper, sregistry): +def abstract_compound(i, sregistry=None): """ Singledispatch-based implementation of type abstraction. """ @@ -299,15 +302,17 @@ def abstract_compound(i, mapper, sregistry): @abstract_compound.register(SharedData) -def _(i, mapper, sregistry): - pname = sregistry.make_name(prefix="tsd") +def _(i, sregistry=None): + try: + pname = sregistry.make_name(prefix='tsd') + except AttributeError: + pname = 'tsd' m = abstract_objects(i.fields) - cfields = [m.get(i, i) for i in i.cfields] - ncfields = [m.get(i, i) for i in i.ncfields] + cfields = tuple(m.get(i, i) for i in i.cfields) + ncfields = tuple(m.get(i, i) for i in i.ncfields) - mapper[i] = i._rebuild(cfields=cfields, ncfields=ncfields, pname=pname, - function=None) + return i._rebuild(pname=pname, cfields=cfields, ncfields=ncfields, function=None) def reuse_efuncs(root, efuncs, sregistry=None): @@ -339,8 +344,14 @@ def reuse_efuncs(root, efuncs, sregistry=None): continue efunc = efuncs[i] - afunc = abstract_efunc(efunc) + # Avoid premature lowering of AsyncCalls -- it would obscenely + # complicate the logic of the `pthredify` pass + if isinstance(efunc, AsyncCallable): + mapper[len(mapper)] = (efunc, [efunc]) + continue + + afunc = abstract_efunc(efunc) key = afunc._signature() try: @@ -377,7 +388,7 @@ def abstract_efunc(efunc): - Arrays are renamed as "a0", "a1", ... - Objects are renamed as "o0", "o1", ... 
""" - functions = FindSymbols('symbolics|dimensions').visit(efunc) + functions = FindSymbols('basics|symbolics|dimensions').visit(efunc) mapper = abstract_objects(functions) @@ -463,9 +474,9 @@ def _(i, mapper, sregistry): @abstract_object.register(ThreadArray) def _(i, mapper, sregistry): if isinstance(i, SharedData): - name = sregistry.make_name(prefix='sd') + name = sregistry.make_name(prefix='sdata') else: - name = sregistry.make_name(prefix='pta') + name = sregistry.make_name(prefix='threads') v = i._rebuild(name=name) @@ -515,16 +526,28 @@ def _(i, mapper, sregistry): @abstract_object.register(Indirection) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild(name=sregistry.make_name(prefix='ind')) + + @abstract_object.register(Temp) def _(i, mapper, sregistry): - if isinstance(i, Indirection): - name = sregistry.make_name(prefix='ind') - else: - name = sregistry.make_name(prefix='r') + mapper[i] = i._rebuild(name=sregistry.make_name(prefix='r')) - v = i._rebuild(name=name) - mapper[i] = v +@abstract_object.register(Pointer) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild(name=sregistry.make_name(prefix='ptr')) + + +@abstract_object.register(NPThreads) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild(name=sregistry.make_name(prefix='npthreads')) + + +@abstract_object.register(NThreadsBase) +def _(i, mapper, sregistry): + mapper[i] = i._rebuild(name=sregistry.make_name(prefix='nthreads')) def update_args(root, efuncs, dag): diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index 4336a927cb..b5645ed20a 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -239,10 +239,16 @@ def linearize_accesses(iet, key0, tracker=None): # 2) What `iet` *offers* # E.g. `{x_fsz0 -> u_vec->size[1]}` defines = FindSymbols('defines-aliases').visit(iet) - offers = filter_ordered(f for f in defines if key0(f)) + offers = filter_ordered(i for i in defines if key0(i.function)) instances = {} - for f in offers: - for d in f.dimensions[1:]: + for i in offers: + f = i.function + try: + dimensions = i.dimensions + except AttributeError: + continue + + for d in dimensions[1:]: try: fsz = tracker.get_size(f, d) except KeyError: diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 4de8806f7c..dbc77aadd0 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -1,19 +1,18 @@ from collections import OrderedDict from functools import singledispatch -import cgen as c from sympy import Or from devito.exceptions import InvalidOperator from devito.ir.iet import (Call, Callable, List, SyncSpot, FindNodes, Transformer, BlankLine, BusyWait, DummyExpr, AsyncCall, AsyncCallable, - derive_parameters) + make_callable, derive_parameters) from devito.ir.support import (WaitLock, WithLock, ReleaseLock, InitArray, - SyncArray, PrefetchUpdate) + SyncArray, PrefetchUpdate, SnapOut, SnapIn) from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import CondEq, CondNe -from devito.tools import as_mapper +from devito.tools import as_mapper, as_tuple from devito.types import HostLayer __init__ = ['Orchestrator'] @@ -30,13 +29,11 @@ class Orchestrator: The language used to implement host-device data movements. 
""" - def __init__(self, sregistry): + def __init__(self, sregistry=None, **kwargs): self.sregistry = sregistry def _make_waitlock(self, iet, sync_ops, *args): waitloop = List( - header=c.Comment("Wait for `%s` to be transferred" % - ",".join(s.target.name for s in sync_ops)), body=BusyWait(Or(*[CondEq(s.handle, 0) for s in sync_ops])), ) @@ -72,16 +69,23 @@ def _make_withlock(self, iet, sync_ops, layer): return iet, [efunc] - def _make_initarray(self, iet, sync_ops, layer): - # Turn init IET into a Callable - name = self.sregistry.make_name(prefix='init_array') - parameters = derive_parameters(iet, ordering='canonical') - efunc = Callable(name, iet.body, 'void', parameters, 'static') + def _make_callable(self, name, iet, *args): + name = self.sregistry.make_name(prefix=name) + efunc = make_callable(name, iet.body) - iet = Call(name, parameters) + iet = Call(name, efunc.parameters) return iet, [efunc] + def _make_initarray(self, iet, *args): + return self._make_callable('init_array', iet, *args) + + def _make_snapout(self, iet, *args): + return self._make_callable('write_snapshot', iet, *args) + + def _make_snapin(self, iet, *args): + return self._make_callable('read_snapshot', iet, *args) + def _make_syncarray(self, iet, sync_ops, layer): try: qid = self.sregistry.queue0 @@ -125,6 +129,8 @@ def process(self, iet): (WithLock, self._make_withlock), (SyncArray, self._make_syncarray), (InitArray, self._make_initarray), + (SnapOut, self._make_snapout), + (SnapIn, self._make_snapin), (PrefetchUpdate, self._make_prefetchupdate), (ReleaseLock, self._make_releaselock), ]) diff --git a/devito/types/basic.py b/devito/types/basic.py index 2b01bd71c2..394d6276b6 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -1504,6 +1504,10 @@ def __new__(cls, *args, function=None, **kwargs): def function(self): return self._function + @property + def dimensions(self): + return self.function.dimensions + def _hashable_content(self): return super()._hashable_content() + (self.function,) diff --git a/devito/types/misc.py b/devito/types/misc.py index 34e50cf133..761bc81e51 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -4,7 +4,7 @@ import sympy from sympy.core.core import ordering_of_classes -from devito.types import Array, CompositeObject, Indexed, Symbol +from devito.types import Array, CompositeObject, Indexed, Symbol, LocalObject from devito.types.basic import IndexedData from devito.tools import Pickable, as_tuple @@ -137,16 +137,17 @@ def _defines(self): return frozenset().union(*[i._defines for i in self]) -class Pointer(Symbol): +class Pointer(LocalObject): - @classmethod - def __dtype_setup__(cls, **kwargs): - return kwargs.get('dtype', c_void_p) + __rkwargs__ = LocalObject.__rkwargs__ + ('dtype',) + + def __init__(self, *args, dtype=c_void_p, **kwargs): + super().__init__(*args, **kwargs) + self._dtype = dtype @property - def _C_ctype(self): - # `dtype` is a ctypes-derived type! 
- return self.dtype + def dtype(self): + return self._dtype class Indirection(Symbol): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index d791511d3f..da9b3fdbe7 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -223,13 +223,13 @@ def test_tasking_fused(self): assert len(op._func_table) == 5 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) b = 17 if configuration['language'] == 'openacc' else 16 # No `qid` w/ OMP - assert str(exprs[b]) == 'const int deviceid = sdata->deviceid;' - assert str(exprs[b+1]) == 'volatile int time = sdata->time;' + assert str(exprs[b]) == 'const int deviceid = sdata0->deviceid;' + assert str(exprs[b+1]) == 'volatile int time = sdata0->time;' assert str(exprs[b+2]) == 'lock0[0] = 1;' assert exprs[b+3].write is u assert exprs[b+4].write is v assert str(exprs[b+5]) == 'lock0[0] = 2;' - assert str(exprs[b+6]) == 'sdata->flag = 1;' + assert str(exprs[b+6]) == 'sdata0->flag = 1;' op.apply(time_M=nt-2) @@ -258,7 +258,7 @@ def test_tasking_unfused_two_locks(self): # Check generated code trees = retrieve_iteration_tree(op) assert len(trees) == 3 - assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 4 + assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 5 sections = FindNodes(Section).visit(op) assert len(sections) == 2 assert (str(sections[1].body[0].body[0].body[0].body[0]) == @@ -366,7 +366,7 @@ def test_tasking_multi_output(self): op1 = Operator(eqns, opt=('tasking', 'orchestrate', {'linearize': False})) # Check generated code - assert len(retrieve_iteration_tree(op1)) == 3 + assert len(retrieve_iteration_tree(op1)) == 2 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op1) assert len(sections) == 1 @@ -404,7 +404,7 @@ def test_tasking_lock_placement(self): op = Operator(eqns, opt=('tasking', 'orchestrate')) # Check generated code -- the wait-lock is expected in section1 - assert len(retrieve_iteration_tree(op)) == 4 + assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) assert len(sections) == 2 @@ -443,7 +443,7 @@ def test_streaming_basic(self, opt, ntmps): assert np.all(u.data[1] == 36) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 11), + (('buffering', 'streaming', 'orchestrate'), 13), (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7), ]) def test_streaming_two_buffers(self, opt, ntmps): @@ -763,12 +763,12 @@ def test_composite_buffering_tasking_multi_output(self): # Check generated code -- thanks to buffering only expect 1 lock! 
assert len(retrieve_iteration_tree(op0)) == 2 - assert len(retrieve_iteration_tree(op1)) == 6 - assert len(retrieve_iteration_tree(op2)) == 4 + assert len(retrieve_iteration_tree(op1)) == 4 + assert len(retrieve_iteration_tree(op2)) == 3 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 4 + assert len([i for i in symbols if isinstance(i, Lock)]) == 5 threads = [i for i in symbols if isinstance(i, PThreadArray)] - assert len(threads) == 3 + assert len(threads) == 2 assert threads[0].size.size == async_degree assert threads[1].size.size == async_degree symbols = FindSymbols().visit(op2) @@ -823,9 +823,9 @@ def test_composite_full_0(self): assert len(retrieve_iteration_tree(op0)) == 1 assert len(retrieve_iteration_tree(op1)) == 3 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 4 + assert len([i for i in symbols if isinstance(i, Lock)]) == 3 threads = [i for i in symbols if isinstance(i, PThreadArray)] - assert len(threads) == 3 + assert len(threads) == 2 assert all(i.size == 1 for i in threads) op0.apply(time_M=nt-1) @@ -858,7 +858,7 @@ def test_composite_full_1(self, opt): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(retrieve_iteration_tree(op1)) == 5 + assert len(retrieve_iteration_tree(op1)) == 3 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 2 op0.apply(time_M=nt-2) @@ -886,7 +886,7 @@ def test_tasking_over_compiler_generated(self): # Check generated code for op in [op1, op2]: - assert len(retrieve_iteration_tree(op)) == 4 + assert len(retrieve_iteration_tree(op)) == 3 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) assert len(sections) == 3 From 3773164c713af37986d3f0e8e2c68a38235fa99f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 22 May 2024 13:20:56 +0000 Subject: [PATCH 45/58] misc: pep8 happiness --- devito/passes/clusters/misc.py | 3 --- devito/passes/iet/asynchrony.py | 14 +++----------- devito/passes/iet/orchestration.py | 2 +- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index d9bc1851e7..7ce2bd3422 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -234,9 +234,6 @@ def _key(self, c): any(f._mem_shared for f in c.scope.reads) ]) - #weak.append(any(f.is_TimeFunction and f.save is not None - # for f in c.scope.writes)) - # Promoting adjacency of IndexDerivatives will maximize their reuse weak.append(any(e.find(IndexDerivative) for e in c.exprs)) diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index 47e464fcdd..457de5e79a 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -1,4 +1,4 @@ -from collections import OrderedDict, namedtuple +from collections import namedtuple from functools import singledispatch from ctypes import c_int @@ -7,8 +7,8 @@ from devito.ir import (AsyncCall, AsyncCallable, BlankLine, Call, Callable, Conditional, DummyEq, DummyExpr, While, Increment, Iteration, List, PointerCast, Return, FindNodes, FindSymbols, - ThreadCallable, EntryFunction, Transformer, Uxreplace, - make_callable, maybe_alias) + ThreadCallable, EntryFunction, Transformer, make_callable, + maybe_alias) from devito.passes.iet.definitions import DataManager from devito.passes.iet.engine import iet_pass from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, @@ -280,11 
+280,3 @@ def sanitize_ncfields(ncfields): sanitized.append(i) return sanitized - - -def retrieve_sdata(call): - #TODO: DROP? - for a in call.arguments: - if isinstance(a, SharedData): - return a - return None diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index dbc77aadd0..0409f15e00 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -12,7 +12,7 @@ from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import CondEq, CondNe -from devito.tools import as_mapper, as_tuple +from devito.tools import as_mapper from devito.types import HostLayer __init__ = ['Orchestrator'] From bffe9d3bdd50907bac669b8c105e3739eb2ebdaf Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 22 May 2024 14:14:14 +0000 Subject: [PATCH 46/58] compiler: Patch Uxreplace for ParallelTree --- devito/ir/iet/visitors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index f7335d7cd9..dfb52118db 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -1295,6 +1295,12 @@ def visit_PragmaTransfer(self, o): arguments = [uxreplace(i, self.mapper) for i in o.arguments] return o._rebuild(function=function, arguments=arguments) + def visit_ParallelTree(self, o): + prefix = self._visit(o.prefix) + body = self._visit(o.body) + nthreads = self.mapper.get(o.nthreads, o.nthreads) + return o._rebuild(prefix=prefix, body=body, nthreads=nthreads) + def visit_HaloSpot(self, o): hs = o.halo_scheme fmapper = {self.mapper.get(k, k): v for k, v in hs.fmapper.items()} From 1b465d617b9696fd7940495d6073dbea3cf60e9f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 20 May 2024 15:13:38 +0000 Subject: [PATCH 47/58] compiler: Revamp linearization --- devito/ir/iet/efunc.py | 9 +++- devito/ir/iet/utils.py | 3 +- devito/operator/operator.py | 2 +- devito/passes/iet/linearization.py | 84 +++++------------------------- devito/passes/iet/misc.py | 71 +++++++++++++++++++------ devito/symbolics/printer.py | 6 ++- devito/types/array.py | 1 + devito/types/misc.py | 78 +++++++++++++++++++-------- tests/test_linearize.py | 6 ++- tests/test_pickle.py | 10 ++-- tests/test_symbolics.py | 24 +++++++-- 11 files changed, 171 insertions(+), 123 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index 67f58c8d2e..8ac992bd79 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -128,7 +128,14 @@ def __init__(self, name, body, parameters=None, prefix='static'): class AsyncCall(Call): - pass + + @property + def functions(self): + return () + + @property + def expr_symbols(self): + return () class ThreadCallable(Callable): diff --git a/devito/ir/iet/utils.py b/devito/ir/iet/utils.py index 17f640fbc4..e2a0cde052 100644 --- a/devito/ir/iet/utils.py +++ b/devito/ir/iet/utils.py @@ -122,7 +122,8 @@ def derive_parameters(iet, drop_locals=False, ordering='default'): # Maybe filter out all other compiler-generated objects if drop_locals: - parameters = [p for p in parameters if not (p.is_ArrayBasic or p.is_LocalObject)] + parameters = [p for p in parameters + if not (p.is_ArrayBasic or p.is_LocalObject)] # NOTE: This is requested by the caller when the parameters are used to # construct Callables whose signature only depends on the object types, diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 698a4f3606..b0c9044f7d 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py 
@@ -472,7 +472,7 @@ def _lower_iet(cls, uiet, profiler=None, **kwargs): cls._Target.instrument(graph, profiler=profiler, **kwargs) # Extract the necessary macros from the symbolic objects - generate_macros(graph) + generate_macros(graph, **kwargs) # Target-independent optimizations minimize_symbols(graph) diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index b5645ed20a..71015cc22e 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -1,9 +1,6 @@ -from collections import defaultdict - from functools import singledispatch import numpy as np -from sympy import Add from devito.data import FULL from devito.ir import (BlankLine, Call, DummyExpr, Dereference, List, PointerCast, @@ -11,9 +8,8 @@ IMask) from devito.passes.iet.engine import iet_pass from devito.passes.iet.parpragma import PragmaIteration -from devito.symbolics import DefFunction, MacroArgument, ccode -from devito.tools import Bunch, filter_ordered, flatten, prod -from devito.types import Array, Bundle, Symbol, FIndexed, Indexed, Wildcard +from devito.tools import filter_ordered, flatten, prod +from devito.types import Array, Bundle, Symbol, FIndexed, Wildcard from devito.types.dense import DiscreteFunction __all__ = ['linearize'] @@ -21,9 +17,8 @@ def linearize(graph, **kwargs): """ - Turn n-dimensional Indexeds into 1-dimensional Indexed with suitable index - access function, such as `a[i, j] -> a[i*n + j]`. The row-major format - of the underlying Function objects is honored. + Turn Indexeds into FIndexeds so that the generated code uses linearized + access functions, e.g. `a[i*n + j]` instead of `a[i, j]`. """ sregistry = kwargs.get('sregistry') options = kwargs.get('options', {}) @@ -52,9 +47,6 @@ def linearize(graph, **kwargs): linearization(graph, key=key, tracker=tracker, **kwargs) - # Sanity check - assert not tracker.undef - @iet_pass def linearization(iet, key=None, tracker=None, **kwargs): @@ -66,10 +58,7 @@ def linearization(iet, key=None, tracker=None, **kwargs): iet = linearize_transfers(iet, **kwargs) iet = linearize_clauses(iet, **kwargs) - # Postprocess headers - headers = sorted((ccode(i), ccode(e)) for i, e in tracker.headers.values()) - - return iet, {'headers': headers} + return iet, {} def key1(f, d): @@ -135,8 +124,6 @@ def __init__(self, mode, dtype, sregistry): self.strides = {} self.strides_dynamic = {} # Strides varying regularly across iterations self.dists = {} - self.undef = defaultdict(set) - self.headers = {} def add(self, f): # Update unique sizes table @@ -167,19 +154,6 @@ def add_strides_dynamic(self, f, strides): assert len(strides) == f.ndim - 1 self.strides_dynamic[f] = tuple(strides) - def add_header(self, b, define, expr): - # NOTE: the input must be an IndexedBase and not a Function because - # we might require either the `.base` or the `.dmap`, or perhaps both - v = Bunch(define=define, expr=expr) - self.headers[b] = v - return v - - def needs_header(self, b): - return b.function.ndim > 1 and b not in self.headers - - def get_header(self, b): - return self.headers.get(b) - def get_size(self, f, d): return self.sizes[key1(f, d)] @@ -264,8 +238,8 @@ def linearize_accesses(iet, key0, tracker=None): symbols = flatten(i.free_symbols for i in subs.values()) candidates = {i for i in symbols if isinstance(i, (Stride, Dist))} for n in FindNodes(Call).visit(iet): - # Also consider the strides needed by the callee - candidates.update(tracker.undef.pop(n.name, [])) + # Also consider the strides needed by the callees + 
candidates.update(i for i in n.arguments if isinstance(i, Stride)) # 4) What `strides` can indeed be constructed? mapper = {} @@ -273,8 +247,6 @@ def linearize_accesses(iet, key0, tracker=None): if stride in candidates: if set(sizes).issubset(instances): mapper[stride] = sizes - else: - tracker.undef[iet.name].add(stride) # 5) Construct what needs to *and* can be constructed stmts, stmts1 = [], [] @@ -323,37 +295,6 @@ def _(f, d): return _generate_fsz.registry[Array](f, d) -@singledispatch -def _generate_header_basic(f, tracker): - return - - -@_generate_header_basic.register(DiscreteFunction) -@_generate_header_basic.register(Array) -@_generate_header_basic.register(Bundle) -def _(f, i, tracker): - b = i.base - - if not tracker.needs_header(b): - return tracker.get_header(b) - - # Generate e.g. `usave[(time)*xi_slc0 + (xi)*yi_slc0 + (yi)]` - - strides = tracker.map_strides(f) - - macroargnames = [d.name for d in f.dimensions] - macroargs = [MacroArgument(i) for i in macroargnames] - - items = [m*strides[d] for m, d in zip(macroargs, f.dimensions[1:])] - items.append(MacroArgument(f.dimensions[-1].name)) - - pname = tracker.sregistry.make_name(prefix='%sL' % f.name) - define = DefFunction(pname, macroargnames) - expr = Indexed(b, Add(*items, evaluate=False)) - - return tracker.add_header(b, define, expr) - - @singledispatch def _generate_linearization_basic(f, i, tracker): assert False @@ -363,14 +304,13 @@ def _generate_linearization_basic(f, i, tracker): @_generate_linearization_basic.register(Array) @_generate_linearization_basic.register(Bundle) def _(f, i, tracker): - header = _generate_header_basic(f, i, tracker) - n = len(i.indices) + strides_map = tracker.map_strides(f) - if header and n == i.function.ndim: - pname = header.define.name.value - strides = tuple(tracker.map_strides(f).values())[-n:] - return FIndexed.from_indexed(i, pname, strides=strides) + if n == 1: + return i + elif n == f.ndim and f.is_regular: + return FIndexed(i.base, *i.indices, strides_map=strides_map) else: # Honour custom indexing return i.base[sum(i.indices)] diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 44cec6df13..f6ba18adfc 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -10,8 +10,10 @@ filter_iterations, retrieve_iteration_tree, pull_dims) from devito.passes.iet.engine import iet_pass from devito.ir.iet.efunc import DeviceFunction, EntryFunction -from devito.symbolics import ValueLimit, evalrel, has_integer_args, limits_mapper -from devito.tools import as_mapper, filter_ordered, split +from devito.symbolics import (ValueLimit, evalrel, has_integer_args, limits_mapper, + ccode) +from devito.tools import Bunch, as_mapper, filter_ordered, split +from devito.types import FIndexed __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] @@ -135,11 +137,24 @@ def relax_incr_dimensions(iet, options=None, **kwargs): return iet, {} +def generate_macros(graph, **kwargs): + _generate_macros(graph, tracker={}, **kwargs) + + @iet_pass -def generate_macros(iet): +def _generate_macros(iet, tracker=None, **kwargs): + # Derive the Macros necessary for the FIndexeds + iet = _generate_macros_findexeds(iet, tracker=tracker, **kwargs) + + headers = [i.header for i in tracker.values()] + headers = sorted((ccode(define), ccode(expr)) for define, expr in headers) + # Generate Macros from higher-level SymPy objects - applications = FindApplications().visit(iet) - headers = set().union(*[_generate_macros(i) for i in 
applications]) + for i in FindApplications().visit(iet): + headers.extend(_generate_macros_math(i)) + + # Remove redundancies while preserving the order + headers = filter_ordered(headers) # Some special Symbols may represent Macros defined in standard libraries, # so we need to include the respective includes @@ -153,27 +168,53 @@ def generate_macros(iet): return iet, {'headers': headers, 'includes': includes} +def _generate_macros_findexeds(iet, sregistry=None, tracker=None, **kwargs): + indexeds = FindSymbols('indexeds').visit(iet) + indexeds = [i for i in indexeds if isinstance(i, FIndexed)] + if not indexeds: + return iet + + subs = {} + for i in indexeds: + try: + v = tracker[i.base].v + subs[i] = v.func(v.base, *i.indices) + continue + except KeyError: + pass + + pname = sregistry.make_name(prefix='%sL' % i.name) + header, v = i.bind(pname) + + subs[i] = v + tracker[i.base] = Bunch(header=header, v=v) + + iet = Uxreplace(subs).visit(iet) + + return iet + + @singledispatch -def _generate_macros(expr): - return set() +def _generate_macros_math(expr): + return () -@_generate_macros.register(Min) -@_generate_macros.register(sympy.Min) +@_generate_macros_math.register(Min) +@_generate_macros_math.register(sympy.Min) def _(expr): if has_integer_args(*expr.args) and len(expr.args) == 2: - return {('MIN(a,b)', ('(((a) < (b)) ? (a) : (b))'))} + return (('MIN(a,b)', ('(((a) < (b)) ? (a) : (b))')),) else: - return set() + return () -@_generate_macros.register(Max) -@_generate_macros.register(sympy.Max) +@_generate_macros_math.register(Max) +@_generate_macros_math.register(sympy.Max) def _(expr): if has_integer_args(*expr.args) and len(expr.args) == 2: - return {('MAX(a,b)', ('(((a) > (b)) ? (a) : (b))'))} + return (('MAX(a,b)', ('(((a) > (b)) ? (a) : (b))')),) else: - return set() + return () @iet_pass diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index b3fecd47ec..fb8dc4c59d 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -75,7 +75,11 @@ def _print_FIndexed(self, expr): U[t,x,y,z] -> U(t,x,y,z) """ inds = ', '.join(self._print(x) for x in expr.indices) - return '%s(%s)' % (self._print(expr.base.label), inds) + try: + label = expr.accessor.label + except AttributeError: + label = expr.base.label + return '%s(%s)' % (self._print(label), inds) def _print_Rational(self, expr): """Print a Rational as a C-like float/float division.""" diff --git a/devito/types/array.py b/devito/types/array.py index 32ce1393b2..73eccefdc5 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -474,6 +474,7 @@ def __getitem__(self, index): _C_field_data = ArrayMapped._C_field_data _C_field_nbytes = ArrayMapped._C_field_nbytes _C_field_dmap = ArrayMapped._C_field_dmap + _C_field_size = ArrayMapped._C_field_size @property def _C_ctype(self): diff --git a/devito/types/misc.py b/devito/types/misc.py index 761bc81e51..72f1ab895a 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -6,7 +6,7 @@ from devito.types import Array, CompositeObject, Indexed, Symbol, LocalObject from devito.types.basic import IndexedData -from devito.tools import Pickable, as_tuple +from devito.tools import Pickable, frozendict __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', 'Fence', 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump', @@ -60,48 +60,49 @@ class Wildcard(Symbol): class FIndexed(Indexed, Pickable): """ - A flatten Indexed with functional (primary) and indexed (secondary) representations. 
- - Examples - -------- - Consider the Indexed `u[x, y]`. The corresponding FIndexed's functional representation - is `u(x, y)`. This is a multidimensional representation, just like any other Indexed. - The corresponding indexed (secondary) represenation is instead flatten, that is - `uX[x*ny + y]`, where `X` is a string provided by the caller. + An FIndexed is a symbolic object used to represent a multidimensional + array in symbolic equations. It is a subclass of Indexed, and as such + it has a base (the symbol representing the array) and a number of indices. + + However, unlike Indexed, the representation of an FIndexed is functional, + e.g., `u(x, y)`, rather than explicit, e.g., `u[x, y]`. + + An FIndexed also carries the necessary information to generate a 1-dimensional + representation of the array, which is necessary when dealing with actual + memory accesses. For example, an FIndexed carries the strides of the array, + which ultimately allow to compute the actual memory address of an element. + For example, the FIndexed `u(x, y)` corresponds to the indexed representation + `u[x*ny + y]`, where `ny` is the stride of the array `u` along the y-axis. """ __rargs__ = ('base', '*indices') - __rkwargs__ = ('strides',) + __rkwargs__ = ('strides_map', 'accessor') - def __new__(cls, base, *args, strides=None): + def __new__(cls, base, *args, strides_map=None, accessor=None): obj = super().__new__(cls, base, *args) - obj.strides = as_tuple(strides) + obj.strides_map = frozendict(strides_map or {}) + obj.accessor = accessor return obj - @classmethod - def from_indexed(cls, indexed, pname, strides=None): - label = Symbol(name=pname, dtype=indexed.dtype) - base = IndexedData(label, None, function=indexed.function) - return FIndexed(base, *indexed.indices, strides=strides) - def __repr__(self): return "%s(%s)" % (self.name, ", ".join(str(i) for i in self.indices)) __str__ = __repr__ def _hashable_content(self): - return super()._hashable_content() + (self.strides,) + accessor = self.accessor or 0 # Avoids TypeError inside sympy.Basic.compare + return super()._hashable_content() + (self.strides, accessor) func = Pickable._rebuild @property def name(self): - return self.function.name + return self.base.name @property - def pname(self): - return self.base.name + def strides(self): + return tuple(self.strides_map.values()) @property def free_symbols(self): @@ -111,6 +112,39 @@ def free_symbols(self): return (super().free_symbols | set().union(*[i.free_symbols for i in self.strides])) + def bind(self, pname): + """ + Generate a 2-tuple: + + * A macro which expands to the 1-dimensional representation of the + FIndexed, e.g. `aL0(t,x,y) -> a[(t)*x_stride0 + (x)*y_stride0 + (y)]` + * A new FIndexed, with the same indices as `self`, but with a new + base symbol named after `pname`, e.g. `aL0(t, x+1, y-2)`, where + `aL0` is given by the `pname`. + """ + b = self.base + f = self.function + strides_map = self.strides_map + + # TODO: resolve circular import. 
This is a tough one though, as it + # requires a complete rethinking of `symbolics` vs `types` folders + from devito.symbolics import DefFunction, MacroArgument + + macroargnames = [d.name for d in f.dimensions] + macroargs = [MacroArgument(i) for i in macroargnames] + + items = [m*strides_map[d] for m, d in zip(macroargs, f.dimensions[1:])] + items.append(MacroArgument(f.dimensions[-1].name)) + + define = DefFunction(pname, macroargnames) + expr = Indexed(b, sympy.Add(*items, evaluate=False)) + + label = Symbol(name=pname, dtype=self.dtype) + accessor = IndexedData(label, None, function=f) + findexed = self.func(accessor=accessor) + + return ((define, expr), findexed) + func = Pickable._rebuild # Pickling support diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 088f42f36b..88f533687d 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -5,7 +5,7 @@ from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Operator, Eq, Inc, MatrixSparseTimeFunction, sin) from devito.ir import Call, Callable, DummyExpr, Expression, FindNodes, SymbolRegistry -from devito.passes import Graph, linearize +from devito.passes import Graph, linearize, generate_macros from devito.types import Array, Bundle, DefaultDimension @@ -512,8 +512,10 @@ def test_call_retval_indexed(): # Emulate what the compiler would do graph = Graph(foo) + sregistry = SymbolRegistry() linearize(graph, callback=True, options={'index-mode': 'int64'}, - sregistry=SymbolRegistry()) + sregistry=sregistry) + generate_macros(graph, sregistry=sregistry) foo = graph.root diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 9b7a639a32..e70fa17a0b 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -334,16 +334,20 @@ def test_shared_data(self, pickle): def test_findexed(self, pickle): grid = Grid(shape=(3, 3, 3)) + x, y, z = grid.dimensions + f = Function(name='f', grid=grid) - fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) + strides_map = {x: 1, y: 2, z: 3} + fi = FIndexed(f.base, x+1, y, z-2, strides_map=strides_map, accessor='fL') pkl_fi = pickle.dumps(fi) new_fi = pickle.loads(pkl_fi) assert new_fi.name == fi.name - assert new_fi.pname == fi.pname - assert new_fi.strides == fi.strides + assert new_fi.accessor == 'fL' + assert new_fi.indices == (x+1, y, z-2) + assert new_fi.strides_map == fi.strides_map def test_symbolics(self, pickle): a = Symbol('a') diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index 3f50faa1ff..9dd0c48584 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -365,14 +365,28 @@ class BarCast(Cast): def test_findexed(): grid = Grid(shape=(3, 3, 3)) + x, y, z = grid.dimensions + f = Function(name='f', grid=grid) - fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) - new_fi = fi.func(strides=(3, 4)) + strides_map = {x: 1, y: 2, z: 3} + fi = FIndexed(f.base, x+1, y, z-2, strides_map=strides_map) + assert ccode(fi) == 'f(x + 1, y, z - 2)' + + # Binding + _, fi1 = fi.bind('fL') + assert fi1.base is f.base + assert ccode(fi1) == 'fL(x + 1, y, z - 2)' + + # Reconstruction + strides_map = {x: 3, y: 2, z: 1} + new_fi = fi.func(strides_map=strides_map, accessor=fi1.accessor) assert new_fi.name == fi.name == 'f' + assert new_fi.accessor == fi1.accessor + assert new_fi.accessor.name == 'fL' assert new_fi.indices == fi.indices - assert new_fi.strides == (3, 4) + assert new_fi.strides_map == strides_map def test_symbolic_printing(): @@ -389,9 +403,9 @@ class MyLocalObject(LocalObject, Expr): grid = 
Grid((10,)) f = Function(name="f", grid=grid) - fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) + fi = FIndexed(f.base, grid.dimensions[0]) df = DefFunction('aaa', arguments=[fi]) - assert ccode(df) == 'aaa(foo(x))' + assert ccode(df) == 'aaa(f(x))' def test_is_on_grid(): From c08f3d082765c5cbb040d9d81151dbd1b881b50b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 27 May 2024 09:04:20 +0000 Subject: [PATCH 48/58] tools: Reinstate erroneously dropped filter_ordered implementation --- devito/tools/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/devito/tools/utils.py b/devito/tools/utils.py index f9dbebcb34..683b8c26e8 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -166,6 +166,8 @@ def filter_ordered(elements, key=None): key : callable, optional Conversion key used during equality comparison. """ + # This method exploits the fact that dictionary keys are unique and ordered + # (since Python 3.7). It's concise and often faster for larger lists if isinstance(elements, types.GeneratorType): elements = list(elements) From 809bcb80ef44f4c1462453fafb1e3ab6b30b3ddb Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 27 May 2024 14:11:44 +0000 Subject: [PATCH 49/58] compiler: Fixup orchestration --- devito/ir/iet/efunc.py | 9 +------- devito/passes/iet/engine.py | 2 +- devito/passes/iet/orchestration.py | 34 ++++++++++++++++++++++++------ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index 8ac992bd79..67f58c8d2e 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -128,14 +128,7 @@ def __init__(self, name, body, parameters=None, prefix='static'): class AsyncCall(Call): - - @property - def functions(self): - return () - - @property - def expr_symbols(self): - return () + pass class ThreadCallable(Callable): diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index f6b56a776b..0c26aa881c 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -597,7 +597,7 @@ def update_args(root, efuncs, dag): # linearization) symbols = FindSymbols('basics').visit(root.body) drop_params.extend(a for a in root.parameters - if a.is_Symbol and a not in symbols) + if (a.is_Symbol or a.is_LocalObject) and a not in symbols) # Must record the index, not the param itself, since a param may be # bound to whatever arg, possibly a generic SymPy expr diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 0409f15e00..952e1cbb6f 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -12,7 +12,7 @@ from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import CondEq, CondNe -from devito.tools import as_mapper +from devito.tools import DAG, as_mapper, as_tuple from devito.types import HostLayer __init__ = ['Orchestrator'] @@ -136,11 +136,21 @@ def process(self, iet): ]) key = lambda s: list(callbacks).index(s) + # The SyncSpots may be nested, so we compute a topological ordering + # so that they are processed in a bottom-up fashion. This is necessary + # because e.g. 
an inner SyncSpot may generate new objects (e.g., a new + # Queue), which in turn must be visible to the outer SyncSpot to + # generate the correct parameters list efuncs = [] - subs = {} - for n in sync_spots: - mapper = as_mapper(n.sync_ops, lambda i: type(i)) + while True: + sync_spots = FindNodes(SyncSpot).visit(iet) + if not sync_spots: + break + n0 = ordered(sync_spots).pop(0) + mapper = as_mapper(n0.sync_ops, lambda i: type(i)) + + subs = {} for t in sorted(mapper, key=key): sync_ops = mapper[t] @@ -149,15 +159,25 @@ def process(self, iet): raise InvalidOperator("Unsupported streaming case") layer = layers.pop() - subs[n], v = callbacks[t](subs.get(n, n), sync_ops, layer) + n1, v = callbacks[t](subs.get(n0, n0), sync_ops, layer) + + subs[n0] = n1 efuncs.extend(v) - iet = Transformer(subs).visit(iet) - efuncs = [Transformer(subs).visit(i) for i in efuncs] + iet = Transformer(subs).visit(iet) return iet, {'efuncs': efuncs} +def ordered(sync_spots): + dag = DAG(nodes=sync_spots) + for n0 in sync_spots: + for n1 in as_tuple(FindNodes(SyncSpot).visit(n0.body)): + dag.add_edge(n1, n0) + + return dag.topological_sort() + + # Task handlers layer_host = HostLayer('host') From e01d67d67cb5f2e19ceafa887baa4a3dad6528e1 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 27 May 2024 14:49:55 +0000 Subject: [PATCH 50/58] compiler: Fix Uxreplace.PragmaTransfer --- devito/ir/iet/visitors.py | 14 +++++++++++++- devito/passes/iet/engine.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index dfb52118db..3f95f0a11d 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -1293,7 +1293,19 @@ def visit_Pragma(self, o): def visit_PragmaTransfer(self, o): function = uxreplace(o.function, self.mapper) arguments = [uxreplace(i, self.mapper) for i in o.arguments] - return o._rebuild(function=function, arguments=arguments) + if o.imask is None: + return o._rebuild(function=function, arguments=arguments) + # An `imask` may be None, a list of symbols/numbers, or a list of + # 2-tuples representing ranges + imask = [] + for v in o.imask: + try: + i, j = v + imask.append((uxreplace(i, self.mapper), + uxreplace(j, self.mapper))) + except TypeError: + imask.append(uxreplace(v, self.mapper)) + return o._rebuild(function=function, imask=imask, arguments=arguments) def visit_ParallelTree(self, o): prefix = self._visit(o.prefix) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 0c26aa881c..2e9aebc055 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -12,7 +12,7 @@ from devito.tools import DAG, as_tuple, filter_ordered, sorted_priority, timed_pass from devito.types import (Array, Bundle, CompositeObject, Lock, IncrDimension, Indirection, Pointer, SharedData, ThreadArray, Temp, - NPThreads, NThreadsBase) + NPThreads, NThreadsBase, Wildcard) from devito.types.args import ArgProvider from devito.types.dense import DiscreteFunction from devito.types.dimension import AbstractIncrDimension, BlockDimension @@ -531,6 +531,7 @@ def _(i, mapper, sregistry): @abstract_object.register(Temp) +@abstract_object.register(Wildcard) def _(i, mapper, sregistry): mapper[i] = i._rebuild(name=sregistry.make_name(prefix='r')) From 7a7bf29de74e0e827195a6d750e85680f50448ff Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 27 May 2024 14:56:33 +0000 Subject: [PATCH 51/58] compiler: Patch KernelLaunch.expr_symbols --- devito/ir/iet/efunc.py | 7 +++++++ 1 file changed, 7 insertions(+) 
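The `while`-loop in PATCH 49 relies on `ordered` to lower nested SyncSpots innermost-first: each inner SyncSpot gets an edge towards its enclosing one, and a topological sort of the resulting DAG yields the processing order. Below is a minimal standalone sketch of the same idea using the stdlib rather than `devito.tools.DAG`; `Region` is an illustrative stand-in for a SyncSpot, not Devito API.

from graphlib import TopologicalSorter


class Region:
    def __init__(self, name, children=()):
        self.name = name
        self.children = tuple(children)  # directly nested Regions


def ordered(regions):
    ts = TopologicalSorter()
    for r in regions:
        ts.add(r)  # register the node even if it has no children
        for child in r.children:
            ts.add(r, child)  # `child` must be processed before `r`
    return list(ts.static_order())


inner = Region('inner')
outer = Region('outer', children=[inner])
assert [r.name for r in ordered([outer, inner])] == ['inner', 'outer']

Processing the innermost region first ensures that any objects it generates (e.g., a new Queue) are already visible when the parameters of the enclosing region's callbacks are derived.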
diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index 67f58c8d2e..bbc3fb1627 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -215,6 +215,13 @@ def functions(self): launch_args += (self.stream.function,) return super().functions + launch_args + @cached_property + def expr_symbols(self): + launch_symbols = (self.grid, self.block,) + if self.stream is not None: + launch_symbols += (self.stream,) + return super().expr_symbols + launch_symbols + # Other relevant Callable subclasses From 77cd2ed8d31f646d545dd64171cf527deca69775 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 27 May 2024 15:46:41 +0000 Subject: [PATCH 52/58] example: Update expected output --- examples/performance/01_gpu.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb index 27ce83892d..b8119552e2 100644 --- a/examples/performance/01_gpu.ipynb +++ b/examples/performance/01_gpu.ipynb @@ -259,9 +259,9 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", From 84faf2778eeefa24db79fa6797728a0463e63815 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 5 Jun 2024 14:41:43 +0000 Subject: [PATCH 53/58] tests: Drop --no-summary from custom markers --- conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/conftest.py b/conftest.py index f14d595f78..4bb0629327 100644 --- a/conftest.py +++ b/conftest.py @@ -244,6 +244,7 @@ def pytest_runtest_call(item): elif item.get_closest_marker("parallel"): # Spawn parallel processes to run test + outcome = parallel(item, item.funcargs['mode']) if outcome: pytest.skip(f"{item} success in parallel") From ad4b4772c166544a3e3c620037f3fffc1f3cd344 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 5 Jun 2024 14:42:02 +0000 Subject: [PATCH 54/58] compiler: Fixup rcompile post-rebasing --- devito/core/cpu.py | 8 ++++---- devito/core/gpu.py | 10 +++++++--- devito/mpi/routines.py | 2 +- devito/operator/operator.py | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/devito/core/cpu.py b/devito/core/cpu.py index a5c852154b..5b0c8f6a22 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -97,15 +97,15 @@ def _normalize_kwargs(cls, **kwargs): def _rcompile_wrapper(cls, **kwargs0): options0 = kwargs0.pop('options') - def wrapper(expressions, **kwargs1): - options = {**options0, **kwargs1.pop('options', {})} - kwargs = {'options': options, **kwargs0, **kwargs1} + def wrapper(expressions, options=None, **kwargs1): + options = {**options0, **(options or {})} + kwargs = {**kwargs0, **kwargs1} # User-provided openmp flag has precedence over defaults if not options['openmp']: kwargs['language'] = 'C' - return rcompile(expressions, kwargs) + return rcompile(expressions, kwargs, options) return wrapper diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 414dbf268b..1193d4de77 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -116,11 +116,15 @@ def _normalize_gpu_fit(cls, oo, **kwargs): return as_tuple(cls.GPU_FIT) @classmethod - def 
_rcompile_wrapper(cls, **kwargs): - def wrapper(expressions, mode='default', **options): + def _rcompile_wrapper(cls, **kwargs0): + options0 = kwargs0.pop('options') + + def wrapper(expressions, mode='default', options=None, **kwargs1): + options = {**options0, **(options or {})} + kwargs = {**kwargs0, **kwargs1} if mode == 'host': - par_disabled = kwargs['options']['par-disabled'] + par_disabled = options['par-disabled'] target = { 'platform': 'cpu64', 'language': 'C' if par_disabled else 'openmp', diff --git a/devito/mpi/routines.py b/devito/mpi/routines.py index 83dff98874..dc89e85164 100644 --- a/devito/mpi/routines.py +++ b/devito/mpi/routines.py @@ -371,7 +371,7 @@ def _make_copy(self, f, hse, key, swap=False): eqns.append(Eq(*swap(buf[[i] + bdims], f[[i] + findices]))) # Compile `eqns` into an IET via recursive compilation - irs, _ = self.rcompile(eqns, mpi=False) + irs, _ = self.rcompile(eqns, options={'mpi': False}) parameters = [buf] + bshape + list(f.handles) + ofs diff --git a/devito/operator/operator.py b/devito/operator/operator.py index b0c9044f7d..373ce9e0e1 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -1082,7 +1082,7 @@ def rcompile(expressions, kwargs, options, target=None): """ Perform recursive compilation on an ordered sequence of symbolic expressions. """ - options = {**kwargs['options'], **rcompile_registry, **options} + options = {**options, **rcompile_registry} if target is None: cls = operator_selector(**kwargs) From 17e85121128158a7e372ed8e3a30eb415f5c1650 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 5 Jun 2024 15:16:39 +0000 Subject: [PATCH 55/58] compiler: Polishing --- devito/ir/iet/efunc.py | 2 +- devito/ir/iet/visitors.py | 1 + devito/ir/stree/algorithms.py | 2 +- devito/ir/support/space.py | 9 +++++---- devito/ir/support/syncs.py | 6 +----- devito/operator/operator.py | 5 +++-- devito/tools/utils.py | 9 ++++++++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index bbc3fb1627..e7192ecf9b 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -217,7 +217,7 @@ def functions(self): @cached_property def expr_symbols(self): - launch_symbols = (self.grid, self.block,) + launch_symbols = (self.grid, self.block) if self.stream is not None: launch_symbols += (self.stream,) return super().expr_symbols + launch_symbols diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 3f95f0a11d..e0bf98017f 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -1295,6 +1295,7 @@ def visit_PragmaTransfer(self, o): arguments = [uxreplace(i, self.mapper) for i in o.arguments] if o.imask is None: return o._rebuild(function=function, arguments=arguments) + # An `imask` may be None, a list of symbols/numbers, or a list of # 2-tuples representing ranges imask = [] diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 94a7f4e592..a10be2b3ce 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -173,7 +173,7 @@ def preprocess(clusters, options=None, **kwargs): elif c.is_critical_region: if c.is_wait: - processed.append(c.rebuild(exprs=[], syncs=c.syncs)) + processed.append(c.rebuild(exprs=[])) elif c.is_wild: continue diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index d28f6b33ee..6a11acdc28 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -7,8 +7,9 @@ from devito.ir.support.utils import minimum, maximum from 
devito.ir.support.vector import Vector, vmin, vmax -from devito.tools import (Ordering, Stamp, as_list, as_tuple, filter_ordered, - flatten, frozendict, is_integer, toposort) +from devito.tools import (Ordering, Stamp, as_list, as_set, as_tuple, + filter_ordered, flatten, frozendict, is_integer, + toposort) from devito.types import Dimension, ModuloDimension __all__ = ['NullInterval', 'Interval', 'IntervalGroup', 'IterationSpace', @@ -389,7 +390,7 @@ def generate(cls, op, *interval_groups, relations=None): interval = getattr(interval, op)(i) intervals.append(interval) - relations = set(as_tuple(relations)) + relations = as_set(relations) relations.update(set().union(*[ig.relations for ig in interval_groups])) modes = set(ig.mode for ig in interval_groups) @@ -470,7 +471,7 @@ def drop(self, maybe_callable, strict=False): if callable(maybe_callable): dims = {i.dim for i in self if maybe_callable(i.dim)} else: - dims = set(as_tuple(maybe_callable)) + dims = as_set(maybe_callable) # Dropping if strict: diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index 80dbeae8cd..0f7cd687cc 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -12,11 +12,7 @@ 'PrefetchUpdate', 'SnapOut', 'SnapIn', 'Ops', 'normalize_syncs'] -class Operation(Pickable): - pass - - -class SyncOp(Operation): +class SyncOp(Pickable): __rargs__ = ('handle', 'target') __rkwargs__ = ('tindex', 'function', 'findex', 'dim', 'size', 'origin') diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 373ce9e0e1..f8749969cd 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -179,8 +179,9 @@ def _check_kwargs(cls, **kwargs): def _sanitize_exprs(cls, expressions, **kwargs): expressions = as_tuple(expressions) - if any(not isinstance(i, Evaluable) for i in expressions): - raise InvalidOperator("Only `devito.Evaluable` are allowed.") + for i in expressions: + if not isinstance(i, Evaluable): + raise InvalidOperator("`%s` is not an `Evaluable` object" % str(i)) return expressions diff --git a/devito/tools/utils.py b/devito/tools/utils.py index 683b8c26e8..b99eddcf6c 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -12,7 +12,7 @@ 'roundm', 'powerset', 'invert', 'flatten', 'single_or', 'filter_ordered', 'as_mapper', 'filter_sorted', 'pprint', 'sweep', 'all_equal', 'as_list', 'indices_to_slices', 'indices_to_sections', 'transitive_closure', - 'humanbytes', 'contains_val', 'sorted_priority'] + 'humanbytes', 'contains_val', 'sorted_priority', 'as_set'] def prod(iterable, initial=1): @@ -26,6 +26,13 @@ def as_list(item, type=None, length=None): return list(as_tuple(item, type=type, length=length)) +def as_set(iterable, type=None, length=None): + """ + Force item to a set. + """ + return set(as_tuple(iterable, type=type, length=length)) + + def as_tuple(item, type=None, length=None): """ Force item to a tuple. Passes tuple subclasses through also. 
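PATCH 55 adds `as_set` next to the pre-existing `as_tuple`/`as_list` coercion helpers, replacing the recurring `set(as_tuple(...))` idiom at call sites such as the `generate` and `drop` methods in space.py. A standalone sketch of the intended semantics follows; the simplified `as_tuple` below is an assumption for self-containedness, not Devito's full implementation, which also supports `type` and `length` checks.

def as_tuple(item):
    # Simplified stand-in for devito.tools.as_tuple
    if item is None:
        return ()
    if isinstance(item, (tuple, list, set, frozenset)):
        return tuple(item)
    return (item,)


def as_set(iterable):
    # Force `iterable` to a set, via the same normalization as `as_tuple`
    return set(as_tuple(iterable))


# The input may be None, a single item, or an iterable -- all three
# normalize uniformly, and strings are treated atomically
assert as_set(None) == set()
assert as_set('xy') == {'xy'}
assert as_set(['x', 'y', 'x']) == {'x', 'y'}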
From 60a2ee7b09e55d326dd747ded20e6bb24348a013 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 6 Jun 2024 09:29:08 +0000 Subject: [PATCH 56/58] compiler: Add stream_dimensions() --- devito/core/cpu.py | 3 ++- devito/core/gpu.py | 4 ++-- devito/passes/__init__.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 5b0c8f6a22..dbf50c3e3b 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -3,6 +3,7 @@ from devito.core.operator import CoreOperator, CustomOperator, ParTile from devito.exceptions import InvalidOperator from devito.operator.operator import rcompile +from devito.passes import stream_dimensions from devito.passes.equations import collect_derivatives from devito.passes.clusters import (Lift, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_pows, @@ -259,7 +260,7 @@ def _make_clusters_passes_mapper(cls, **kwargs): # on device backends def callback(f, *args): if f.is_TimeFunction and f.save is not None: - return f.space_dimensions + return stream_dimensions(f) else: return None diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 1193d4de77..c0eae9732f 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -5,7 +5,7 @@ from devito.core.operator import CoreOperator, CustomOperator, ParTile from devito.exceptions import InvalidOperator from devito.operator.operator import rcompile -from devito.passes import is_on_device +from devito.passes import is_on_device, stream_dimensions from devito.passes.equations import collect_derivatives from devito.passes.clusters import (Lift, tasking, memcpy_prefetch, blocking, buffering, cire, cse, factorize, fission, fuse, @@ -431,7 +431,7 @@ def stream_key(items, *args): `(d_i, ..., d_n)` requiring data streaming. """ found = [f for f in as_tuple(items) if callback(f)] - retval = {f.space_dimensions for f in found} + retval = {stream_dimensions(f) for f in found} if len(retval) > 1: raise ValueError("Cannot determine homogenous stream Dimensions") elif len(retval) == 1: diff --git a/devito/passes/__init__.py b/devito/passes/__init__.py index 66f3edbc98..8db9072812 100644 --- a/devito/passes/__init__.py +++ b/devito/passes/__init__.py @@ -36,6 +36,16 @@ def is_on_device(obj, gpu_fit): return all(f in gpu_fit for f in fsave) +def stream_dimensions(f): + """ + Return the Function's dimensions that are candidates for data streaming. 
+ """ + if f.is_TimeFunction: + return tuple(d for d in f.dimensions if not d.is_Time) + else: + return () + + def needs_transfer(f, gpu_fit): """ True if the given object triggers a transfer from/to device memory, From 1905fb61542753dfb829e01eb611c15258f717be Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 6 Jun 2024 09:39:54 +0000 Subject: [PATCH 57/58] compiler: Refactor _generate_macros --- devito/passes/iet/misc.py | 27 +++++++++++++++++---------- devito/types/dimension.py | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index f6ba18adfc..f0b2b7f4f5 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -150,8 +150,7 @@ def _generate_macros(iet, tracker=None, **kwargs): headers = sorted((ccode(define), ccode(expr)) for define, expr in headers) # Generate Macros from higher-level SymPy objects - for i in FindApplications().visit(iet): - headers.extend(_generate_macros_math(i)) + headers.extend(_generate_macros_math(iet)) # Remove redundancies while preserving the order headers = filter_ordered(headers) @@ -170,12 +169,12 @@ def _generate_macros(iet, tracker=None, **kwargs): def _generate_macros_findexeds(iet, sregistry=None, tracker=None, **kwargs): indexeds = FindSymbols('indexeds').visit(iet) - indexeds = [i for i in indexeds if isinstance(i, FIndexed)] - if not indexeds: + findexeds = [i for i in indexeds if isinstance(i, FIndexed)] + if not findexeds: return iet subs = {} - for i in indexeds: + for i in findexeds: try: v = tracker[i.base].v subs[i] = v.func(v.base, *i.indices) @@ -194,13 +193,21 @@ def _generate_macros_findexeds(iet, sregistry=None, tracker=None, **kwargs): return iet +def _generate_macros_math(iet): + headers = [] + for i in FindApplications().visit(iet): + headers.extend(_lower_macro_math(i)) + + return headers + + @singledispatch -def _generate_macros_math(expr): +def _lower_macro_math(expr): return () -@_generate_macros_math.register(Min) -@_generate_macros_math.register(sympy.Min) +@_lower_macro_math.register(Min) +@_lower_macro_math.register(sympy.Min) def _(expr): if has_integer_args(*expr.args) and len(expr.args) == 2: return (('MIN(a,b)', ('(((a) < (b)) ? (a) : (b))')),) @@ -208,8 +215,8 @@ def _(expr): return () -@_generate_macros_math.register(Max) -@_generate_macros_math.register(sympy.Max) +@_lower_macro_math.register(Max) +@_lower_macro_math.register(sympy.Max) def _(expr): if has_integer_args(*expr.args) and len(expr.args) == 2: return (('MAX(a,b)', ('(((a) > (b)) ? 
(a) : (b))')),) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 0e4310bac3..b3d51284fe 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1546,7 +1546,7 @@ class VirtualDimension(CustomDimension): def __init_finalize__(self, name, parent=None): super().__init_finalize__(name, parent=parent, symbolic_min=sympy.S.Zero, - symbolic_max=sympy.S.One) + symbolic_max=sympy.S.Zero) # *** From 0decbb23da904f48ff6c7ee124ed2b0cefc547af Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 18 Jun 2024 09:49:28 +0000 Subject: [PATCH 58/58] misc: Improve performance summary --- devito/operator/profiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py index e4befb7484..b7fc3329c2 100644 --- a/devito/operator/profiling.py +++ b/devito/operator/profiling.py @@ -420,7 +420,7 @@ def add(self, name, rank, time, k = PerfKey(name, rank) - if not ops or any(np.isnan(i) for i in [ops, points, traffic]): + if not ops or any(not np.isfinite(i) for i in [ops, points, traffic]): self[k] = PerfEntry(time, 0.0, 0.0, 0.0, 0, []) else: gflops = float(ops)/10**9
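The final patch hardens the summary guard: `np.isnan` misses infinities, which can arise when a rate is computed against a zero denominator, whereas `np.isfinite` rejects NaN and +/-inf alike. A quick self-contained check of the difference; the metric values are hypothetical.

import numpy as np

ops, points, traffic = 1.0e9, np.inf, 0.0  # hypothetical metrics

# Old guard: the infinity slips through, producing a bogus PerfEntry
assert not any(np.isnan(i) for i in [ops, points, traffic])

# New guard: the entry is zeroed out instead, as intended
assert any(not np.isfinite(i) for i in [ops, points, traffic])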