From cf0f89300cbe6ca4972efe5b5086aa6a81bf453d Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 30 Sep 2024 17:18:45 +0200 Subject: [PATCH] SCCAnnotate: don't privatize arrays with existing data declarations --- .../transformations/single_column/annotate.py | 56 +++++++++++++++++-- .../single_column/tests/test_scc.py | 29 +++++++--- loki/transformations/single_column/vector.py | 4 +- .../tests/test_loop_blocking.py | 16 +++--- loki/transformations/utilities.py | 8 +-- 5 files changed, 85 insertions(+), 28 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index d3283a963..acaf68e5d 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -5,13 +5,14 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. +from collections import defaultdict from loki.batch import Transformation from loki.expression import ( symbols as sym, FindVariables, is_dimension_constant ) from loki.ir import ( nodes as ir, FindNodes, pragmas_attached, is_loki_pragma, - get_pragma_parameters, Transformer + get_pragma_parameters, Transformer, pragma_regions_attached ) from loki.logging import info from loki.tools import as_tuple, flatten @@ -194,11 +195,54 @@ def transform_subroutine(self, routine, **kwargs): # Mark all non-parallel loops as `!$acc loop seq` self.annotate_sequential_loops(routine) + # Find variables with existing OpenACC data declarations + acc_vars = self.find_acc_vars(routine, targets) + with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): - driver_loops = find_driver_loops(routine=routine, targets=targets) + driver_loops = find_driver_loops(section=routine.body, targets=targets) for loop in driver_loops: - self.annotate_driver_loop(loop) + self.annotate_driver_loop(loop, acc_vars.get(loop, [])) + + def find_acc_vars(self, routine, targets): + """ + Find variables
already specified in acc data clauses. + + Parameters + ---------- + routine : :any:`Subroutine` + Subroutine to apply this transformation to. + targets : list or string + List of subroutines that are to be considered as part of + the transformation call tree. + """ + + acc_vars = defaultdict(list) + + with pragma_regions_attached(routine): + with pragmas_attached(routine, ir.Loop): + for region in FindNodes(ir.PragmaRegion).visit(routine.body): + if region.pragma.keyword.lower() == 'acc' and 'data' in region.pragma.content.lower(): + + driver_loops = find_driver_loops(section=region.body, targets=targets) + if not driver_loops: + continue + + parameters = get_pragma_parameters(region.pragma, starts_with='data', only_loki_pragmas=False) + if not 'none' in [p.strip().lower() for p in parameters.get('default', '').split(',')]: + for loop in driver_loops: + _vars = [var.name.lower() for var in FindVariables(unique=True).visit(loop)] + acc_vars[loop] += _vars + else: + _vars = [p.strip().lower() for p in parameters.get('present', '').split(',')] + _vars += [p.strip().lower() for p in parameters.get('copy', '').split(',')] + _vars += [p.strip().lower() for p in parameters.get('copyin', '').split(',')] + _vars += [p.strip().lower() for p in parameters.get('copyout', '').split(',')] + _vars += [p.strip().lower() for p in parameters.get('deviceptr', '').split(',')] + + for loop in driver_loops: + acc_vars[loop] += _vars + return acc_vars @classmethod def device_alloc_column_locals(cls, routine, column_locals): @@ -221,7 +265,7 @@ def device_alloc_column_locals(cls, routine, column_locals): routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) - def annotate_driver_loop(self, loop): + def annotate_driver_loop(self, loop, acc_vars): """ Annotate driver block loop with ``'openacc'`` pragmas. 
@@ -229,6 +273,8 @@ def annotate_driver_loop(self, loop): ---------- loop : :any:`Loop` Driver :any:`Loop` to wrap in ``'opencc'`` pragmas. + acc_vars : list + Variables already declared in ``'openacc'`` data directives. """ # Mark driver loop as "gang parallel". @@ -241,7 +287,7 @@ def annotate_driver_loop(self, loop): # Filter out arrays that are explicitly allocated with block dimension sizes = self.block_dim.size_expressions arrays = [v for v in arrays if not any(d in sizes for d in as_tuple(v.shape))] - private_arrays = ', '.join(set(v.name for v in arrays)) + private_arrays = ', '.join(set(v.name for v in arrays if not v.name_parts[0].lower() in acc_vars)) private_clause = '' if not private_arrays else f' private({private_arrays})' for pragma in as_tuple(loop.pragma): diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index 71549ceac..3e92533b0 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -9,7 +9,7 @@ from loki import Subroutine, Sourcefile, Dimension, fgen from loki.batch import ProcedureItem -from loki.expression import Scalar, Array, IntLiteral, RangeIndex +from loki.expression import Scalar, Array, IntLiteral from loki.frontend import available_frontends, OMNI, OFP from loki.ir import ( FindNodes, Assignment, CallStatement, Conditional, Loop, @@ -236,32 +236,40 @@ def test_scc_demote_transformation(frontend, horizontal): @pytest.mark.parametrize('frontend', available_frontends()) -def test_scc_annotate_openacc(frontend, horizontal, blocking): +@pytest.mark.parametrize('acc_data', ['default', 'copyin', None]) +def test_scc_annotate_openacc(frontend, horizontal, blocking, acc_data): """ Test the correct addition of OpenACC pragmas to SCC format code (no hoisting). """ - fcode_driver = """ + fcode_driver = f""" SUBROUTINE column_driver(nlon, nproma, nlev, nz, q, nb) INTEGER, INTENT(IN) :: nlon, nz, nb ! 
Size of the horizontal and vertical INTEGER, INTENT(IN) :: nproma, nlev ! Aliases of horizontal and vertical sizes REAL, INTENT(INOUT) :: q(nlon,nz,nb) + REAL :: other_var(nlon) INTEGER :: b, start, end start = 1 end = nlon + {'!$acc data default(present)' if acc_data == 'default' + else '!$acc data copyin(other_var)' if acc_data == 'copyin' else ''} + ! do b=1, nb - call compute_column(start, end, nlon, nproma, nz, q(:,:,b)) + call compute_column(start, end, nlon, nproma, nz, q(:,:,b), other_var) end do + ! + {'!$acc end data' if acc_data else ''} END SUBROUTINE column_driver """ fcode_kernel = """ - SUBROUTINE compute_column(start, end, nlon, nproma, nlev, nz, q) + SUBROUTINE compute_column(start, end, nlon, nproma, nlev, nz, q, other_var) INTEGER, INTENT(IN) :: start, end ! Iteration indices INTEGER, INTENT(IN) :: nlon, nz ! Size of the horizontal and vertical INTEGER, INTENT(IN) :: nproma, nlev ! Aliases of horizontal and vertical sizes REAL, INTENT(INOUT) :: q(nlon,nz) + REAL, INTENT(IN) :: other_var REAL :: t(nlon,nz) REAL :: a(nlon) REAL :: d(nproma) @@ -326,8 +334,11 @@ def test_scc_annotate_openacc(frontend, horizontal, blocking): with pragmas_attached(driver, Loop): driver_loops = FindNodes(Loop).visit(driver.body) assert len(driver_loops) == 1 - assert driver_loops[0].pragma[0].keyword == 'acc' - assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)' + assert driver_loops[0].pragma[0].keyword.lower() == 'acc' + if acc_data: + assert driver_loops[0].pragma[0].content == 'parallel loop gang vector_length(nlon)' + else: + assert driver_loops[0].pragma[0].content == 'parallel loop gang private(other_var) vector_length(nlon)' @pytest.mark.parametrize('frontend', available_frontends()) @@ -750,7 +761,7 @@ def test_scc_multiple_acc_pragmas(frontend, horizontal, blocking): @pytest.mark.parametrize('frontend', available_frontends()) -def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): +def 
test_scc_annotate_routine_seq_pragma(frontend, blocking): """ Test that `!$loki routine seq` pragmas are replaced correctly by `!$acc routine seq` pragmas. @@ -790,7 +801,7 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): @pytest.mark.parametrize('frontend', available_frontends()) -def test_scc_annotate_empty_data_clause(frontend, horizontal, blocking): +def test_scc_annotate_empty_data_clause(frontend, blocking): """ Test that we do not generate empty `!$acc data` clauses. """ diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 7d200d960..83a35ea28 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -231,7 +231,7 @@ def process_driver(self, routine, targets=()): """ with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): - driver_loops = find_driver_loops(routine=routine, targets=targets) + driver_loops = find_driver_loops(section=routine.body, targets=targets) # remove vector loops driver_loop_map = {} @@ -434,7 +434,7 @@ def transform_subroutine(self, routine, **kwargs): if role == 'driver': with pragmas_attached(routine, ir.Loop): - driver_loops = find_driver_loops(routine=routine, targets=targets) + driver_loops = find_driver_loops(section=routine.body, targets=targets) for loop in driver_loops: # Revector all marked sections within the driver loop body diff --git a/loki/transformations/tests/test_loop_blocking.py b/loki/transformations/tests/test_loop_blocking.py index 8aebac743..0ea3133a6 100644 --- a/loki/transformations/tests/test_loop_blocking.py +++ b/loki/transformations/tests/test_loop_blocking.py @@ -44,7 +44,7 @@ def test_1d_splitting(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop 
= split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) @@ -94,7 +94,7 @@ def test_1d_splitting_multi_var(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) @@ -142,7 +142,7 @@ def test_2d_splitting(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) @@ -192,7 +192,7 @@ def test_3d_splitting(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) @@ -250,7 +250,7 @@ def test_1d_blocking(tmp_path, frontend, block_size, n): routine = Subroutine.from_source(fcode, frontend=frontend) loops = FindNodes(ir.Loop).visit(routine.ir) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) num_loops = len(loops) @@ -309,7 +309,7 @@ def test_1d_blocking_multi_intent(tmp_path, frontend, block_size, n): routine = Subroutine.from_source(fcode, frontend=frontend) loops = FindNodes(ir.Loop).visit(routine.ir) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) num_loops = len(loops) @@ -372,7 +372,7 @@ def 
test_2d_blocking(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) @@ -432,7 +432,7 @@ def test_3d_blocking(tmp_path, frontend, block_size, n): num_loops = len(loops) num_vars = len(routine.variable_map) with pragmas_attached(routine, Loop): - loops = find_driver_loops(routine, + loops = find_driver_loops(routine.body, targets=None) splitting_vars, inner_loop, outer_loop = split_loop(routine, loops[0], block_size) loops = FindNodes(ir.Loop).visit(routine.ir) diff --git a/loki/transformations/utilities.py b/loki/transformations/utilities.py index 76ab9eafd..15d0aea2a 100644 --- a/loki/transformations/utilities.py +++ b/loki/transformations/utilities.py @@ -594,16 +594,16 @@ def is_driver_loop(loop, targets): return False -def find_driver_loops(routine, targets): +def find_driver_loops(section, targets): """ - Find and return all driver loops of a given `routine`. + Find and return all driver loops in a given `section`. A *driver loop* is specified either by a call to a routine within `targets` or by the pragma `!$loki driver-loop`. Parameters ---------- - routine : :any:`Subroutine` + section : :any:`Section` or tuple The subroutine in which to find the driver loops. targets : list or string List of subroutines that are to be considered as part of @@ -612,7 +612,7 @@ def find_driver_loops(routine, targets): driver_loops = [] nested_driver_loops = [] - for loop in FindNodes(ir.Loop).visit(routine.body): + for loop in FindNodes(ir.Loop).visit(section): if loop in nested_driver_loops: continue