diff --git a/loki/bulk/scheduler.py b/loki/bulk/scheduler.py index 52819df62..02a07737d 100644 --- a/loki/bulk/scheduler.py +++ b/loki/bulk/scheduler.py @@ -55,6 +55,13 @@ def __init__(self, default, routines, disable=None, dimensions=None, dic2p=None, self.routines = CaseInsensitiveDict(routines) else: self.routines = CaseInsensitiveDict((r.name, r) for r in as_tuple(routines)) + for routine in self.routines: + if 'trafo' in self.routines[routine]: + if isinstance(self.routines[routine]['trafo'], list): + self.routines[routine]['trafo'] = self.routines[routine]['trafo'][0] + for key in self.routines[routine]['trafo']: + if isinstance(self.routines[routine]['trafo'][key], list): + self.routines[routine]['trafo'][key] = self.routines[routine]['trafo'][key][0] self.disable = as_tuple(disable) self.dimensions = dimensions self.enable_imports = enable_imports diff --git a/loki/expression/mappers.py b/loki/expression/mappers.py index 5dc305398..b78c3d85b 100644 --- a/loki/expression/mappers.py +++ b/loki/expression/mappers.py @@ -559,6 +559,7 @@ def map_variable_symbol(self, expr, *args, **kwargs): recurse_to_declaration_attributes = kwargs['recurse_to_declaration_attributes'] or expr.scope is None kwargs['recurse_to_declaration_attributes'] = False + new_type = expr.type if recurse_to_declaration_attributes: old_type = expr.type kind = self.rec(old_type.kind, *args, **kwargs) diff --git a/scripts/loki_transform.py b/scripts/loki_transform.py index 2696b10c7..55784be6b 100644 --- a/scripts/loki_transform.py +++ b/scripts/loki_transform.py @@ -376,7 +376,7 @@ def transpile(out_path, header, source, driver, cpp, include, define, frontend, @cli.command('plan') @click.option('--mode', '-m', default='sca', - type=click.Choice(['idem', 'sca', 'claw', 'scc', 'scc-hoist'])) + type=click.Choice(['idem', 'sca', 'claw', 'scc', 'scc-hoist', 'scc-stack'])) @click.option('--config', '-c', type=click.Path(), help='Path to configuration file.') @click.option('--header', '-I', 
type=click.Path(), multiple=True, @@ -410,6 +410,7 @@ def plan(mode, config, header, source, build, root, cpp, directive, frontend, ca paths += [Path(h).resolve().parent for h in header] scheduler = Scheduler(paths=paths, config=config, frontend=frontend, full_parse=False, preprocess=cpp) + mode = mode.replace("-", "_") # Construct the transformation plan as a set of CMake lists of source files scheduler.write_cmake_plan(filepath=plan_file, mode=mode, buildpath=build, rootpath=root) @@ -420,7 +421,7 @@ def plan(mode, config, header, source, build, root, cpp, directive, frontend, ca @cli.command('ecphys') @click.option('--mode', '-m', default='sca', - type=click.Choice(['idem', 'sca', 'claw', 'scc', 'scc-hoist'])) + type=click.Choice(['idem', 'sca', 'claw', 'scc', 'scc-hoist', 'scc-stack'])) @click.option('--config', '-c', type=click.Path(), help='Path to configuration file.') @click.option('--header', '-I', type=click.Path(), multiple=True, @@ -443,6 +444,7 @@ def ecphys(mode, config, header, source, build, cpp, directive, frontend): of interdependent subroutines. """ + # TODO: still problem with '-' within "scc-stack"?? 
info('[Loki] Bulk-processing physics using config: %s ', config) config = SchedulerConfig.from_file(config) @@ -490,7 +492,7 @@ def ecphys(mode, config, header, source, build, cpp, directive, frontend): # Define the target dimension to strip from kernel and caller transformation = ExtractSCATransformation(horizontal=horizontal) - if mode in ['scc', 'scc-hoist']: + if mode in ['scc', 'scc-hoist', 'scc-stack']: # Compose the main SCC transformation from core components based on config transformation = ( SCCBaseTransformation(horizontal=horizontal, directive=directive), @@ -527,6 +529,26 @@ def ecphys(mode, config, header, source, build, cpp, directive, frontend): else: raise RuntimeError('[Loki] Convert could not find specified Transformation!') + if mode in ['scc-stack']: + if frontend == Frontend.OMNI: + # To make the pool allocator size derivation work correctly, we need + # to normalize the 1:end-style index ranges that OMNI introduces + class NormalizeRangeIndexingTransformation(Transformation): + def transform_subroutine(self, routine, **kwargs): + normalize_range_indexing(routine) + + scheduler.process(transformation=NormalizeRangeIndexingTransformation()) + + horizontal = scheduler.config.dimensions['horizontal'] + vertical = scheduler.config.dimensions['vertical'] + block_dim = scheduler.config.dimensions['block_dim'] + transformation = TemporariesPoolAllocatorTransformation( + block_dim=block_dim, directive='openacc', check_bounds=False + ) + scheduler.process(transformation=transformation, reverse=True) + + mode = mode.replace("-", "_") + # Apply the dependency-injection transformation dependency = DependencyTransformation( mode='module', module_suffix='_MOD', suffix=f'_{mode.upper()}' diff --git a/transformations/tests/test_pool_allocator.py b/transformations/tests/test_pool_allocator.py index b5c0340b5..066943780 100644 --- a/transformations/tests/test_pool_allocator.py +++ b/transformations/tests/test_pool_allocator.py @@ -35,7 +35,7 @@ def 
check_c_sizeof_import(routine): def check_stack_created_in_driver(driver, stack_size, first_kernel_call, num_block_loops, - generate_driver_stack=True, kind_real='jprb'): + generate_driver_stack=True, kind_real='jprb', simplify_stmt=True): # Are stack size, storage and stack derived type declared? assert 'istsz' in driver.variables assert 'zstack(:,:)' in driver.variables @@ -51,7 +51,11 @@ def check_stack_created_in_driver(driver, stack_size, first_kernel_call, num_blo assignments = FindNodes(Assignment).visit(driver.body) for assignment in assignments: if assignment.lhs == 'istsz': - assert str(simplify(assignment.rhs)).lower().replace(' ', '') == str(stack_size).lower().replace(' ', '') + if simplify_stmt: + assert str(simplify(assignment.rhs)).lower().replace(' ', '') \ + == str(stack_size).lower().replace(' ', '') + else: + assert str(assignment.rhs).lower().replace(' ', '') == str(stack_size).lower().replace(' ', '') break # Check for stack assignment inside loop @@ -64,10 +68,10 @@ def check_stack_created_in_driver(driver, stack_size, first_kernel_call, num_blo assert 'zstack(1, b)' in assignments[0].rhs.parameters if generate_driver_stack: assert assignments[1].lhs == 'ylstack%u' and ( - assignments[1].rhs == f'ylstack%l + istsz * c_sizeof(real(1, kind={kind_real}))') + assignments[1].rhs == f'ylstack%l + istsz * max(c_sizeof(real(1, kind={kind_real})), 8)') else: assert assignments[1].lhs == 'ylstack%u' and ( - assignments[1].rhs == f'ylstack%l + c_sizeof(real(1, kind={kind_real}))*istsz') + assignments[1].rhs == f'ylstack%l + max(c_sizeof(real(1, kind={kind_real})), 8)*istsz') # Check that stack assignment happens before kernel call assert all(loops[0].body.index(a) < loops[0].body.index(first_kernel_call) for a in assignments) @@ -98,7 +102,7 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim, REAL(KIND=JPRB), ALLOCATABLE :: ZSTACK(:, :) type(stack) :: ylstack - {'istsz = 
3*c_sizeof(real(1,kind=jprb))*nlon/c_sizeof(real(1,kind=jprb))+c_sizeof(real(1,kind=jprb))*nlon*nz/c_sizeof(real(1,kind=jprb))' if nclv_param else 'istsz = 3*c_sizeof(real(1,kind=jprb))*nlon/c_sizeof(real(1,kind=jprb))+c_sizeof(real(1,kind=jprb))*nlon*nz/c_sizeof(real(1,kind=jprb))+2*c_sizeof(real(1,kind=jprb))/c_sizeof(real(1,kind=jprb))'} + {'istsz = 3*max(c_sizeof(real(1,kind=jprb)), 8)*nlon/max(c_sizeof(real(1,kind=jprb)), 8)+max(c_sizeof(real(1,kind=jprb)), 8)*nlon*nz/max(c_sizeof(real(1,kind=jprb)), 8)' if nclv_param else 'istsz = 3*max(c_sizeof(real(1,kind=jprb)), 8)*nlon/max(c_sizeof(real(1,kind=jprb)), 8)+max(c_sizeof(real(1,kind=jprb)), 8)*nlon*nz/max(c_sizeof(real(1,kind=jprb)), 8)+2*max(c_sizeof(real(1,kind=jprb)), 8)/max(c_sizeof(real(1,kind=jprb)), 8)'} ALLOCATE(ZSTACK(ISTSZ, nb)) """ else: @@ -107,13 +111,13 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim, REAL(KIND=JPRB), ALLOCATABLE :: ZSTACK(:, :) type(stack) :: ylstack - {'istsz = c_sizeof(real(1,kind=jprb))*nlon/c_sizeof(real(1,kind=jprb))+c_sizeof(real(1,kind=jprb))*nlon*nz/c_sizeof(real(1,kind=jprb))+c_sizeof(real(1,kind=jprb))*nclv*nlon/c_sizeof(real(1,kind=jprb))' if nclv_param else 'istsz = 3*c_sizeof(real(1,kind=jprb))*nlon/c_sizeof(real(1,kind=jprb))+c_sizeof(real(1,kind=jprb))*nlon*nz/c_sizeof(real(1,kind=jprb))+2*c_sizeof(real(1,kind=jprb))/c_sizeof(real(1,kind=jprb))'} + {'istsz = max(c_sizeof(real(1,kind=jprb)), 8)*nlon/max(c_sizeof(real(1,kind=jprb)), 8)+max(c_sizeof(real(1,kind=jprb)), 8)*nlon*nz/max(c_sizeof(real(1,kind=jprb)), 8)+max(c_sizeof(real(1,kind=jprb)), 8)*nclv*nlon/max(c_sizeof(real(1,kind=jprb)), 8)' if nclv_param else 'istsz = 3*max(c_sizeof(real(1,kind=jprb)), 8)*nlon/max(c_sizeof(real(1,kind=jprb)), 8)+max(c_sizeof(real(1,kind=jprb)), 8)*nlon*nz/max(c_sizeof(real(1,kind=jprb)), 8)+2*max(c_sizeof(real(1,kind=jprb)), 8)/max(c_sizeof(real(1,kind=jprb)), 8)'} ALLOCATE(ZSTACK(ISTSZ, nb)) """ fcode_stack_assign = """ ylstack%l = 
loc(zstack(1, b)) - ylstack%u = ylstack%l + c_sizeof(real(1, kind=jprb)) * istsz + ylstack%u = ylstack%l + max(c_sizeof(real(1, kind=jprb)), 8) * istsz """ fcode_stack_dealloc = "DEALLOCATE(ZSTACK)" @@ -231,37 +235,53 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim, if nclv_param: if frontend == OMNI: # pylint: disable-next=line-too-long - trafo_data_compare = f'3 * c_sizeof(real(1, kind={kind_real})) * klon + c_sizeof(real(1, kind={kind_real})) * klev * klon' + trafo_data_compare = f'3 * max(c_sizeof(real(1, kind={kind_real})), 8) * klon + ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8) * klev * klon' if generate_driver_stack: - stack_size = f'3 * c_sizeof(real(1, kind={kind_real})) * nlon / c_sizeof(real(1, kind=jprb))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nlon * nz / c_sizeof(real(1, kind=jprb))' + stack_size = f'3 * max(c_sizeof(real(1, kind={kind_real})), 8) * nlon / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nlon * nz / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' else: - stack_size = f'3 * c_sizeof(real(1, kind={kind_real})) * nlon / c_sizeof(real(1, kind={kind_real}))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nlon * nz / c_sizeof(real(1, kind={kind_real}))' + stack_size = f'3 * max(c_sizeof(real(1, kind={kind_real})), 8) * nlon / ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nlon * nz / ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)' else: # pylint: disable-next=line-too-long - trafo_data_compare = f'c_sizeof(real(1, kind={kind_real})) * klon + c_sizeof(real(1, kind={kind_real})) * klev * klon' - trafo_data_compare += f'+ c_sizeof(real(1, kind={kind_real})) * klon * nclv' + trafo_data_compare = f'max(c_sizeof(real(1, kind={kind_real})), 8) * klon + ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8) * klev * klon' + trafo_data_compare += 
f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * klon * nclv' - stack_size = f'c_sizeof(real(1, kind={kind_real})) * nlon / c_sizeof(real(1, kind=jprb))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nlon * nz / c_sizeof(real(1, kind=jprb))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nclv * nlon / c_sizeof(real(1, kind=jprb))' + stack_size = f'max(c_sizeof(real(1, kind={kind_real})), 8) * nlon / max(c_sizeof(real(1, kind=jprb)), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nlon * nz / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nclv * nlon / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' else: # pylint: disable-next=line-too-long - trafo_data_compare = f'c_sizeof(real(1, kind={kind_real})) * klon + c_sizeof(real(1, kind={kind_real})) * klev * klon' + trafo_data_compare = f'max(c_sizeof(real(1, kind={kind_real})), 8) * klon + ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8) * klev * klon' # pylint: disable-next=line-too-long - trafo_data_compare += f'+ c_sizeof(real(1, kind={kind_real})) * nclv + c_sizeof(real(1, kind={kind_real})) * klon * nclv' + trafo_data_compare += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nclv + ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8) * klon * nclv' if generate_driver_stack: - stack_size = f'3 * c_sizeof(real(1, kind={kind_real})) * nlon / c_sizeof(real(1, kind=jprb))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nlon * nz / c_sizeof(real(1, kind=jprb))' - stack_size += f'+ 2 * c_sizeof(real(1, kind={kind_real})) / c_sizeof(real(1, kind=jprb))' + stack_size = f'3 * max(c_sizeof(real(1, kind={kind_real})), 8) * nlon / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nlon * nz / ' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' + stack_size += f'+ 2 * max(c_sizeof(real(1, kind={kind_real})), 8) / ' \ + f'max(c_sizeof(real(1, 
kind=jprb)), 8)' else: - stack_size = f'3 * c_sizeof(real(1, kind={kind_real})) * nlon / c_sizeof(real(1, kind={kind_real}))' - stack_size += f'+ c_sizeof(real(1, kind={kind_real})) * nlon * nz / c_sizeof(real(1, kind={kind_real}))' - stack_size += f'+ 2 * c_sizeof(real(1, kind={kind_real})) / c_sizeof(real(1, kind={kind_real}))' + stack_size = f'3 * max(c_sizeof(real(1, kind={kind_real})), 8) * nlon / ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)' + stack_size += f'+ max(c_sizeof(real(1, kind={kind_real})), 8) * nlon * nz / ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)' + stack_size += f'+ 2 * max(c_sizeof(real(1, kind={kind_real})), 8) / ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)' assert kernel_item.trafo_data[transformation._key]['stack_size'] == trafo_data_compare assert all(v.scope is None for v in @@ -324,7 +344,7 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim, if f'ip_tmp{tmp_index}' == assign.lhs: assign_idx[f'tmp{tmp_index}_ptr_assign'] = idx elif assign.lhs == 'ylstack%l' and 'ylstack%l' in assign.rhs and 'c_sizeof' in assign.rhs: - _size = str(assign.rhs).lower().replace(f'*c_sizeof(real(1, kind={kind_real}))', '') + _size = str(assign.rhs).lower().replace(f'*max(c_sizeof(real(1, kind={kind_real})), 8)', '') _size = _size.replace('ylstack%l + ', '') # Stack increment for tmp1, tmp2, tmp5 (and tmp3, tmp4 if no alloc_dims provided) @@ -363,7 +383,8 @@ def test_pool_allocator_temporaries(frontend, generate_driver_stack, block_dim, @pytest.mark.parametrize('frontend', available_frontends()) @pytest.mark.parametrize('directive', [None, 'openmp', 'openacc']) -def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directive): +@pytest.mark.parametrize('stack_insert_pragma', [False, True]) +def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directive, stack_insert_pragma): if directive == 'openmp': driver_loop_pragma1 = '!$omp parallel default(shared) private(b) 
firstprivate(a)\n !$omp do' driver_end_loop_pragma1 = '!$omp end do\n !$omp end parallel' @@ -383,6 +404,12 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi driver_end_loop_pragma2 = '' kernel_pragma = '' + if stack_insert_pragma: + stack_size_location_pragma = '!$loki stack-insert' + else: + stack_size_location_pragma = '' + + fcode_parkind_mod = """ module parkind1 implicit none @@ -401,6 +428,10 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi real(kind=jprb), intent(inout) :: field1(nlon, nb) real(kind=jprb), intent(inout) :: field2(nlon, nz, nb) integer :: a,b + + ! a = 1, necessary to check loki stack-insert pragma + a = 1 + {stack_size_location_pragma} {driver_loop_pragma1} do b=1,nb @@ -518,9 +549,9 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi assert transformation._key == 'some_key' assert transformation._key in kernel_item.trafo_data # pylint: disable-next=line-too-long - assert kernel_item.trafo_data[transformation._key]['stack_size'] == f'c_sizeof(real(1, kind={kind_real}))*klon + c_sizeof(real(1, kind={kind_real}))*klev*klon + 2*c_sizeof(int(1, kind={kind_int}))*klon + c_sizeof(logical(true, kind={kind_log}))*klev' + assert kernel_item.trafo_data[transformation._key]['stack_size'] == f'max(c_sizeof(real(1, kind={kind_real})), 8)*klon + max(c_sizeof(real(1, kind={kind_real})), 8)*klev*klon + 2*max(c_sizeof(int(1, kind={kind_int})), 8)*klon + max(c_sizeof(logical(true, kind={kind_log})), 8)*klev' # pylint: disable-next=line-too-long - assert kernel2_item.trafo_data[transformation._key]['stack_size'] == f'3*c_sizeof(real(1, kind={kind_real}))*klev*klon' + assert kernel2_item.trafo_data[transformation._key]['stack_size'] == f'3*max(c_sizeof(real(1, kind={kind_real})), 8)*klev*klon' assert all(v.scope is None for v in FindVariables().visit(kernel_item.trafo_data[transformation._key]['stack_size'])) assert all(v.scope is None for v in @@ -531,6 
+562,12 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi # driver = scheduler['#driver'].routine + stack_order = FindNodes(Assignment).visit(driver.body) + if stack_insert_pragma: + assert stack_order[0].lhs == "a" + else: + assert stack_order[0].lhs == "ISTSZ" + # Has the stack module been imported? check_stack_module_import(driver) @@ -547,9 +584,12 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi assert calls[0].arguments == ('1', 'nlon', 'nlon', 'nz', 'field1(:,b)', 'field2(:,:,b)', 'ylstack') assert calls[1].arguments == ('1', 'nlon', 'nlon', 'nz', 'field2(:,:,b)', 'ylstack') - stack_size = f'max(c_sizeof(real(1, kind={kind_real}))*nlon + c_sizeof(real(1, kind={kind_real}))*nlon*nz + ' - stack_size += f'2*c_sizeof(int(1, kind={kind_int}))*nlon + c_sizeof(logical(true, kind={kind_log}))*nz,' - stack_size += f'3*c_sizeof(real(1, kind={kind_real}))*nz*nlon)/c_sizeof(real(1, kind=jprb))' + stack_size = f'max(max(c_sizeof(real(1, kind={kind_real})), 8)*nlon + ' \ + f'max(c_sizeof(real(1, kind={kind_real})), 8)*nlon*nz + ' + stack_size += f'2*max(c_sizeof(int(1, kind={kind_int})), 8)*nlon + ' \ + f'max(c_sizeof(logical(true, kind={kind_log})), 8)*nz,' + stack_size += f'3*max(c_sizeof(real(1, kind={kind_real})), 8)*nz*nlon)/' \ + f'max(c_sizeof(real(1, kind=jprb)), 8)' check_stack_created_in_driver(driver, stack_size, calls[0], 2) # Has the data sharing been updated? 
@@ -600,9 +640,9 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi # Let's check for the relevant "allocations" happening in the right order assign_idx = {} for idx, ass in enumerate(FindNodes(Assignment).visit(kernel.body)): - _size = str(ass.rhs).lower().replace(f'*c_sizeof(real(1, kind={kind_real}))', '') - _size = _size.replace(f'*c_sizeof(int(1, kind={kind_int}))', '') - _size = _size.replace(f'*c_sizeof(logical(.true., kind={kind_log}))', '') + _size = str(ass.rhs).lower().replace(f'*max(c_sizeof(real(1, kind={kind_real})), 8)', '') + _size = _size.replace(f'*max(c_sizeof(int(1, kind={kind_int})), 8)', '') + _size = _size.replace(f'*max(c_sizeof(logical(.true., kind={kind_log})), 8)', '') _size = _size.replace('ylstack%l + ', '') if ass.lhs == 'ylstack' and ass.rhs == 'ydstack': @@ -647,7 +687,8 @@ def test_pool_allocator_temporaries_kernel_sequence(frontend, block_dim, directi @pytest.mark.parametrize('frontend', available_frontends()) @pytest.mark.parametrize('directive', [None, 'openmp', 'openacc']) -def test_pool_allocator_temporaries_kernel_nested(frontend, block_dim, directive): +@pytest.mark.parametrize('stack_size_config', [None, 'dict', 'file']) +def test_pool_allocator_temporaries_kernel_nested(frontend, block_dim, directive, stack_size_config): if directive == 'openmp': driver_pragma = '!$omp PARALLEL do PRIVATE(b)' driver_end_pragma = '!$omp end parallel do' @@ -757,22 +798,65 @@ def test_pool_allocator_temporaries_kernel_nested(frontend, block_dim, directive else: definitions = () - config = { - 'default': { - 'mode': 'idem', - 'role': 'kernel', - 'expand': True, - 'strict': True - }, - 'routine': [{ - 'name': 'driver', - 'role': 'driver', - 'real_kind': 'jwrb', - }] - } - - scheduler = Scheduler(paths=[basedir], config=SchedulerConfig.from_dict(config), frontend=frontend, - definitions=definitions) + stack_size = "400*nz*nlon" + + if stack_size_config == "dict": + config = { + 'default': { + 'mode': 'idem', + 
'role': 'kernel', + 'expand': True, + 'strict': True, + }, + 'routine': [{ + 'name': 'driver', + 'role': 'driver', + 'real_kind': 'jwrb', + 'trafo': { + 'TemporariesPoolAllocatorTransformation': { + 'stack-size': f"2*{stack_size}" + } + } + }] + } + elif stack_size_config == "file": + config_str = f""" +[default] +mode = 'idem' +role = 'kernel' +expand = true +strict = true + +[[routine]] +name = 'driver' +role = 'driver' +real_kind = 'jwrb' +[[routine.trafo]] +[[routine.trafo.TemporariesPoolAllocatorTransformation]] +stack-size = "4*{stack_size}" + """.strip() + (basedir / 'loki_config.config').write_text(config_str) + else: + config = { + 'default': { + 'mode': 'idem', + 'role': 'kernel', + 'expand': True, + 'strict': True + }, + 'routine': [{ + 'name': 'driver', + 'role': 'driver', + 'real_kind': 'jwrb', + }] + } + + if stack_size_config == "file": + scheduler = Scheduler(paths=[basedir], config=SchedulerConfig.from_file(basedir / 'loki_config.config'), + frontend=frontend, definitions=definitions) + else: + scheduler = Scheduler(paths=[basedir], config=SchedulerConfig.from_dict(config), frontend=frontend, + definitions=definitions) if frontend == OMNI: for item in scheduler.items: normalize_range_indexing(item.routine) @@ -794,9 +878,9 @@ def test_pool_allocator_temporaries_kernel_nested(frontend, block_dim, directive assert transformation._key in kernel_item.trafo_data # pylint: disable-next=line-too-long - assert kernel_item.trafo_data[transformation._key]['stack_size'] == f'c_sizeof(real(1, kind={kind_real}))*klon + 4*c_sizeof(real(1, kind={kind_real}))*klev*klon + 2*c_sizeof(int(1, kind={kind_int}))*klon + c_sizeof(logical(true, kind={kind_log}))*klev' + assert kernel_item.trafo_data[transformation._key]['stack_size'] == f'max(c_sizeof(real(1, kind={kind_real})), 8)*klon + 4*max(c_sizeof(real(1, kind={kind_real})), 8)*klev*klon + 2*max(c_sizeof(int(1, kind={kind_int})), 8)*klon + max(c_sizeof(logical(true, kind={kind_log})), 8)*klev' # pylint: 
disable-next=line-too-long - assert kernel2_item.trafo_data[transformation._key]['stack_size'] == f'3*c_sizeof(real(1, kind={kind_real}))*columns*levels' + assert kernel2_item.trafo_data[transformation._key]['stack_size'] == f'3*max(c_sizeof(real(1, kind={kind_real})), 8)*columns*levels' assert all(v.scope is None for v in FindVariables().visit(kernel_item.trafo_data[transformation._key]['stack_size'])) assert all(v.scope is None for v in @@ -821,11 +905,18 @@ def test_pool_allocator_temporaries_kernel_nested(frontend, block_dim, directive assert len(calls) == 1 assert calls[0].arguments == ('1', 'nlon', 'nlon', 'nz', 'field1(:,b)', 'field2(:,:,b)', 'ylstack') - stack_size = f'c_sizeof(real(1, kind={kind_real}))*nlon/c_sizeof(real(1, kind=jwrb)) +' - stack_size += f'4*c_sizeof(real(1, kind={kind_real}))*nlon*nz/c_sizeof(real(1, kind=jwrb)) +' - stack_size += f'2*c_sizeof(int(1, kind={kind_int}))*nlon/c_sizeof(real(1, kind=jwrb)) +' - stack_size += f'c_sizeof(logical(true, kind={kind_log}))*nz/c_sizeof(real(1, kind=jwrb))' - check_stack_created_in_driver(driver, stack_size, calls[0], 1, kind_real='jwrb') + if stack_size_config == "dict": + stack_size = '2*400*nz*nlon / MAX(C_SIZEOF(REAL(1, kind=JWRB)), 8)' + elif stack_size_config == "file": + stack_size = '4*400*nz*nlon / MAX(C_SIZEOF(REAL(1, kind=JWRB)), 8)' + else: + stack_size = f'max(c_sizeof(real(1, kind={kind_real})), 8)*nlon/max(c_sizeof(real(1, kind=jwrb)), 8) +' + stack_size += f'4*max(c_sizeof(real(1, kind={kind_real})), 8)*nlon*nz/max(c_sizeof(real(1, kind=jwrb)), 8) +' + stack_size += f'2*max(c_sizeof(int(1, kind={kind_int})), 8)*nlon/max(c_sizeof(real(1, kind=jwrb)), 8) +' + stack_size += f'max(c_sizeof(logical(true, kind={kind_log})), 8)*nz/max(c_sizeof(real(1, kind=jwrb)), 8)' + # can't use 'simplify(...)' on expression that was generated by 'parse_fparser_expression' containing e.g. 
@pytest.mark.parametrize('frontend', available_frontends())
def test_pool_allocator_args_vs_kwargs(frontend, block_dim):
    """
    Check how the pool allocator passes the stack argument to driver calls
    that use positional arguments, keyword arguments, or a mix of both.

    The driver issues five calls. The assertions below pin that the stack is
    appended as the keyword argument ``YDSTACK=YLSTACK`` whenever the call
    already uses keyword arguments, and as the positional argument
    ``YLSTACK`` (placed before the optional ``opt`` argument, if present)
    when the call is purely positional.
    """
    fcode_driver = """
subroutine driver(NLON, NZ, NB, FIELD1, FIELD2)
    use kernel_mod, only: kernel, kernel2
    use parkind1, only : jpim
    implicit none
    INTEGER, PARAMETER :: JWRB = SELECTED_REAL_KIND(13,300)
    INTEGER, INTENT(IN) :: NLON, NZ, NB
    real(kind=jwrb), intent(inout) :: field1(nlon, nb)
    real(kind=jwrb), intent(inout) :: field2(nlon, nz, nb)
    integer :: b
    real(kind=jwrb) :: opt
    do b=1,nb
        call KERNEL(start=1, end=nlon, klon=nlon, klev=nz, field1=field1(:,b), field2=field2(:,:,b))
        call KERNEL2(1, nlon, nlon, nz, field2=field2(:,:,b))
        call KERNEL2(1, nlon, nlon, nz, field2(:,:,b))
        call KERNEL2(1, nlon, nlon, nz, field2=field2(:,:,b), opt_arg=opt)
        call KERNEL2(1, nlon, nlon, nz, field2(:,:,b), opt)
    end do
end subroutine driver
    """.strip()

    fcode_kernel = """
module kernel_mod
    implicit none
contains
    subroutine kernel(start, end, klon, klev, field1, field2)
        use parkind1, only : jpim, jplm
        implicit none
        integer, parameter :: jwrb = selected_real_kind(13,300)
        integer, intent(in) :: start, end, klon, klev
        real(kind=jwrb), intent(inout) :: field1(klon)
        real(kind=jwrb), intent(inout) :: field2(klon,klev)
        real(kind=jwrb) :: tmp1(klon)
        real(kind=jwrb) :: tmp2(klon, klev)
        integer(kind=jpim) :: tmp3(klon*2)
        logical(kind=jplm) :: tmp4(klev)
        integer :: jk, jl

        do jk=1,klev
            tmp1(jl) = 0.0_jwrb
            do jl=start,end
                tmp2(jl, jk) = field2(jl, jk)
                tmp1(jl) = field2(jl, jk)
            end do
            field1(jl) = tmp1(jl)
            tmp4(jk) = .true.
        end do

        do jl=start,end
            tmp3(jl) = 1_jpim
            tmp3(jl+klon) = 1_jpim
        enddo

        call kernel2(start, end, klon, klev, field2)
    end subroutine kernel
    subroutine kernel2(start, end, columns, levels, field2, opt_arg)
        implicit none
        integer, parameter :: jwrb = selected_real_kind(13,300)
        integer, intent(in) :: start, end, columns, levels
        real(kind=jwrb), intent(inout) :: field2(columns,levels)
        real(kind=jwrb) :: tmp1(2*columns, levels), tmp2(columns, levels)
        real(kind=jwrb), optional :: opt_arg
        integer :: jk, jl

        do jk=1,levels
            do jl=start,end
                tmp1(jl, jk) = field2(jl, jk)
                tmp1(jl+columns, jk) = field2(jl, jk)*2._jwrb
                tmp2(jl, jk) = tmp1(jl, jk) + 1._jwrb
                field2(jl, jk) = tmp2(jl, jk)
            end do
        end do
    end subroutine kernel2

end module kernel_mod
    """.strip()

    config = {
        'default': {
            'mode': 'idem',
            'role': 'kernel',
            'expand': True,
            'strict': True
        },
        'routine': [{
            'name': 'driver',
            'role': 'driver'
        }]
    }

    # Expected (positional arguments, keyword arguments) for each driver call,
    # listed in the order the calls appear in the driver body.
    expected_calls = (
        ((), (('start', 1), ('end', 'nlon'), ('klon', 'nlon'), ('klev', 'nz'),
              ('field1', 'field1(:, b)'), ('field2', 'field2(:, :, b)'), ('YDSTACK', 'YLSTACK'))),
        (('1', 'nlon', 'nlon', 'nz'),
         (('field2', 'field2(:, :, b)'), ('YDSTACK', 'YLSTACK'))),
        (('1', 'nlon', 'nlon', 'nz', 'field2(:, :, b)', 'YLSTACK'), ()),
        (('1', 'nlon', 'nlon', 'nz'),
         (('field2', 'field2(:, :, b)'), ('YDSTACK', 'YLSTACK'), ('opt_arg', 'opt'))),
        (('1', 'nlon', 'nlon', 'nz', 'field2(:, :, b)', 'YLSTACK', 'opt'), ()),
    )

    basedir = gettempdir() / 'test_pool_allocator_args_vs_kwargs'
    basedir.mkdir(exist_ok=True)
    try:
        (basedir / 'driver.F90').write_text(fcode_driver)
        (basedir / 'kernel.F90').write_text(fcode_kernel)

        scheduler = Scheduler(paths=[basedir], config=SchedulerConfig.from_dict(config), frontend=frontend)

        if frontend == OMNI:
            for item in scheduler.items:
                normalize_range_indexing(item.routine)

        transformation = TemporariesPoolAllocatorTransformation(block_dim=block_dim)
        scheduler.process(transformation=transformation, reverse=True)

        kernel = scheduler['kernel_mod#kernel'].routine
        kernel2 = scheduler['kernel_mod#kernel2'].routine
        driver = scheduler['#driver'].routine

        # Both kernels must have received the stack dummy argument
        assert 'ydstack' in kernel.arguments
        assert 'ydstack' in kernel2.arguments

        calls = FindNodes(CallStatement).visit(driver.body)
        # Guard against zip() silently truncating if a call went missing
        assert len(calls) == len(expected_calls)
        for call, (arguments, kwarguments) in zip(calls, expected_calls):
            assert call.arguments == arguments
            assert call.kwarguments == kwarguments
    finally:
        # Always clean up, so a failing assertion does not leave stale
        # sources behind for the next run (the directory is re-used because
        # of ``mkdir(exist_ok=True)``).
        rmtree(basedir)
Pragma, SubroutineItem, FindInlineCalls, Interface, ProcedureSymbol, LogicalNot, dataflow_analysis_attached, + expression, parse_fparser_expression, Scope ) __all__ = ['TemporariesPoolAllocatorTransformation'] @@ -121,6 +122,7 @@ def __init__(self, block_dim, self.local_ptr_var_name_pattern = local_ptr_var_name_pattern self.directive = directive self.check_bounds = check_bounds + self.trafo_dict = {} if key: self._key = key @@ -129,7 +131,11 @@ def transform_subroutine(self, routine, **kwargs): role = kwargs['role'] item = kwargs.get('item', None) + ignore = item.ignore if item else () targets = kwargs.get('targets', None) + self.trafo_dict = item.config.get('trafo', {}) + if self.trafo_dict: + self.trafo_dict = self.trafo_dict.get(self._key, {}) self.stack_type_kind = 'JPRB' if item: @@ -159,7 +165,7 @@ def transform_subroutine(self, routine, **kwargs): self.import_allocation_types(routine, item) self.create_pool_allocator(routine, stack_size) - self.inject_pool_allocator_into_calls(routine, targets) + self.inject_pool_allocator_into_calls(routine, targets, ignore) @staticmethod def import_c_sizeof(routine): @@ -292,15 +298,22 @@ def _get_stack_storage_and_size_var(self, routine, stack_size): stack_type_bytes = Cast(name='REAL', expression=Literal(1), kind=_kind) stack_type_bytes = InlineCall(Variable(name='C_SIZEOF'), parameters=as_tuple(stack_type_bytes)) + stack_type_bytes = InlineCall(function=Variable(name='MAX'), + parameters=(stack_type_bytes, Literal(8)), kw_parameters=()) + if "stack-size" in self.trafo_dict: + scope = Scope() + #TODO: parse_fparser_expression only for fparser ... 
+ stack_size = parse_fparser_expression(self.trafo_dict["stack-size"], scope=scope) stack_size_assign = Assignment(lhs=stack_size_var, rhs=Quotient(stack_size, stack_type_bytes)) + body_prepend += [stack_size_assign] # Stack-size no longer guaranteed to be a multiple of 8-bytes, so we have to check here padding = Assignment(lhs=stack_size_var, rhs=Sum((stack_size_var, Literal(1)))) stack_size_check = Conditional( condition=LogicalNot(Comparison(InlineCall(Variable(name='MOD'), - parameters=(stack_size, stack_type_bytes)), - '==', Literal(0))), inline=True, body=(padding,), + parameters=(stack_size, stack_type_bytes)), + '==', Literal(0))), inline=True, body=(padding,), else_body=None ) body_prepend += [stack_size_check] @@ -345,12 +358,24 @@ def _get_stack_storage_and_size_var(self, routine, stack_size): if variables_append: routine.variables += as_tuple(variables_append) if body_prepend: - routine.body.prepend(body_prepend) + if not self._insert_stack_at_loki_pragma(routine, body_prepend): + routine.body.prepend(body_prepend) if body_append: routine.body.append(body_append) return stack_storage, stack_size_var + @staticmethod + def _insert_stack_at_loki_pragma(routine, insert): + pragma_map = {} + for pragma in FindNodes(Pragma).visit(routine.body): + if pragma.keyword == 'loki' and 'stack-insert' in pragma.content: + pragma_map[pragma] = insert + if pragma_map: + routine.body = Transformer(pragma_map).visit(routine.body) + return True + return False + def _determine_stack_size(self, routine, successors, local_stack_size=None, item=None): """ Utility routine to determine the stack size required for the given :data:`routine`, @@ -476,10 +501,19 @@ def _create_stack_allocation(self, stack_ptr, stack_end, ptr_var, arr, stack_siz # Build expression for array size in bytes dim = arr.dimensions[0] + # TODO: import RangeIndex instead of expression ? 
+ if isinstance(dim, expression.symbols.RangeIndex): + dim = Sum((dim.upper, Product((-1, dim.lower)), 1)) for d in arr.dimensions[1:]: - dim = Product((dim, d)) - arr_size = Product((dim, InlineCall(Variable(name='C_SIZEOF'), - parameters=as_tuple(self._get_c_sizeof_arg(arr))))) + _dim = d + if isinstance(_dim, expression.symbols.RangeIndex): + _dim = Sum((_dim.upper, Product((-1, _dim.lower)), 1)) + dim = Product((dim, _dim)) + arr_type_bytes = InlineCall(Variable(name='C_SIZEOF'), + parameters=as_tuple(self._get_c_sizeof_arg(arr))) + arr_type_bytes = InlineCall(function=Variable(name='MAX'), + parameters=(arr_type_bytes, Literal(8)), kw_parameters=()) + arr_size = Product((dim, arr_type_bytes)) # Increment stack size stack_size = simplify(Sum((stack_size, arr_size))) @@ -644,6 +678,8 @@ def create_pool_allocator(self, routine, stack_size): _real_size_bytes = Cast(name='REAL', expression=Literal(1), kind=_kind) _real_size_bytes = InlineCall(Variable(name='C_SIZEOF'), parameters=as_tuple(_real_size_bytes)) + _real_size_bytes = InlineCall(function=Variable(name='MAX'), + parameters=(_real_size_bytes, Literal(8)), kw_parameters=()) stack_incr = Assignment( lhs=stack_end, rhs=Sum((stack_ptr, Product((stack_size_var, _real_size_bytes)))) ) @@ -659,25 +695,49 @@ def create_pool_allocator(self, routine, stack_size): f'Could not find a block dimension loop in {routine.name}; no stack pointer assignment inserted.' 
) - def inject_pool_allocator_into_calls(self, routine, targets): + def inject_pool_allocator_into_calls(self, routine, targets, ignore): """ Add the pool allocator argument into subroutine calls """ call_map = {} stack_var = self._get_local_stack_var(routine) for call in FindNodes(CallStatement).visit(routine.body): - if call.name in targets: + if call.name in targets or call.routine.name.lower() in ignore: #['surfpp', 'surfexcdriver', 'surfseb']: # If call is declared via an explicit interface, the ProcedureSymbol corresponding to the call is the # interface block rather than the Subroutine itself. This means we have to update the interface block # accordingly if call.name in [s for i in FindNodes(Interface).visit(routine.spec) for s in i.symbols]: _ = self._get_stack_arg(call.routine) - - if call.routine != BasicType.DEFERRED and self.stack_argument_name in call.routine.arguments: - arg_idx = call.routine.arguments.index(self.stack_argument_name) - arguments = call.arguments - call_map[call] = call.clone(arguments=arguments[:arg_idx] + (stack_var,) + arguments[arg_idx:]) - + if call.routine != BasicType.DEFERRED and self.stack_local_var_name not in call.arguments: + if self.stack_argument_name in call.routine.arguments: + arg_idx = call.routine.arguments.index(self.stack_argument_name) + arguments = call.arguments + kwarguments = call.kwarguments + if arg_idx <= len(arguments): + arguments = arguments[:arg_idx] + (stack_var,) + arguments[arg_idx:] + else: + arg_idx -= len(arguments) + kwarguments = kwarguments[:arg_idx] + ((self.stack_argument_name, stack_var),) + \ + kwarguments[arg_idx:] + call_map[call] = call.clone(arguments=arguments, kwarguments=kwarguments) + else: + arguments = call.arguments + kwarguments = call.kwarguments + arg_pos = [call.routine.arguments.index(arg) for arg in call.routine.arguments + if arg.type.optional] + if arg_pos: + if arg_pos[0] < len(arguments): + arguments = arguments[:arg_pos[0]] + (stack_var,) + arguments[arg_pos[0]:] + 
else: + arg_idx = arg_pos[0] - len(arguments) + kwarguments = kwarguments[:arg_idx] + ((self.stack_argument_name, stack_var),) + \ + kwarguments[arg_idx:] + else: + if kwarguments: + kwarguments += ((self.stack_argument_name, stack_var),) + else: + arguments += (stack_var,) + call_map[call] = call.clone(arguments=as_tuple(arguments), kwarguments=as_tuple(kwarguments)) if call_map: routine.body = Transformer(call_map).visit(routine.body)