Skip to content

Commit

Permalink
#2730 Fix issues with LFRic OMP offloading and add it to the integrat…
Browse files Browse the repository at this point in the history
…ion tests
  • Loading branch information
sergisiso committed Oct 3, 2024
1 parent fcc09be commit ae26a8b
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 73 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/lfric_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,58 @@ jobs:
pip install .[test]
pip install jinja2
# PSyclone, compile and run MetOffice gungho_model on GPU
- name: LFRic GungHo with OpenMP offload
run: |
# Set up environment
source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh
spack load lfric-build-environment%nvhpc
source .runner_venv/bin/activate
export PSYCLONE_LFRIC_DIR=${GITHUB_WORKSPACE}/examples/lfric/scripts
export PSYCLONE_CONFIG_FILE=${PSYCLONE_LFRIC_DIR}/KGOs/lfric_psyclone.cfg
# The LFRic source must be patched to workaround bugs in the NVIDIA
# compiler's namelist handling.
rm -rf ${HOME}/LFRic/gpu_build
mkdir -p ${HOME}/LFRic/gpu_build
cp -r ${HOME}/LFRic/lfric_apps_${LFRIC_APPS_REV} ${HOME}/LFRic/gpu_build/lfric_apps
cp -r ${HOME}/LFRic/lfric_core_50869 ${HOME}/LFRic/gpu_build/lfric
cd ${HOME}/LFRic/gpu_build
patch -p1 < ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_${LFRIC_APPS_REV}_nvidia.patch
# Update the compiler definitions to build for GPU
cp ${PSYCLONE_LFRIC_DIR}/KGOs/nvfortran_acc.mk lfric/infrastructure/build/fortran/nvfortran.mk
cp ${PSYCLONE_LFRIC_DIR}/KGOs/nvc++.mk lfric/infrastructure/build/cxx/.
# Update the PSyclone commands to ensure transformed kernels are written
# to working directory.
cp ${PSYCLONE_LFRIC_DIR}/KGOs/psyclone.mk lfric/infrastructure/build/psyclone/.
# Update dependencies.sh to point to our patched lfric core.
sed -i -e 's/export lfric_core_sources=.*$/export lfric_core_sources\=\/home\/gh_runner\/LFRic\/gpu_build\/lfric/' lfric_apps/dependencies.sh
export LFRIC_DIR=${HOME}/LFRic/gpu_build/lfric_apps
export OPT_DIR=${LFRIC_DIR}/applications/gungho_model/optimisation/psyclone-test
cd ${LFRIC_DIR}
# PSyclone scripts must now be under 'optimisation' and be called 'global.py'
mkdir -p ${OPT_DIR}
cp ${PSYCLONE_LFRIC_DIR}/gpu_offloading.py ${OPT_DIR}/global.py
# Clean previous version and compile again
rm -rf applications/gungho_model/working
OFFLOAD_USING_OMP=true ./build/local_build.py -a gungho_model -p psyclone-test
cd applications/gungho_model/example
cp ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_gungho_configuration_4its.nml configuration.nml
mpirun -n 1 ../bin/gungho_model configuration.nml |& tee output.txt
python ${PSYCLONE_LFRIC_DIR}/compare_ouput.py ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_gungho_configuration_4its_checksums.txt gungho_model-checksums.txt
cat timer.txt
export VAR_TIME=$(grep "gungho_model" timer.txt | cut -d'|' -f5)
export VAR_HALOS=$(grep "gungho_model" halo_calls_counter.txt | cut -d'|' -f5)
echo $GITHUB_REF_NAME $GITHUB_SHA $VAR_TIME $VAR_HALOS >> ${HOME}/store_results/lfric_omp_performance_history
${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \
"mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \
--quiet --apiVersion 1 --username ${{ secrets.MONGODB_USERNAME }} \
--password ${{ secrets.MONGODB_PASSWORD }} \
--eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'",
github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'",
ci_test: "LFRic OpenMP offloading", lfric_apps_version: '"$LFRIC_APPS_REV"', system: "GlaDos",
compiler:"spack-nvhpc-24.5", date: new Date(), elapsed_time: '"$VAR_TIME"',
num_of_halo_exchanges: '"$VAR_HALOS"'})'
# PSyclone, compile and run MetOffice gungho_model on GPU
- name: LFRic GungHo with OpenACC offload
run: |
Expand Down
59 changes: 0 additions & 59 deletions examples/lfric/scripts/KGOs/lfric_3269_nvidia.patch
Original file line number Diff line number Diff line change
Expand Up @@ -57,65 +57,6 @@ index 19c9cff9..b5cd3014 100644
$(call MESSAGE,Compiled,$<)


diff --git a/lfric/infrastructure/build/cxx/nvc++.mk b/lfric/infrastructure/build/cxx/nvc++.mk
new file mode 100644
index 00000000..13b17a10
--- /dev/null
+++ b/lfric/infrastructure/build/cxx/nvc++.mk
@@ -0,0 +1,9 @@
+##############################################################################
+# (c) Crown copyright 2017 Met Office. All rights reserved.
+# The file LICENCE, distributed with this code, contains details of the terms
+# under which the code may be used.
+##############################################################################
+
+$(info ** Chosen NVC++ compiler)
+
+CXX_RUNTIME_LIBRARY=stdc++
diff --git a/lfric/infrastructure/build/fortran/nvfortran.mk b/lfric/infrastructure/build/fortran/nvfortran.mk
new file mode 100644
index 00000000..cfed52c1
--- /dev/null
+++ b/lfric/infrastructure/build/fortran/nvfortran.mk
@@ -0,0 +1,38 @@
+##############################################################################
+# Copyright (c) 2017, Met Office, on behalf of HMSO and Queen's Printer
+# For further details please refer to the file LICENCE.original which you
+# should have received as part of this distribution.
+##############################################################################
+# Various things specific to the Portland Fortran compiler.
+##############################################################################
+#
+# This macro is evaluated now (:= syntax) so it may be used as many times as
+# desired without wasting time rerunning it.
+#
+F_MOD_DESTINATION_ARG = -module$(SPACE)
+OPENMP_ARG = -mp
+
+FFLAGS_COMPILER =
+FFLAGS_NO_OPTIMISATION = -O0
+FFLAGS_SAFE_OPTIMISATION = -O2
+FFLAGS_RISKY_OPTIMISATION = -O4
+FFLAGS_DEBUG = -g -traceback
+FFLAGS_RUNTIME = -Mchkptr -Mchkstk
+# Option for checking code meets Fortran standard (not available for PGI)
+FFLAGS_FORTRAN_STANDARD =
+
+LDFLAGS_COMPILER = -g
+
+FPP = cpp -traditional-cpp
+FPPFLAGS = -P
+FC = mpif90
+
+# FS#34981 (nvbug 4648082)
+science/src/um/src/atmosphere/large_scale_precipitation/ls_ppnc.o: private FFLAGS_RUNTIME = -Mchkstk
+
+# FS#35751
+mesh/create_mesh_mod.o: private FFLAGS_RUNTIME = -Mchkstk
+
+# 24.3
+science/src/socrates/src/cosp_github/subsample_and_optics_example/optics/quickbeam_optics/optics_lib.o: private FFLAGS_SAFE_OPTIMISATION = -O1
+science/src/socrates/src/cosp_github/subsample_and_optics_example/optics/quickbeam_optics/optics_lib.o: private FFLAGS_RISKY_OPTIMISATION = -O1
diff --git a/lfric/infrastructure/build/tools/DependencyRules b/lfric/infrastructure/build/tools/DependencyRules
index 9d4db390..e37384fc 100755
--- a/lfric/infrastructure/build/tools/DependencyRules
Expand Down
34 changes: 25 additions & 9 deletions examples/lfric/scripts/gpu_offloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,10 @@ def trans(psy):

if OFFLOAD_USING_OMP:
# Use OpenMP offloading
loop_offloading_trans = OMPLoopTrans()
loop_offloading_trans.omp_directive = "teamsdistributeparalleldo"
loop_offloading_trans = OMPLoopTrans(
omp_directive="teamsdistributeparalleldo",
omp_schedule="none"
)
kernels_trans = None
gpu_region_trans = OMPTargetTrans()
gpu_annotation_trans = OMPDeclareTargetTrans()
Expand Down Expand Up @@ -133,12 +135,16 @@ def trans(psy):
print(f"Failed to annotate '{kern.name}' with "
f"GPU-enabled directive due to:\n"
f"{err.value}")
# For annotated or inlined kernels we could attempt to
# provide compile-time dimensions for the temporary
# arrays and convert unsupported intrinsics to code.

# Add GPU offloading to loops unless they are over colours or are null.
schedule = invoke.schedule
for loop in schedule.walk(Loop):
if offload and all(kern.name.lower() not in failed_to_offload for
kern in loop.kernels()):
kernel_names = [k.name.lower() for k in loop.kernels()]
if offload and all(name not in failed_to_offload for name in
kernel_names):
try:
if loop.loop_type == "colours":
pass
Expand All @@ -151,13 +157,23 @@ def trans(psy):
loop, options={"independent": True})
gpu_region_trans.apply(loop.ancestor(Directive))
if loop.loop_type == "dof":
if kernels_trans:
# Loops over dofs can contain reductions, so we
# don't add loop parallelism (is not supported yet)
# but we can add 'kernel' parallelism if available
# Loops over dofs can contain reductions that ...
if OFFLOAD_USING_OMP:
# with loop offloading will be detected by the
# dependency analysis and raise TransformationErrors
loop_offloading_trans.apply(
loop, options={"independent": True})
gpu_region_trans.apply(loop.ancestor(Directive))
elif kernels_trans:
# if kernel offloading is available it should
# manage them
kernels_trans.apply(loop)
# Alternatively we could use loop parallelism with
# reduction clauses
print(f"Successfully offloaded loop with {kernel_names}")
except TransformationError as err:
print(f"Failed to offload loop because: {err}")
print(f"Failed to offload loop with {kernel_names} "
f"because: {err}")

# Apply OpenMP thread parallelism for any kernels we've not been able
# to offload to GPU.
Expand Down
3 changes: 2 additions & 1 deletion src/psyclone/f2pygen.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ class OMPDirective(Directive):
'''
def __init__(self, root, line, position, dir_type):
self._types = ["parallel do", "parallel", "do", "master", "single",
"taskloop", "taskwait", "declare", "target"]
"taskloop", "taskwait", "declare", "target", "teams",
"teams distribute parallel do"]
self._positions = ["begin", "end"]

super(OMPDirective, self).__init__(root, line, position, dir_type)
Expand Down
33 changes: 29 additions & 4 deletions src/psyclone/psyir/nodes/omp_directives.py
Original file line number Diff line number Diff line change
Expand Up @@ -1442,7 +1442,8 @@ def gen_code(self, parent):
for call in reprod_red_call_list:
call.reduction_sum_loop(parent)

self.gen_post_region_code(parent)
if not self.ancestor(OMPRegionDirective):
self.gen_post_region_code(parent)

def lower_to_language_level(self):
'''
Expand Down Expand Up @@ -2302,7 +2303,7 @@ def gen_code(self, parent):
# Add directive to the f2pygen tree
parent.add(
DirectiveGen(
parent, "omp", "begin", "parallel do", ", ".join(
parent, "omp", "begin", self.begin_string()[4:], " ".join(
text for text in [default_str, private_str, fprivate_str,
schedule_str, self._reduction_string()]
if text)))
Expand All @@ -2312,10 +2313,24 @@ def gen_code(self, parent):

# make sure the directive occurs straight after the loop body
position = parent.previous_loop()
parent.add(DirectiveGen(parent, *self.end_string().split()),

# DirectiveGen only accepts 3 terms, e.g. "omp end loop", so for longer
# directives, e.g. "omp end teams distribute parallel do", we split them
# between arguments and content (which is an additional string appended
# at the end)
terms = self.end_string().split()
if len(terms) > 3:
arguments = terms[:3]
content = " ".join(terms[3:])
else:
arguments = terms
content = ""

parent.add(DirectiveGen(parent, *arguments, content=content),
position=["after", position])

self.gen_post_region_code(parent)
if not self.ancestor(OMPRegionDirective):
self.gen_post_region_code(parent)

def lower_to_language_level(self):
'''
Expand Down Expand Up @@ -2415,6 +2430,16 @@ def gen_code(self, parent):
# Generate the code for this Directive
parent.add(DirectiveGen(parent, "omp", "begin", "target"))

# Generate the code for all of this node's children
for child in self.dir_body:
child.gen_code(parent)

# Generate the end code for this node
parent.add(DirectiveGen(parent, "omp", "end", "target", ""))

if not self.ancestor(OMPRegionDirective):
self.gen_post_region_code(parent)


class OMPLoopDirective(OMPRegionDirective):
''' Class for the !$OMP LOOP directive that specifies that the iterations
Expand Down

0 comments on commit ae26a8b

Please sign in to comment.