Add frontier as a supported machine #124

Merged 5 commits on Jul 31, 2023
22 changes: 12 additions & 10 deletions mache/cime_machine_config/config_machines.xml
@@ -518,7 +518,7 @@
<machine MACH="spock">
<DESC>Spock. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL.</DESC>
<NODENAME_REGEX>.*spock.*</NODENAME_REGEX>
-<OS>CNL</OS>
+<OS>Linux</OS>
<COMPILERS>gnu,cray</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<PROJECT>cli133</PROJECT>
@@ -599,8 +599,8 @@
<machine MACH="frontier">
<DESC>Frontier exascale supercomputer at ORNL. 9408 nodes, Node: 4 AMD MI250X GPUs (2 GCDs) ~ 8 GPUs, 512 GB HDB2E, AMD EPYC 64 cores, 512GB DDR4 </DESC>
<NODENAME_REGEX>.*frontier.*</NODENAME_REGEX>
-<OS>CNL</OS>
-<COMPILERS>gnu,crayclang,amdclang,gnugpu,crayclanggpu,amdclanggpu</COMPILERS>
+<OS>Linux</OS>
+<COMPILERS>crayclang,gnu,amdclang,gnugpu,crayclanggpu,amdclanggpu</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<PROJECT>cli115</PROJECT>
<SAVE_TIMING_DIR>/lustre/orion/cli115/world-shared/frontier</SAVE_TIMING_DIR>
@@ -723,8 +723,8 @@
<machine MACH="crusher">
<DESC>Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs</DESC>
<NODENAME_REGEX>.*crusher.*</NODENAME_REGEX>
-<OS>CNL</OS>
-<COMPILERS>gnu,crayclang,amdclang,gnugpu,crayclanggpu,amdclanggpu</COMPILERS>
+<OS>Linux</OS>
+<COMPILERS>crayclang,gnu,amdclang,gnugpu,crayclanggpu,amdclanggpu</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<PROJECT>cli115</PROJECT>
<SAVE_TIMING_DIR>/lustre/orion/cli115/world-shared/crusher</SAVE_TIMING_DIR>
@@ -846,7 +846,7 @@
<machine MACH="crusher-scream-cpu">
<DESC>Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs</DESC>
<NODENAME_REGEX>.*crusher.*</NODENAME_REGEX>
-<OS>CNL</OS>
+<OS>Linux</OS>
<COMPILERS>crayclang-scream</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<PROJECT>CLI133_crusher</PROJECT>
@@ -945,7 +945,7 @@
<machine MACH="crusher-scream-gpu">
<DESC>Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs</DESC>
<NODENAME_REGEX>.*crusher.*</NODENAME_REGEX>
-<OS>CNL</OS>
+<OS>Linux</OS>
<COMPILERS>crayclang-scream</COMPILERS>
<MPILIBS>mpich</MPILIBS>
<PROJECT>CLI133_crusher</PROJECT>
@@ -2898,15 +2898,16 @@
<NTEST_PARALLEL_JOBS>4</NTEST_PARALLEL_JOBS>
<BATCH_SYSTEM>pbspro</BATCH_SYSTEM>
<SUPPORTED_BY>e3sm</SUPPORTED_BY>
-<MAX_TASKS_PER_NODE>104</MAX_TASKS_PER_NODE>
-<MAX_TASKS_PER_NODE compiler="oneapi-ifx">104</MAX_TASKS_PER_NODE>
+<MAX_TASKS_PER_NODE>208</MAX_TASKS_PER_NODE>
+<MAX_TASKS_PER_NODE compiler="oneapi-ifx">208</MAX_TASKS_PER_NODE>
<MAX_TASKS_PER_NODE compiler="oneapi-ifxgpu">104</MAX_TASKS_PER_NODE>
<MAX_MPITASKS_PER_NODE>104</MAX_MPITASKS_PER_NODE>
-<MAX_MPITASKS_PER_NODE compiler="oneapi-ifx">52</MAX_MPITASKS_PER_NODE>
+<MAX_MPITASKS_PER_NODE compiler="oneapi-ifx">104</MAX_MPITASKS_PER_NODE>
<MAX_MPITASKS_PER_NODE compiler="oneapi-ifxgpu">12</MAX_MPITASKS_PER_NODE>
<PROJECT_REQUIRED>FALSE</PROJECT_REQUIRED>
<mpirun mpilib="default">
<executable>mpiexec</executable>
+<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
<arguments>
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
@@ -2987,6 +2988,7 @@
<env name="OMP_TARGET_OFFLOAD">DISABLED</env><!--default OMP_TARGET_OFFLOAD=MANDATORY-->
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env>
<env name="FI_CXI_CQ_FILL_PERCENT">20</env>
<env name="MPIR_CVAR_ENABLE_GPU">0</env>
<env name="GPU_TILE_COMPACT"> </env>
</environment_variables>
<environment_variables SMP_PRESENT="TRUE" compiler="!gnu">
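For context on the `<mpirun>` block above: CIME substitutes the `{{ total_tasks }}`-style placeholders when it assembles the launch command. A minimal sketch of that expansion (not CIME's actual implementation; the task counts are hypothetical):

```python
# Hypothetical task counts; CIME derives the real values from the
# case's PE layout.
values = {'total_tasks': 416, 'tasks_per_node': 104}

# The <arguments> entries above, joined onto the executable.
args = '-np {total_tasks} --label -ppn {tasks_per_node}'.format(**values)
print('mpiexec ' + args)  # mpiexec -np 416 --label -ppn 104
```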
6 changes: 6 additions & 0 deletions mache/discover.py
@@ -38,6 +38,12 @@ def discover_machine(quiet=False):
        machine = 'cooley'
    elif hostname.startswith('cori'):
        machine = 'cori-haswell'
+    elif 'LMOD_SYSTEM_NAME' in os.environ:
+        hostname = os.environ['LMOD_SYSTEM_NAME']
+        if hostname == 'frontier':
+            # frontier's hostname is too generic to detect, so relying on
+            # LMOD_SYSTEM_NAME
+            machine = 'frontier'
Comment on lines +41 to +46 (Collaborator, Author):
Not ideal but this is what I could figure out.

    elif 'NERSC_HOST' in os.environ:
        hostname = os.environ['NERSC_HOST']
        if hostname == 'perlmutter':
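The new branch relies on `LMOD_SYSTEM_NAME` because, as the code comment notes, Frontier's hostnames are too generic for the regex-style matching used for other machines. A self-contained sketch of the fallback chain (simplified; only the branches visible in this diff are reproduced):

```python
import os
import socket


def discover_machine_sketch():
    # Simplified sketch of the fallback chain in mache's
    # discover_machine(); only branches visible in this diff appear.
    machine = None
    hostname = socket.gethostname()
    if hostname.startswith('cori'):
        machine = 'cori-haswell'
    elif 'LMOD_SYSTEM_NAME' in os.environ:
        # frontier's hostname is too generic to detect, so rely on
        # LMOD_SYSTEM_NAME instead
        if os.environ['LMOD_SYSTEM_NAME'] == 'frontier':
            machine = 'frontier'
    elif 'NERSC_HOST' in os.environ:
        if os.environ['NERSC_HOST'] == 'perlmutter':
            machine = 'perlmutter'
    return machine
```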
72 changes: 72 additions & 0 deletions mache/machines/frontier.cfg
@@ -0,0 +1,72 @@
# Options related to deploying an e3sm-unified conda environment on supported
# machines
[e3sm_unified]

# the unix group for permissions for the e3sm-unified conda environment
group = cli115

# the compiler set to use for system libraries and MPAS builds
compiler = gnu

# the system MPI library to use with the gnu compiler
mpi = mpich

# the path to the directory where activation scripts, the base environment, and
# system libraries will be deployed
base_path = /lustre/orion/cli115/world-shared/software/e3sm-unified

# whether to use system modules for hdf5, netcdf-c, netcdf-fortran and pnetcdf
# (spack modules are used otherwise)
use_system_hdf5_netcdf = True


# config options related to data needed by diagnostics software such as
# e3sm_diags and MPAS-Analysis
[diagnostics]

# The base path to the diagnostics directory
base_path = /lustre/orion/cli115/world-shared/diagnostics

# the unix group for permissions for diagnostics
group = cli115


# The parallel section describes options related to running jobs in parallel
[parallel]

# parallel system of execution: slurm, cobalt or single_node
system = slurm

# whether to use mpirun or srun to run a task
parallel_executable = srun

# cores per node on the machine
cores_per_node = 56

# account for running diagnostics jobs
account = cli115

# available partition(s) (default is the first)
partitions = batch


# Config options related to spack environments
[spack]

# whether to load modules from the spack yaml file before loading the spack
# environment
modules_before = False

# whether to load modules from the spack yaml file after loading the spack
# environment
modules_after = False

# whether the machine uses cray compilers
cray_compilers = True


# config options related to synchronizing files
[sync]

# the full hostname of the machine
hostname = frontier.olcf.ornl.gov
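Since the new machine file is plain INI syntax, its options can be read with Python's `configparser`. A minimal sketch, assuming the file is read directly from the path shown (mache itself layers defaults on top of machine files, so this is illustrative only):

```python
from configparser import ConfigParser

config = ConfigParser()
config.read('mache/machines/frontier.cfg')

# Options defined in the sections above, with standard configparser
# type coercion.
account = config.get('parallel', 'account')           # 'cli115'
cores = config.getint('parallel', 'cores_per_node')   # 56
cray = config.getboolean('spack', 'cray_compilers')   # True
print(account, cores, cray)
```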
19 changes: 19 additions & 0 deletions mache/spack/frontier_gnu_mpich.csh
@@ -0,0 +1,19 @@
module reset >& /dev/null
module switch PrgEnv-cray PrgEnv-gnu/8.3.3 >& /dev/null
module switch gcc gcc/11.2.0 >& /dev/null

{% if e3sm_lapack %}
module load cray-libsci/22.12.1.1
{% endif %}
{% if e3sm_hdf5_netcdf %}
module load cray-hdf5-parallel/1.12.2.1
module load cray-netcdf-hdf5parallel/4.9.0.1
module load cray-parallel-netcdf/1.12.3.1
{% endif %}

{% if e3sm_hdf5_netcdf %}
setenv NETCDF_C_PATH $CRAY_NETCDF_HDF5PARALLEL_PREFIX
setenv NETCDF_FORTRAN_PATH $CRAY_NETCDF_HDF5PARALLEL_PREFIX
setenv PNETCDF_PATH $CRAY_PARALLEL_NETCDF_PREFIX
{% endif %}
setenv HDF5_USE_FILE_LOCKING FALSE
19 changes: 19 additions & 0 deletions mache/spack/frontier_gnu_mpich.sh
@@ -0,0 +1,19 @@
module reset >& /dev/null
module switch PrgEnv-cray PrgEnv-gnu/8.3.3 >& /dev/null
module switch gcc gcc/11.2.0 >& /dev/null

{% if e3sm_lapack %}
module load cray-libsci/22.12.1.1
{% endif %}
{% if e3sm_hdf5_netcdf %}
module load cray-hdf5-parallel/1.12.2.1
module load cray-netcdf-hdf5parallel/4.9.0.1
module load cray-parallel-netcdf/1.12.3.1
{% endif %}

{% if e3sm_hdf5_netcdf %}
export NETCDF_C_PATH=$CRAY_NETCDF_HDF5PARALLEL_PREFIX
export NETCDF_FORTRAN_PATH=$CRAY_NETCDF_HDF5PARALLEL_PREFIX
export PNETCDF_PATH=$CRAY_PARALLEL_NETCDF_PREFIX
{% endif %}
export HDF5_USE_FILE_LOCKING=FALSE
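Both module scripts are Jinja2 templates: the `{% if e3sm_lapack %}` and `{% if e3sm_hdf5_netcdf %}` blocks are included or dropped at render time. A minimal sketch of rendering the bash variant (treating the flags as plain booleans is an assumption about how mache invokes Jinja2):

```python
from jinja2 import Template

with open('mache/spack/frontier_gnu_mpich.sh') as handle:
    template = Template(handle.read())

# With both flags on, the cray-libsci load, the HDF5/NetCDF module
# loads, and the NETCDF_*/PNETCDF_PATH exports all appear in the output.
script = template.render(e3sm_lapack=True, e3sm_hdf5_netcdf=True)
print(script)
```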
182 changes: 182 additions & 0 deletions mache/spack/frontier_gnu_mpich.yaml
@@ -0,0 +1,182 @@
spack:
  specs:
  - gcc
  - cray-mpich
{% if e3sm_lapack %}
  - cray-libsci
{% endif %}
{% if e3sm_hdf5_netcdf %}
  - hdf5
  - netcdf-c
  - netcdf-fortran
  - parallel-netcdf
{% endif %}
{{ specs }}
  concretizer:
    unify: when_possible
  packages:
    all:
      compiler: [gcc@11.2.0]
      providers:
        mpi: [cray-mpich@8.1.23]
{% if e3sm_lapack %}
        lapack: [cray-libsci@22.12.1.1]
{% endif %}
    bzip2:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    curl:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    gettext:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    libxml2:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    ncurses:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    openssl:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    perl:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    python:
      externals:
      - spec: python@3.9.13.1
        prefix: /opt/cray/pe/python/3.9.13.1
        modules:
        - cray-python/3.9.13.1
      buildable: false
    tar:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    xz:
      externals:
      - spec: [email protected]
        prefix: /usr
      buildable: false
    gcc:
      externals:
      - spec: gcc@11.2.0
        modules:
        - PrgEnv-gnu/8.3.3
        - gcc/11.2.0
        - craype/2.7.19
        - libfabric/1.15.2.0
      buildable: false
    cray-mpich:
      externals:
      - spec: cray-mpich@8.1.23
        prefix: /opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1
        modules:
        - libfabric/1.15.2.0
        - cray-mpich/8.1.23
      buildable: false
    libfabric:
      externals:
      - spec: libfabric@1.15.2.0
        prefix: /opt/cray/libfabric/1.15.2.0
        modules:
        - libfabric/1.15.2.0
      buildable: false
{% if e3sm_lapack %}
    cray-libsci:
      externals:
      - spec: cray-libsci@22.12.1.1
        prefix: /opt/cray/pe/libsci/22.12.1.1/GNU/9.1/x86_64
        modules:
        - cray-libsci/22.12.1.1
      buildable: false
{% endif %}
{% if e3sm_hdf5_netcdf %}
    hdf5:
      externals:
      - spec: hdf5@1.12.2.1~cxx+fortran+hl~java+mpi+shared
        prefix: /opt/cray/pe/hdf5-parallel/1.12.2.1/GNU/9.1
      buildable: false
    parallel-netcdf:
      externals:
      - spec: parallel-netcdf@1.12.3.1+cxx+fortran+pic+shared
        prefix: /opt/cray/pe/parallel-netcdf/1.12.3.1/GNU/9.1/
      buildable: false
    netcdf-c:
      externals:
      - spec: netcdf-c@4.9.0.1+mpi~parallel-netcdf
        prefix: /opt/cray/pe/netcdf-hdf5parallel/4.9.0.1/GNU/9.1
      buildable: false
    netcdf-fortran:
      externals:
      - spec: netcdf-fortran@4.9.0.1 ^netcdf-c+mpi
        prefix: /opt/cray/pe/netcdf-hdf5parallel/4.9.0.1/GNU/9.1
      - spec: netcdf-fortran@4.9.0.1 ^netcdf-c~mpi
        prefix: /opt/cray/pe/netcdf/4.9.0.1/GNU/9.1
      buildable: false
{% endif %}
{% if system_hdf5_netcdf %}
    hdf5:
      externals:
      - spec: hdf5@1.12.2.1~cxx+fortran+hl~java+mpi+shared
        prefix: /opt/cray/pe/hdf5-parallel/1.12.2.1/GNU/9.1
      - spec: hdf5@1.12.2.1~cxx+fortran+hl~java~mpi+shared
        prefix: /opt/cray/pe/hdf5/1.12.2.1/GNU/9.1
      buildable: false
    parallel-netcdf:
      externals:
      - spec: parallel-netcdf@1.12.3.1+cxx+fortran+pic+shared
        prefix: /opt/cray/pe/parallel-netcdf/1.12.3.1/GNU/9.1/
      buildable: false
    netcdf-c:
      externals:
      - spec: netcdf-c@4.9.0.1+mpi~parallel-netcdf
        prefix: /opt/cray/pe/netcdf-hdf5parallel/4.9.0.1/GNU/9.1
      - spec: netcdf-c@4.9.0.1~mpi~parallel-netcdf
        prefix: /opt/cray/pe/netcdf/4.9.0.1/GNU/9.1
      buildable: false
    netcdf-fortran:
      externals:
      - spec: netcdf-fortran@4.9.0.1 ^netcdf-c+mpi
        prefix: /opt/cray/pe/netcdf-hdf5parallel/4.9.0.1/GNU/9.1
      - spec: netcdf-fortran@4.9.0.1 ^netcdf-c~mpi
        prefix: /opt/cray/pe/netcdf/4.9.0.1/GNU/9.1
      buildable: false
{% endif %}
  config:
    install_missing_compilers: false
  compilers:
  - compiler:
      spec: gcc@11.2.0
      paths:
        cc: cc
        cxx: CC
        f77: ftn
        fc: ftn
      flags: {}
      operating_system: sles15
      target: x86_64
      modules:
      - PrgEnv-gnu/8.3.3
      - gcc/11.2.0
      - craype/2.7.19
      - libfabric/1.15.2.0
      environment:
        prepend_path:
          PKG_CONFIG_PATH: "/opt/cray/xpmem/2.5.2-2.4_3.45__gd0f7936.shasta/lib64/pkgconfig"
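The spack environment file is likewise a Jinja2 template, so it only parses as YAML after rendering. A hedged sketch that renders it and sanity-checks the external prefixes declared above (`specs` is filled with an empty string purely for illustration):

```python
import os

import yaml
from jinja2 import Template

with open('mache/spack/frontier_gnu_mpich.yaml') as handle:
    template = Template(handle.read())

# Flag names come from the template; render with system hdf5/netcdf
# modules and no extra specs.
rendered = template.render(e3sm_lapack=True, e3sm_hdf5_netcdf=False,
                           system_hdf5_netcdf=True, specs='')
env = yaml.safe_load(rendered)

# Report any declared external whose install prefix is absent.
for name, settings in env['spack']['packages'].items():
    for external in settings.get('externals', []):
        prefix = external.get('prefix')
        if prefix and not os.path.isdir(prefix):
            print(f'{name}: missing prefix {prefix}')
```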