# Notes

## Startup Files

In your `{.bashrc,.zshrc}`, you can define a helper that downloads and sources `ezpz`'s `utils.sh`:

```bash
ezpz_setup_alcf() {
    file=$(mktemp)
    curl -Ls https://raw.githubusercontent.com/saforem2/ezpz/main/src/ezpz/bin/utils.sh > "${file}"
    echo "Saving 'utils.sh' to ${file} and sourcing..."
    source "${file}" || return 1
    setup_alcf
}
```
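
If you'd rather have this run only inside a batch job (instead of on every login shell), the call can be guarded on `PBS_NODEFILE`; a small sketch, assuming the function above is already defined:

```bash
# Only bootstrap the job environment when we're actually inside a PBS job
# (the scheduler sets PBS_NODEFILE on the job's head node).
if [[ -n "${PBS_NODEFILE:-}" && -f "${PBS_NODEFILE:-}" ]]; then
    ezpz_setup_alcf
fi
```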

```bash
# Pick a machine name from the hostname
# (the branch assignments below are examples; adjust to taste)
hn=$(hostname)
if [[ "${hn}" == x3* || "${hn}" == polaris* ]]; then
  machine="polaris"
elif [[ "${hn}" == x4* || "${hn}" == aurora* ]]; then
  machine="aurora"
elif [[ "${hn}" == x1* || "${hn}" == uan* ]]; then
  machine="sunspot"
elif [[ "${hn}" == bastion* ]]; then
  machine="bastion"
else
  machine="${hn}"
fi

# Machine-specific bin/, shell history, and zsh completion cache
MACHINE=$(echo "${machine}" | tr '[:upper:]' '[:lower:]')
export PATH="${HOME}/bin/${MACHINE}:${PATH}"
export HISTFILE="$HOME/.zsh_history-${MACHINE}"
export ZSH_COMPDUMP="${ZSH}/cache/.zcompdump-${MACHINE}"
```
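
With the startup files above in place, a quick sanity check from inside a job is to call the helper and run something trivial on every rank; a minimal sketch (the `launch` alias is defined by `setup_alcf`, as shown in the transcripts below):

```bash
# From a compute node inside a PBS job:
ezpz_setup_alcf                   # fetch utils.sh and run setup_alcf
launch hostname | sort | uniq -c  # one line per rank; counts should equal NGPU_PER_HOST
```
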
Alternatively, to set things up manually from a clone of the repo:

  1. Clone `ezpz` + navigate into it:

     ```bash
     git clone https://github.com/saforem2/ezpz
     cd ezpz
     ```
  2. Source `src/ezpz/bin/utils.sh`:

     ```bash
     #[🌌][01:16:07 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
     $ export PBS_O_WORKDIR=$(pwd) && source src/ezpz/bin/utils.sh
     Using WORKING_DIR: /home/foremans/2024-07-10-131541/ezpz
     ```
  3. Set up python:

     ```bash
     #[🌌][01:16:17 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
     $ setup_python
     No conda_prefix OR virtual_env found in environment...
     Setting up conda...
     machine name: aurora

     The following have been reloaded with a version change:
       1) intel_compute_runtime/release/821.36 => intel_compute_runtime/release/803.29
       2) oneapi/eng-compiler/2024.04.15.002 => oneapi/release/2024.1

     Found conda at: /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1
     No VIRTUAL_ENV found in environment!
         - Trying to setup from /opt/aurora/24.086.0/frameworks/aurora_nre_models_frameworks-2024.1
         - Using VENV_DIR=/home/foremans/2024-07-10-131541/ezpz/venvs/aurora_nre_models_frameworks-2024.1

         - Creating a new virtual env on top of aurora_nre_models_frameworks-2024.1 in /home/foremans/2024-07-10-131541/ezpz/venvs/aurora_nre_models_frameworks-2024.1
     [python] Using: /home/foremans/2024-07-10-131541/ezpz/venvs/aurora_nre_models_frameworks-2024.1/bin/python3
     ```
  4. Set up the ALCF job environment (a usage sketch follows this list):

     1. Via the bash function `setup_alcf`:

     ```bash
     #[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
     #[🌌][01:16:45 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450 [⏱ 13s]
     $ setup_alcf
    
     [ezpz/bin/utils.sh]
    
     [2024-07-10-131719]
         • USER=foremans
         • MACHINE=aurora
         • HOST=x4017c4s5b0n0

     [setupHost]
         • Using hostfile: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
         • Found in environment:
             • HOSTFILE: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
             • Writing PBS vars to: /home/foremans/.pbsenv

     [save_pbs_env]
         • Using:
             • hostfile: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
             • jobenv_file: /home/foremans/.pbsenv
           to calculate:
             • num_hosts: 2
             • num_gpus_per_host: 12
             • num_gpus: 24
             • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16
         • Setting:
             • HOSTFILE: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
             • JOBENV_FILE: /home/foremans/.pbsenv

     [HOSTS]
         • [host:0] - x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov
         • [host:1] - x4017c4s6b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov

     [DIST INFO]
         • HOSTFILE=/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
         • NHOSTS=2
         • NGPU_PER_HOST=12
         • NGPUS=24
         • DIST_LAUNCH=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16

     [LAUNCH]:
         • To launch across all available GPUs, use: launch
           launch = mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16
     ```
    
     2. Via python, with `python3 -m ezpz.jobs`:

     ```bash
     #[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
     #[🌌][01:20:20 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
     $ python3 -m ezpz.jobs
     2024-07-10 13:21:51,992 - numexpr.utils - INFO - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
     2024-07-10 13:21:51,992 - numexpr.utils - INFO - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
     2024-07-10 13:21:51,992 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
     /home/foremans/2024-07-10-131541/ezpz/venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.7.3' currently installed).
       from pandas.core.computation.check import NUMEXPR_INSTALLED
     [2024-07-10 13:21:54.534132][INFO][__init__:156] - Setting logging level to 'INFO' on 'RANK == 0'
     [2024-07-10 13:21:54.537096][INFO][__init__:157] - Setting logging level to 'CRITICAL' on all others 'RANK != 0'
     [2024-07-10 13:21:54.537529][INFO][__init__:160] - To disable this behavior, and log from ALL ranks (not recommended), set: 'export LOG_FROM_ALL_RANKS=1'  in your environment, and re-run.
     [2024-07-10 13:21:54.564493][INFO][dist:95] -
    
     [dist_info]:
       • FRAMEWORK=pytorch
       • DEVICE=xpu
       • DEVICE_ID=xpu:0
       • DISTRIBUTED_BACKEND=ccl
       • GPUS_PER_NODE=12
       • HOSTS=['x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov', 'x4017c4s6b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov']
       • HOSTFILE=/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
       • HOSTNAME=x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov
       • LOCAL_RANK=0
       • MACHINE=Aurora
       • NUM_NODES=2
       • NGPUS=24
       • NGPUS_AVAILABLE=24
       • NODE_ID=0
       • RANK=0
       • SCHEDULER=PBS
       • WORLD_SIZE_TOTAL=24
       • WORLD_SIZE_IN_USE=1
       • LAUNCH_CMD=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16
    
    
     [2024-07-10 13:21:54.591833][INFO][jobs:164] - Saving job env to /home/foremans/PBS-jobs/698077/.jobenv
     [2024-07-10 13:21:54.596525][INFO][jobs:164] - Saving job env to /home/foremans/2024-07-10-131541/ezpz/.jobenv
     [2024-07-10 13:21:54.613725][INFO][jobs:354] - Caught pbs_jobid='698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov', pbs_nodefile=PosixPath('/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov') from env. Saving jobenv!
     [2024-07-10 13:21:54.655237][WARNING][jobs:144] - /home/foremans/PBS-jobs/698077  already in /home/foremans/PBS-jobs.log,  not appending !!
     [2024-07-10 13:21:54.655766][INFO][jobs:369] - Writing PBS env vars to  /home/foremans/PBS-jobs/698077 / jobenv{.sh, .yaml, .json}
     [2024-07-10 13:21:54.666092][INFO][jobs:241] - Saving job env to /home/foremans/PBS-jobs/698077/jobenv.sh
     [2024-07-10 13:21:54.682342][INFO][jobs:258] - Saving job env to /home/foremans/PBS-jobs/698077/jobenv.json
     [2024-07-10 13:21:54.700122][INFO][jobs:271] - Saving job env to /home/foremans/PBS-jobs/698077/jobenv.yaml
     [2024-07-10 13:21:54.707680][CRITICAL][jobs:381] -
     Run:
    
         source .jobenv
    
     to set these environment variables.
    
     6.59s user 8.17s system 16% cpu 1:27.78s total
     ```
    
    The resulting `.jobenv`:

    ```bash
    #[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
    #[🌌][01:21:58 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450 [⏱ 1m27s]
    $ cat .jobenv
    #!/bin/bash --login
    FRAMEWORK="pytorch"
    DEVICE="xpu"
    DEVICE_ID="xpu:0"
    DISTRIBUTED_BACKEND="ccl"
    GPUS_PER_NODE="12"
    HOSTS="[x4017c4s5b0n0, x4017c4s6b0n0]"
    HOSTFILE="/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"
    HOSTNAME="x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov"
    LOCAL_RANK="0"
    MACHINE="Aurora"
    NUM_NODES="2"
    NGPUS="24"
    NGPUS_AVAILABLE="24"
    NODE_ID="0"
    RANK="0"
    SCHEDULER="PBS"
    WORLD_SIZE_TOTAL="24"
    WORLD_SIZE_IN_USE="1"
    LAUNCH_CMD="mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16"
    PBS_O_HOME="/home/foremans"
    PBS_O_LANG="en_US.UTF-8"
    PBS_O_LOGNAME="foremans"
    PBS_O_PATH="/home/foremans/micromamba/condabin:/home/foremans/homebrew/bin:/home/foremans/homebrew/sbin:/home/foremans/bin/aurora:/opt/cray/pals/1.3.3/bin:/opt/cray/libfabric/1.15.2.0/bin:/opt/aurora/24.086.0/support/tools/gpu_validation:/opt/aurora/24.086.0/intel-gpu-umd/821.36/bin:/opt/aurora/24.086.0/CNDA/mpich/20231026/mpich-ofi-all-icc-default-pmix-gpu-drop20231026/bin:/opt/aurora/24.086.0/support/tools/mpi_wrapper_utils:/opt/aurora/24.086.0/CNDA/oneapi/dpcpp-ct/eng-20240227/bin:/opt/aurora/24.086.0/oneapi/advisor/latest/bin64:/opt/aurora/24.086.0/oneapi/vtune/latest/bin64:/opt/aurora/24.086.0/oneapi/debugger/latest/opt/debugger/bin:/opt/aurora/24.086.0/CNDA/oneapi/mkl/develop_20240229/bin:/opt/aurora/24.086.0/CNDA/oneapi/compiler/eng-20240227/bin:/opt/aurora/24.086.0/spack/gcc/0.7.0/install/linux-sles15-x86_64/gcc-12.2.0/gcc-12.2.0-jf4ov3v3scg7dvd76qhsuugl3jp42gfn/bin:/opt/clmgr/sbin:/opt/clmgr/bin:/opt/sgi/sbin:/opt/sgi/bin:/home/foremans/.local/bin:/home/foremans/bin:/usr/local/bin:/usr/bin:/bin:/opt/c3/bin:/usr/lib/mit/bin:/usr/lib/mit/sbin:/opt/pbs/bin:/sbin:/home/foremans/.local/share/kitty-ssh-kitten/kitty/bin:/home/foremans/.cargo/bin:/home/foremans/.fzf/bin:/home/foremans/.luarocks/bin"
    PBS_O_MAIL="/var/spool/mail/foremans"
    PBS_O_SHELL="/bin/zsh"
    PBS_O_TZ="America/Chicago"
    PBS_O_HOST="aurora-uan-0009.hostmgmt1000.cm.aurora.alcf.anl.gov"
    PBS_O_WORKDIR="/home/foremans/2024-07-10-131541/ezpz"
    PBS_O_SYSTEM="Linux"
    PBS_O_QUEUE="lustre_scaling"
    PBS_JOBID_SHORT="698077.aurora"
    PBS_HOOK_RESOURCES="eJydUNFuwyAM/KFNIixtoyHe9gl9tyhxEhYCzECr/P2cpZO67W0SD9h3vjs7I12RYEQ7RxiyblTeO1Nc8EfjarzrYXAe85oLLlnfKU/fw8p4H29grI01FLAT92EwzldC1tnRgKMp7oqwlZa/MWzYzVAPXOIYadVvLlvCDTO03sGyJvwFXCoFoE1DC2UrEbLtg+7zSyeOx/bQHmQn1JTsAm4xI2obl1QLZ6gUyUDBXEAK2YqTkGcpxaFpoD/JoZOtCjbVrNvmqIIDwhwrWdT7pAqxR1u0VJFPtMXhIMkbJmTOUJBUovjOFEjkIrmyMpfi5n0xduZr+q8L+10lzy7BBYOdFkNzZrESi/GPOwl146q4BbXoXoXgpz4qVoR/7rcP/3H+BG1nxmU="
    PBS_JOBNAME="STDIN"
    PBS_JOBID="698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"
    PBS_QUEUE="lustre_scaling"
    PBS_JOBCOOKIE="5D073B7E1C16CA8D16018CC9224570E3"
    PBS_NODENUM="0"
    PBS_TASKNUM="1"
    PBS_MOMPORT="15003"
    PBS_NODEFILE="/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"
    PBS_ACCOUNT="Aurora_Deployment"
    PBS_JOBDIR="/home/foremans"
    PBS_ENVIRONMENT="PBS_INTERACTIVE"
    NHOSTS="2"
    NGPU_PER_HOST="12"
    BACKEND="ccl"
    alias launch="mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16"
    echo "$(which launch)"
    ```
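
Either route ends the same way: source the generated `.jobenv` to restore the job variables (and the `launch` alias), then use `launch` to run across the whole allocation. A minimal sketch (`ezpz.test_dist` is just an example target; substitute your own script):

```bash
source .jobenv                     # restores HOSTFILE, NGPUS, ..., and the `launch` alias
launch python3 -m ezpz.test_dist   # e.g.: launch python3 train.py --your-args
```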

## Custom hostfile

### Bash

```bash
#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:25:25 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
$ head -1 "$PBS_NODEFILE" > nodefile && cat nodefile
x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov

#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:25:28 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
$ setup_alcf nodefile

[ezpz/bin/utils.sh]

[2024-07-10-132537]
    • USER=foremans
    • MACHINE=aurora
    • HOST=x4017c4s5b0n0

[setupHost]
    • Caught 1 arguments
    • Caught 1 arguments
    • hostfile=nodefile
        • Writing PBS vars to: /home/foremans/.pbsenv

[save_pbs_env]
    • Caught 1 arguments

    • Caught hostfile != PBS_NODEFILE
        • hostfile: nodefile
        • PBS_NODEFILE: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov

    • Using:
        • hostfile: nodefile
        • jobenv_file: /home/foremans/.pbsenv
      to calculate:
        • num_hosts: 1
        • num_gpus_per_host: 12
        • num_gpus: 12
        • DIST_LAUNCH: mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16
    • Setting:
        • HOSTFILE: nodefile
        • JOBENV_FILE: /home/foremans/.pbsenv

[HOSTS]
    • [host:0] - x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov

[DIST INFO]
    • HOSTFILE=nodefile
    • NHOSTS=1
    • NGPU_PER_HOST=12
    • NGPUS=12
    • DIST_LAUNCH=mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16

[LAUNCH]:
    • To launch across all available GPUs, use: launch
      launch = mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16
```
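
The same pattern works for any subset of the allocation: write whichever hosts you want into a file and hand that file to `setup_alcf`. A sketch (taking the first `N` hosts is arbitrary; any selection works):

```bash
# Keep only the first N hosts of the current job in a custom hostfile
N=1
head -n "${N}" "${PBS_NODEFILE}" > nodefile
setup_alcf nodefile   # launch / DIST_LAUNCH now target only those N hosts
```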

### Python

```bash
#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:26:10 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450 [⏱ 6s]
$ python3 -m ezpz.jobs --hostfile nodefile
2024-07-10 13:26:41,045 - numexpr.utils - INFO - Note: detected 208 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-07-10 13:26:41,045 - numexpr.utils - INFO - Note: NumExpr detected 208 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-07-10 13:26:41,045 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
/home/foremans/2024-07-10-131541/ezpz/venvs/aurora_nre_models_frameworks-2024.1/lib/python3.9/site-packages/pandas/core/computation/expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.7.3' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
[2024-07-10 13:26:41.973940][INFO][__init__:156] - Setting logging level to 'INFO' on 'RANK == 0'
[2024-07-10 13:26:41.976941][INFO][__init__:157] - Setting logging level to 'CRITICAL' on all others 'RANK != 0'
[2024-07-10 13:26:41.977373][INFO][__init__:160] - To disable this behavior, and log from ALL ranks (not recommended), set: 'export LOG_FROM_ALL_RANKS=1'  in your environment, and re-run.
[2024-07-10 13:26:41.990751][WARNING][dist:1127] - Mismatch in `ngpus_in_use` and `ngpus_available` ngpus_in_use=12 vs. ngpus_available=24
[2024-07-10 13:26:41.991378][INFO][dist:95] -

[dist_info]:
  • FRAMEWORK=pytorch
  • DEVICE=xpu
  • DEVICE_ID=xpu:0
  • DISTRIBUTED_BACKEND=ccl
  • GPUS_PER_NODE=12
  • HOSTS=['x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov']
  • HOSTFILE=/home/foremans/2024-07-10-131541/ezpz/nodefile
  • HOSTNAME=x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov
  • LOCAL_RANK=0
  • MACHINE=Aurora
  • NUM_NODES=1
  • NGPUS=12
  • NGPUS_AVAILABLE=24
  • NODE_ID=0
  • RANK=0
  • SCHEDULER=PBS
  • WORLD_SIZE_TOTAL=24
  • WORLD_SIZE_IN_USE=1
  • LAUNCH_CMD=mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16


[2024-07-10 13:26:41.999545][WARNING][dist:1127] - Mismatch in `ngpus_in_use` and `ngpus_available` ngpus_in_use=12 vs. ngpus_available=24
[2024-07-10 13:26:42.002941][WARNING][dist:1127] - Mismatch in `ngpus_in_use` and `ngpus_available` ngpus_in_use=12 vs. ngpus_available=24
[2024-07-10 13:26:42.017104][WARNING][dist:1127] - Mismatch in `ngpus_in_use` and `ngpus_available` ngpus_in_use=12 vs. ngpus_available=24
[2024-07-10 13:26:42.017647][INFO][jobs:164] - Saving job env to /home/foremans/PBS-jobs/698077/.jobenv
[2024-07-10 13:26:42.022741][INFO][jobs:164] - Saving job env to /home/foremans/2024-07-10-131541/ezpz/.jobenv
[2024-07-10 13:26:42.027785][CRITICAL][jobs:381] -
Run:

    source .jobenv

to set these environment variables.

6.55s user 7.58s system 218% cpu 6.474s total
#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:26:54 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450 [⏱ 6s]
$ cat .jobenv
#!/bin/bash --login
FRAMEWORK="pytorch"
DEVICE="xpu"
DEVICE_ID="xpu:0"
DISTRIBUTED_BACKEND="ccl"
GPUS_PER_NODE="12"
HOSTS="[x4017c4s5b0n0]"
HOSTFILE="nodefile"
HOSTNAME="x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov"
LOCAL_RANK="0"
MACHINE="Aurora"
NUM_NODES="1"
NGPUS="12"
NGPUS_AVAILABLE="24"
NODE_ID="0"
RANK="0"
SCHEDULER="PBS"
WORLD_SIZE_TOTAL="24"
WORLD_SIZE_IN_USE="1"
LAUNCH_CMD="mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16"
PBS_O_HOME="/home/foremans"
PBS_O_LANG="en_US.UTF-8"
PBS_O_LOGNAME="foremans"
PBS_O_PATH="/home/foremans/micromamba/condabin:/home/foremans/homebrew/bin:/home/foremans/homebrew/sbin:/home/foremans/bin/aurora:/opt/cray/pals/1.3.3/bin:/opt/cray/libfabric/1.15.2.0/bin:/opt/aurora/24.086.0/support/tools/gpu_validation:/opt/aurora/24.086.0/intel-gpu-umd/821.36/bin:/opt/aurora/24.086.0/CNDA/mpich/20231026/mpich-ofi-all-icc-default-pmix-gpu-drop20231026/bin:/opt/aurora/24.086.0/support/tools/mpi_wrapper_utils:/opt/aurora/24.086.0/CNDA/oneapi/dpcpp-ct/eng-20240227/bin:/opt/aurora/24.086.0/oneapi/advisor/latest/bin64:/opt/aurora/24.086.0/oneapi/vtune/latest/bin64:/opt/aurora/24.086.0/oneapi/debugger/latest/opt/debugger/bin:/opt/aurora/24.086.0/CNDA/oneapi/mkl/develop_20240229/bin:/opt/aurora/24.086.0/CNDA/oneapi/compiler/eng-20240227/bin:/opt/aurora/24.086.0/spack/gcc/0.7.0/install/linux-sles15-x86_64/gcc-12.2.0/gcc-12.2.0-jf4ov3v3scg7dvd76qhsuugl3jp42gfn/bin:/opt/clmgr/sbin:/opt/clmgr/bin:/opt/sgi/sbin:/opt/sgi/bin:/home/foremans/.local/bin:/home/foremans/bin:/usr/local/bin:/usr/bin:/bin:/opt/c3/bin:/usr/lib/mit/bin:/usr/lib/mit/sbin:/opt/pbs/bin:/sbin:/home/foremans/.local/share/kitty-ssh-kitten/kitty/bin:/home/foremans/.cargo/bin:/home/foremans/.fzf/bin:/home/foremans/.luarocks/bin"
PBS_O_MAIL="/var/spool/mail/foremans"
PBS_O_SHELL="/bin/zsh"
PBS_O_TZ="America/Chicago"
PBS_O_HOST="aurora-uan-0009.hostmgmt1000.cm.aurora.alcf.anl.gov"
PBS_O_WORKDIR="/home/foremans/2024-07-10-131541/ezpz"
PBS_O_SYSTEM="Linux"
PBS_O_QUEUE="lustre_scaling"
PBS_JOBID_SHORT="698077.aurora"
PBS_HOOK_RESOURCES="eJydUNFuwyAM/KFNIixtoyHe9gl9tyhxEhYCzECr/P2cpZO67W0SD9h3vjs7I12RYEQ7RxiyblTeO1Nc8EfjarzrYXAe85oLLlnfKU/fw8p4H29grI01FLAT92EwzldC1tnRgKMp7oqwlZa/MWzYzVAPXOIYadVvLlvCDTO03sGyJvwFXCoFoE1DC2UrEbLtg+7zSyeOx/bQHmQn1JTsAm4xI2obl1QLZ6gUyUDBXEAK2YqTkGcpxaFpoD/JoZOtCjbVrNvmqIIDwhwrWdT7pAqxR1u0VJFPtMXhIMkbJmTOUJBUovjOFEjkIrmyMpfi5n0xduZr+q8L+10lzy7BBYOdFkNzZrESi/GPOwl146q4BbXoXoXgpz4qVoR/7rcP/3H+BG1nxmU="
PBS_JOBNAME="STDIN"
PBS_JOBID="698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"
PBS_QUEUE="lustre_scaling"
PBS_JOBCOOKIE="5D073B7E1C16CA8D16018CC9224570E3"
PBS_NODENUM="0"
PBS_TASKNUM="1"
PBS_MOMPORT="15003"
PBS_NODEFILE="/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"
PBS_ACCOUNT="Aurora_Deployment"
PBS_JOBDIR="/home/foremans"
PBS_ENVIRONMENT="PBS_INTERACTIVE"
NHOSTS="1"
NGPU_PER_HOST="12"
BACKEND="ccl"
alias launch="mpiexec --verbose --envall -n 12 -ppn 12 --hostfile nodefile --cpu-bind depth -d 16"
echo "$(which launch)"
```

To reset after using a custom hostfile, `unset` the `hostfile` / `HOSTFILE` variables and re-run `setup_alcf`:

```bash
#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:27:39 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
$ unset hostfile HOSTFILE

#[aurora_nre_models_frameworks-2024.1](👻 aurora_nre_models_frameworks-2024.1)
#[🌌][01:27:41 PM][foremans@x4017c4s5b0n0][…/ezpz][🌱 jobs-rewrite]via ⨁ v1.3.450
$ setup_alcf

[ezpz/bin/utils.sh]

[2024-07-10-132744]
    • USER=foremans
    • MACHINE=aurora
    • HOST=x4017c4s5b0n0

[setupHost]
    • Using hostfile: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
    • Found in environment:
        • Writing PBS vars to: /home/foremans/.pbsenv

[save_pbs_env]
    • Using:
        • hostfile: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
        • jobenv_file: /home/foremans/.pbsenv
      to calculate:
        • num_hosts: 2
        • num_gpus_per_host: 12
        • num_gpus: 24
        • DIST_LAUNCH: mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16
    • Setting:
        • HOSTFILE: /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
        • JOBENV_FILE: /home/foremans/.pbsenv

[HOSTS]
    • [host:0] - x4017c4s5b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov
    • [host:1] - x4017c4s6b0n0.hostmgmt2017.cm.aurora.alcf.anl.gov

[DIST INFO]
    • HOSTFILE=/var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov
    • NHOSTS=2
    • NGPU_PER_HOST=12
    • NGPUS=24
    • DIST_LAUNCH=mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16

[LAUNCH]:
    • To launch across all available GPUs, use: launch
      launch = mpiexec --verbose --envall -n 24 -ppn 12 --hostfile /var/spool/pbs/aux/698077.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov --cpu-bind depth -d 16
```
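
If you switch between a custom hostfile and the full allocation often, the two steps above can be wrapped in a pair of small helpers; a convenience sketch (the function names here are made up for illustration):

```bash
use_hosts() {    # run on only the first N hosts of the current job (default: 1)
    local n="${1:-1}"
    head -n "${n}" "${PBS_NODEFILE}" > nodefile
    setup_alcf nodefile
}

reset_hosts() {  # go back to the full PBS allocation
    unset hostfile HOSTFILE
    setup_alcf
}
```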