Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Updated wrap_rrdesi to fix multiple use cases. #2429

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.9', '3.10'] # fuji+guadalupe, not ready for 3.11 yet?
astropy-version: ['==5.0', '<6'] # fuji+guadalupe, latest
fitsio-version: ['==1.1.6', '<2'] # fuji+guadalupe, latest
Expand Down Expand Up @@ -54,13 +54,13 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.10'] # latest
astropy-version: ['<6'] # latest
fitsio-version: ['<2'] # latest
numpy-version: ['<1.23'] # to keep asscalar, used by astropy
env:
DESIUTIL_VERSION: 3.4.2
DESIUTIL_VERSION: 3.4.3
DESIMODEL_DATA: branches/test-0.18

steps:
Expand All @@ -78,7 +78,7 @@ jobs:
python -m pip install pytest pytest-cov coveralls
python -m pip install git+https://github.com/desihub/desiutil.git@${DESIUTIL_VERSION}#egg=desiutil
python -m pip install -r requirements.txt
python -m pip install specutils
python -m pip install specutils\<1.15
python -m pip install -U 'numpy${{ matrix.numpy-version }}'
python -m pip install -U 'astropy${{ matrix.astropy-version }}'
python -m pip cache remove fitsio
Expand All @@ -98,7 +98,7 @@ jobs:
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.10']
steps:
- name: Checkout code
Expand Down Expand Up @@ -127,7 +127,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.9']

steps:
Expand All @@ -151,7 +151,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.9']
env:
DESIUTIL_VERSION: 3.3.0
Expand Down Expand Up @@ -180,7 +180,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
python-version: ['3.9']

steps:
Expand Down
31 changes: 29 additions & 2 deletions bin/wrap_rrdesi
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@ from desispec.scripts import qsoqn, qsomgii, emlinefit
# MPI environment availability
have_mpi = None
if nersc_login_node():
have_mpi = False
print ("wrap_rrdesi should not be run on a login node.")
sys.exit(0)
else:
have_mpi = True
try:
import mpi4py.MPI as MPI
except ImportError:
have_mpi = False
print ("MPI not available")
print ("MPI not available - required to run wrap_rrdesi")
sys.exit(0)

parser = argparse.ArgumentParser(allow_abbrev=False)
Expand Down Expand Up @@ -61,6 +62,18 @@ afterburners = args.afterburners
comm = MPI.COMM_WORLD
comm_rank = comm.rank

#print ("COMM", comm.size, comm.rank)
env = os.environ
if not 'SLURM_STEP_RESV_PORTS' in os.environ and comm.rank == 0:
print ("WARNING: Detected that wrap_rrdesi is not being run with srun command.")
print ("WARNING: Calling directly can lead to under-utilizing resources.")
print ("Recommended syntax: srun -N nodes -n tasks -c 2 --gpu-bind=map_gpu:3,2,1,0 ./wrap_rrdesi [options]")
print ("\tEx: 8 tasks each with GPU support on 2 nodes:")
print ("\t\tsrun -N 2 -n 8 -c 2 --gpu-bind=map_gpu:3,2,1,0 wrap_rrdesi ...")
print ("\tEx: 64 tasks on 1 node and 4 GPUs - this will run on both GPU and non-GPU nodes at once:")
print ("\t\tsrun -N 1 -n 64 -c 2 --gpu-bind=map_gpu:3,2,1,0 wrap_rrdesi ...")


#Get the number of nodes from the SLURM_NNODES environment variable
nhosts = os.getenv('SLURM_NNODES')
if nhosts is None:
Expand All @@ -84,11 +97,21 @@ if args.gpu:
gpu_per_node = int(gpu_per_node)
ngpu = gpu_per_node*nhosts

if ngpu > comm.size:
if comm.rank == 0:
print (f"WARNING: wrap_rrdesi was called with {ngpu} GPUs but only {comm.size} MPI ranks.")
print (f"WARNING: Will only use {comm.size} GPUs.")
ngpu = comm.size

#Select the GPU ranks
#The first gpu_per_node ranks on each host use a GPU
ranks_per_host = comm.size // nhosts
use_gpu = (comm_rank % ranks_per_host) < gpu_per_node
ncpu_ranks = (comm.size - ngpu -1) // cpu_per_task + 1
#if comm.rank == 0:
# print (f'{ngpu=}, {gpu_per_node=}, {nhosts=}')
# print (f'{ranks_per_host=}, {use_gpu=}, {ncpu_ranks=}')
# print (f'{comm.size=}, {comm_rank=}, {cpu_per_task=}')
if args.gpuonly:
ncpu_ranks = 0

Expand Down Expand Up @@ -119,6 +142,7 @@ if use_gpu:
else:
myhost = ngpu + (comm.rank - gpu_per_node*(comm.rank // ranks_per_host)) // cpu_per_task
subcomm = comm.Split(myhost)
#print (f'{comm.rank=}, {ncomm=}, {myhost=}, {subcomm.size=}')

if comm.rank == 0:
print("Running "+str(len(inputfiles))+" input files on "+str(ngpu)+" GPUs and "+str(ncomm)+" total procs...")
Expand All @@ -127,6 +151,8 @@ if comm.rank == 0:
# In --gpuonly mode, CPU procs will not enter this block
if myhost < ncomm:
myfiles = np.array_split(inputfiles, ncomm)[myhost]
nfiles = len(myfiles)
#print (f'DEBUG: {myhost=} {ncomm=} {nfiles=} {myfiles=}, {comm.rank=}')
for infile in myfiles:
redrockfile = os.path.join(outdir, os.path.basename(infile).replace('coadd-', 'redrock-'))
if os.path.isfile(redrockfile) and not overwrite:
Expand All @@ -145,6 +171,7 @@ if myhost < ncomm:
opts.extend(args_to_pass)
if use_gpu:
opts.append('--gpu')
print (f'Running rrdesi on {myhost=} {subcomm.rank=} with options {opts=}')
desi.rrdesi(opts, comm=subcomm)

# optionally run all the afterburners
Expand Down
Loading