Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Radical-Cylon] Summit build and test #685

Merged
merged 9 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rivanna/scripts/rp-experiment-setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
echo "..............................................................."
export RADICAL_LOG_LVL="DEBUG"
export RADICAL_PROFILE="TRUE"
export RADICAL_PILOT_DBURL="mongodb://rct-tutorial:[email protected]:27017/rct-tutorial"
export RADICAL_PILOT_DBURL="mongodb://Your_mongo_db_url"
echo "..............................................................."
lscpu
echo "..............................................................."
Expand Down
8 changes: 4 additions & 4 deletions summit/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Running Cylon on Summit

Gregor von Laszewski ([email protected]), and Niranda Perera
Gregor von Laszewski ([email protected]), Niranda Perera, and Arup Sarker ([email protected], [email protected])

## Issues on Summit

Expand Down Expand Up @@ -51,7 +51,8 @@ cd cylon
git checkout summit
cd ~

module load python/3.7.7
module load python/3.7.7
module load gcc/9.3.0
python -m venv $HOME/CYLON
source $HOME/CYLON/bin/activate

Expand All @@ -65,7 +66,6 @@ pip install -U pytest-mpi
pip install cmake
pip install numpy

module load gcc/9.3.0

cd ~/cylon
rm -rf build
Expand Down Expand Up @@ -161,4 +161,4 @@ A script that changes the parameter of n is located in the scripts dir and can b

```bash
sh ./benchmark-summit.sh 2>&1 | tee -a summit.log
```
```
42 changes: 42 additions & 0 deletions summit/rp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Convenience targets for generating, submitting, monitoring and cleaning up
# the Cylon / Radical-Pilot scaling experiments in this directory.
SHELL=/bin/bash

# None of these targets creates a file named after itself, so mark every one
# of them phony; otherwise a stray file called e.g. "rp" or "clean" would
# silently disable the target.
.PHONY: all login load clean rp cy q a qq i cancel

# EXECS is not defined in this Makefile, so `all` does nothing unless EXECS is
# provided on the command line or through the environment.
all: ${EXECS}

# Open an interactive allocation through the "rivanna" ssh host.
login:
	ssh -tt rivanna "/opt/rci/bin/ijob --partition=parallel --account=bii_dsc_community --time=30:00 --ntasks-per-node=4 --nodes=2"

load:
	./load.sh

# Remove generated job scripts, logs, configs and session directories.
# -f / -rf keep `make clean` from failing when there is nothing to remove.
clean:
	rm -f *.log *.err script-*.slurm cylonrun-* script-*.sh script-*.lsf
	rm -rf raptor-*.cfg rp.session.*

# Generate and submit the Radical-Pilot experiments.
rp:
	python rp-experiment-setup.py

# Generate and submit the plain Cylon experiments (runs ./load.sh first).
cy: load
	python cylon-experiment-setup.py

# Show the current user's jobs.
# NOTE(review): the format string and --me look like squeue options; confirm
# that bjobs accepts them on the target system.
q:
	bjobs --format="%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R" --me

a:
	bjobs --format="%all" --me


# Continuously watch the current user's jobs. The quotes are escaped so that
# they survive until `watch` re-parses the command line.
qq:
	watch bjobs -u ${USER} --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\"

# Dump the job output and count the "###" marker lines in it.
i:
	cat out.log
	cat out.err
	grep -F "###" out.log | wc -l


# Cancel all jobs; the leading '-' lets make continue even if a step fails.
cancel:
	- ./cancel.sh
	- bjobs -u ${USER}

40 changes: 40 additions & 0 deletions summit/rp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
1. Load the required modules and activate the Python virtual environment

```
module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
source /path_to_virtual_environment/cylon_rp_venv/bin/activate
```

2. Install Cloudmesh and Radical-Pilot

```
pip install cloudmesh-common
pip install openssl-python
python3 -m pip install urllib3==1.26.6
pip install radical.pilot
```

3. Edit `cylon-experiment-setup.py` or `rp-experiment-setup.py` to change the experiment configuration.

```
combination = [\
# (1,4, 5000, "parallel", "exclusive"), # always pending
(2,37, 1000000, "parallel", ""),
(4,37, 35000000, "parallel", ""),
(6,37, 35000000, "parallel", ""),
(8,37, 35000000, "parallel", ""),
(10,37, 35000000, "parallel", ""),
(12,37, 35000000, "parallel", ""),
(14,37, 35000000, "parallel", ""),
]
```


4. Run the scripts as follows.

```bash
make clean # For cleaning
make rp # For radical pilot
make cy # for bare-metal Cylon

```
36 changes: 36 additions & 0 deletions summit/rp/bkp_script-02-42.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Backup copy of the batch script that runs the Radical-Pilot scaling
# experiment (rp_scaling.py) with the raptor-2-42.cfg configuration.
#
# Before running, set RADICAL_PILOT_DBURL below to a real MongoDB URL.

# Scheduler directives; presumably disabled by the doubled '#'.
##BSUB -P gen150_bench
##BSUB -W 0:45
##BSUB -nnodes 2
##BSUB -alloc_flags smt1
##BSUB -J cylonrun-w-2
##BSUB -o cylonrun-w-2.%J
##BSUB -e cylonrun-w-2.%J

#module load python/3.7.7
#source $HOME/CYLON/bin/activate

#module load gcc/9.3.0
echo "..............................................................."
export RADICAL_LOG_LVL="DEBUG"
export RADICAL_PROFILE="TRUE"
# Placeholder value: replace with the URL of your own MongoDB instance.
export RADICAL_PILOT_DBURL="mongodb://You_Mongodb_url"
echo "..............................................................."
lscpu
echo "..............................................................."
BUILD_PATH="$HOME/project/dev/cylon/build"
# Append the previous LD_LIBRARY_PATH only when it is non-empty; a trailing
# ':' would add an empty entry, which the loader treats as the current
# directory.
export LD_LIBRARY_PATH="$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"


# The marker must be quoted: an unquoted '#' starts a comment, so the
# original printed only an empty line instead of the "###" marker.
echo "################## 2 case"
time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
#-n 50000000 -s w


# Disabled second run, kept for reference; ((0)) is always false.
if ((0)); then
  time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
  #-n 50000000 -s w
fi
#time python rp_scaling.py raptor-2-42.cfg
echo "..............................................................."
2 changes: 2 additions & 0 deletions summit/rp/cancel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#! /bin/sh
# Cancel every Slurm job that belongs to the current user.
#
# scancel filters by user itself, so there is no need to parse the squeue
# table; this also avoids calling scancel with no arguments when the queue
# is empty, and keeps $USER quoted.
#
# NOTE(review): the Makefile in this directory queries jobs with bjobs;
# confirm that Slurm is the scheduler this script should talk to.
scancel -u "$USER"
117 changes: 117 additions & 0 deletions summit/rp/cylon-experiment-setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import sys
from textwrap import dedent
from cloudmesh.common.util import writefile
from cloudmesh.common.util import readfile
from cloudmesh.common.util import banner
from cloudmesh.common.console import Console

# Generates one Slurm batch script per (nodes, threads, rows, partition,
# exclusive) combination, submits it with sbatch unless `debug` is set, and
# records the outcome of every submission in submit.log.

counter = 0

# When True the batch scripts are only generated and printed, not submitted.
# debug = True
debug = False

# Module-level default; the loop below rebinds `partition` per combination.
# partition = "bii-gpu"
partition = "parallel"


# (nodes, threads, rows, partition, "exclusive")
combination = [
    # (1,4, 5000, "parallel", "exclusive"), # always pending
    # (2,37, 1000000, "parallel", ""),
    (4, 37, 35000000, "parallel", ""),
    # (6,37, 35000000, "parallel", ""),
    # (8,37, 35000000, "parallel", ""),
    # (10,37, 35000000, "parallel", ""),
    # (12,37, 35000000, "parallel", ""),
    # (14,37, 35000000, "parallel", ""),
]

# Disabled alternative that sweeps all node/thread counts.
# NOTE(review): these tuples have no `rows` field, so they would not unpack
# in the loop below without being extended first.
# combination = []
# for nodes in range(0,50):
#     for threads in range(0,37):
#         combination.append((nodes+1, threads+1, "parallel", ""))

# Number of experiments; shown as "counter/total" in the banner and must not
# be reassigned inside the loop.
total = len(combination)
# Suffix for the output/error file names; Slurm expands %j to the job id.
jobid = "-%j"
# jobid=""

# The context manager closes submit.log even if a submission step raises.
with open("submit.log", "w") as f:
    for nodes, threads, rows, partition, exclusive in combination:
        counter = counter + 1

        # Translate the flag into the matching #SBATCH line and a short tag
        # that becomes part of the job name.
        if exclusive == "exclusive":
            exclusive = "#SBATCH --exclusive"
            e = "e1"
        else:
            exclusive = ""
            e = "e0"

        # Total number of tasks across all nodes.
        usable_threads = nodes * threads

        # Disabled: generation of the matching raptor config file.
        # cores_per_node = nodes * threads - 2
        # print (cores_per_node)
        # config = readfile("raptor.in.cfg")
        # config = config.replace("CORES_PER_NODE", str(cores_per_node))
        # config = config.replace("NO_OF_ROWS", str(rows))
        # print (config)
        # cfg_filename = f"raptor-{nodes}-{threads}.cfg"
        # writefile(cfg_filename, config)

        banner(f"SLURM {nodes} {threads} {counter}/{total}")
        # dedent() removes the common indentation, strip() the surrounding
        # blank lines, so "#!/bin/bash" ends up on the first line.
        script = dedent(f"""
            #!/bin/bash
            #SBATCH --job-name=h-n={nodes:02d}-t={threads:02d}-e={e}
            #SBATCH --nodes={nodes}
            #SBATCH --ntasks-per-node={threads}
            #SBATCH --time=15:00
            #SBATCH --output=out-{nodes:02d}-{threads:02d}{jobid}.log
            #SBATCH --error=out-{nodes:02d}-{threads:02d}{jobid}.err
            #SBATCH --partition={partition}
            #SBATCH -A bii_dsc_community
            {exclusive}
            echo "..............................................................."
            module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
            echo "..............................................................."
            source /project/bii_dsc_community/djy8hg/cylon_rp_venv/bin/activate
            echo "..............................................................."
            BUILD_PATH=/project/bii_dsc_community/djy8hg/Project/cylon/build
            echo "..............................................................."
            export LD_LIBRARY_PATH=$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH
            echo "..............................................................."
            which python gcc g++
            echo "..............................................................."
            lscpu
            echo "..............................................................."
            time srun --exact --nodes {nodes} --ntasks {usable_threads} python cylon_scaling.py -n {rows}
            echo "..............................................................."
            """).strip()

        print(script)
        filename = f"script-{nodes:02d}-{threads:02d}.slurm"
        writefile(filename, script)

        if not debug:
            r = os.system(f"sbatch {filename}")
            # The message reports the task count under the label "total";
            # use usable_threads so the experiment count is left untouched.
            if r == 0:
                msg = f"{counter} submitted: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.ok(msg)
            else:
                msg = f"{counter} failed: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.error(msg)
            # Only log when a submission was attempted; `msg` does not exist
            # in debug mode.
            f.writelines([msg, "\n"])
Loading
Loading