Skip to content

Commit

Permalink
[Radical-Cylon] Summit build and test (cylondata#685)
Browse files Browse the repository at this point in the history
* [Radical-Cylon] Summit build and test

Signed-off-by: Arup Sarker <[email protected]>

* Update README.md

* Update README.md

* Update README.md

* Update rp-experiment-setup.py

* Update rp-experiment-setup.py

* Update bkp_script-02-42.sh

* Update rp-experiment-setup.py

* Update rp-experiment-setup.py

---------

Signed-off-by: Arup Sarker <[email protected]>
  • Loading branch information
arupcsedu authored Dec 5, 2023
1 parent dc454ff commit 03e0338
Show file tree
Hide file tree
Showing 83 changed files with 230,147 additions and 75 deletions.
2 changes: 1 addition & 1 deletion rivanna/scripts/rp-experiment-setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
echo "..............................................................."
export RADICAL_LOG_LVL="DEBUG"
export RADICAL_PROFILE="TRUE"
export RADICAL_PILOT_DBURL="mongodb://rct-tutorial:[email protected]:27017/rct-tutorial"
export RADICAL_PILOT_DBURL="mongodb://Your_mongo_db_url"
echo "..............................................................."
lscpu
echo "..............................................................."
Expand Down
8 changes: 4 additions & 4 deletions summit/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Running Cylon on Summit

Gregor von Laszewski ([email protected]), and Niranda Perera
Gregor von Laszewski ([email protected]), Niranda Perera, and Arup Sarker ([email protected], [email protected])

## Issues on Summit

Expand Down Expand Up @@ -51,7 +51,8 @@ cd cylon
git checkout summit
cd ~

module load python/3.7.7
module load python/3.7.7
module load gcc/9.3.0
python -m venv $HOME/CYLON
source $HOME/CYLON/bin/activate

Expand All @@ -65,7 +66,6 @@ pip install -U pytest-mpi
pip install cmake
pip install numpy

module load gcc/9.3.0

cd ~/cylon
rm -rf build
Expand Down Expand Up @@ -161,4 +161,4 @@ A script that changes the parameter of n is located in the scripts dir and can b

```bash
sh ./benchmark-summit.sh 2>&1 | tee -a summit.log
```
```
42 changes: 42 additions & 0 deletions summit/rp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
SHELL=/bin/bash

# Every target below is a command shortcut rather than a file to build, so
# all of them are declared phony.
.PHONY: all login load clean rp cy q a qq i cancel

# EXECS is not defined in this Makefile, so `make all` currently does nothing.
all: ${EXECS}

# Run ijob on the ssh host "rivanna" with the partition/account options below.
login:
	ssh -tt rivanna "/opt/rci/bin/ijob --partition=parallel --account=bii_dsc_community --time=30:00 --ntasks-per-node=4 --nodes=2"

load:
	./load.sh

# Remove generated logs, job scripts and Radical-Pilot session data.
# Both commands use -f so that cleaning an already-clean directory succeeds.
clean:
	rm -f *.log *.err script-*.slurm cylonrun-* script-*.sh script-*.lsf
	rm -rf raptor-*.cfg rp.session.*

# Generate and launch the Radical-Pilot experiments.
rp:
	python rp-experiment-setup.py

# Generate and submit the plain Cylon experiments (runs ./load.sh first).
cy: load
	python cylon-experiment-setup.py

# NOTE(review): --format=... and --me look like squeue options; LSF's bjobs
# documents -o for output formatting. Confirm that q, a and qq work on Summit.
q:
	bjobs --format="%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R" --me

a:
	bjobs --format="%all" --me

# Continuously watch the current user's jobs. The escaped quotes are passed
# through to the shell that `watch` spawns, keeping the format one argument.
qq:
	watch bjobs -u ${USER} --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\"

# Show the job output and count the lines carrying the "###" marker.
# The count goes through `wc -l` (not `grep -c`) so that zero matches does not
# make the recipe fail.
i:
	cat out.log
	cat out.err
	grep -F "###" out.log | wc -l

# Cancel the user's jobs, then list what is left. The leading "-" makes make
# ignore a failure of either command.
cancel:
	- ./cancel.sh
	- bjobs -u ${USER}

40 changes: 40 additions & 0 deletions summit/rp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
1. Load module and activate the python virtual environment

```
module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
source /path_to_virtual_environment/cylon_rp_venv/bin/activate
```

2. Install Cloudmesh and Radical-Pilot

```
pip install cloudmesh-common
pip install openssl-python
python3 -m pip install urllib3==1.26.6
pip install radical.pilot
```

3. Edit `cylon-experiment-setup.py` or `rp-experiment-setup.py` to change the experiment configuration, for example the `combination` list:

```
combination = [\
# (1,4, 5000, "parallel", "exclusive"), # always pending
(2,37, 1000000, "parallel", ""),
(4,37, 35000000, "parallel", ""),
(6,37, 35000000, "parallel", ""),
(8,37, 35000000, "parallel", ""),
(10,37, 35000000, "parallel", ""),
(12,37, 35000000, "parallel", ""),
(14,37, 35000000, "parallel", ""),
]
```


4. Run the scripts as follows.

```bash
make clean # For cleaning
make rp # For radical pilot
make cy # For bare-metal Cylon

```
36 changes: 36 additions & 0 deletions summit/rp/bkp_script-02-42.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Backup copy of the run script for the 2-node Radical-Pilot scaling case.
# It sets the RADICAL_* environment, extends LD_LIBRARY_PATH with the Cylon
# build directories and runs rp_scaling.py with raptor-2-42.cfg.
#
# The batch directives below are written as "##BSUB", so they are plain
# comments here. NOTE(review): presumably disabled on purpose so the script
# can be run directly; restore them as "#BSUB" to submit through LSF.

##BSUB -P gen150_bench
##BSUB -W 0:45
##BSUB -nnodes 2
##BSUB -alloc_flags smt1
##BSUB -J cylonrun-w-2
##BSUB -o cylonrun-w-2.%J
##BSUB -e cylonrun-w-2.%J

#module load python/3.7.7
#source $HOME/CYLON/bin/activate

#module load gcc/9.3.0
echo "..............................................................."
export RADICAL_LOG_LVL="DEBUG"
export RADICAL_PROFILE="TRUE"
# Placeholder value: replace with the real MongoDB URL before running.
export RADICAL_PILOT_DBURL="mongodb://You_Mongodb_url"
echo "..............................................................."
lscpu
echo "..............................................................."
BUILD_PATH="$HOME/project/dev/cylon/build"
export LD_LIBRARY_PATH="$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH"


# The marker must be quoted: an unquoted word starting with '#' begins a
# comment, so the original line printed only an empty line.
echo "################## 2 case"
time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
#-n 50000000 -s w


# Disabled second run, kept for reference: ((0)) is always false.
if ((0)); then
  time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
  #-n 50000000 -s w
fi
#time python rp_scaling.py raptor-2-42.cfg
echo "..............................................................."
2 changes: 2 additions & 0 deletions summit/rp/cancel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#! /bin/sh
# Cancel every Slurm job owned by the invoking user.
#
# scancel filters by user itself, so there is no need to parse the squeue
# table, and nothing is run with an empty argument list when no jobs exist.
# ${USER:?...} aborts with a message instead of calling scancel with an
# empty user name.
#
# NOTE(review): this uses Slurm commands, while the Makefile in this
# directory queries jobs with bjobs (LSF) - confirm which scheduler applies.
scancel -u "${USER:?USER is not set}"
117 changes: 117 additions & 0 deletions summit/rp/cylon-experiment-setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import sys
from textwrap import dedent
from cloudmesh.common.util import writefile
from cloudmesh.common.util import readfile
from cloudmesh.common.util import banner
from cloudmesh.common.console import Console

# Generates one Slurm batch script per entry of `combination`, writes it to
# script-<nodes>-<threads>.slurm and, unless `debug` is set, submits it with
# sbatch. The outcome of every submission is appended to submit.log.

counter = 0

# Set to True to only generate the batch scripts without submitting them.
# debug = True
debug = False

# partition = "bii-gpu"
partition = "parallel"


# (nodes, threads, rows, partition, "exclusive")
combination = [
    # (1,4, 5000, "parallel", "exclusive"), # always pending
    # (2,37, 1000000, "parallel", ""),
    (4, 37, 35000000, "parallel", ""),
    # (6,37, 35000000, "parallel", ""),
    # (8,37, 35000000, "parallel", ""),
    # (10,37, 35000000, "parallel", ""),
    # (12,37, 35000000, "parallel", ""),
    # (14,37, 35000000, "parallel", ""),
]

# Disabled alternative that sweeps all node/thread combinations:
# combination = []
# for nodes in range(0,50):
#     for threads in range(0,37):
#         combination.append((nodes+1, threads+1, "parallel", ""))

total = len(combination)
jobid = "-%j"
# jobid=""

# The context manager closes submit.log even if script generation or
# submission raises part-way through.
with open("submit.log", "w") as f:
    # NOTE(review): `partition` is unpacked from each tuple, but the generated
    # script hard-codes --partition=parallel; confirm whether it should be used.
    for counter, (nodes, threads, rows, partition, exclusive) in enumerate(combination, start=1):

        if exclusive == "exclusive":
            exclusive_line = "#SBATCH --exclusive"
            e = "e1"
        else:
            exclusive_line = ""
            e = "e0"

        # Number of tasks passed to srun, and the value reported as total=...
        # in the log message. Kept separate from `total` (the number of
        # combinations) so the banner's "counter/total" stays correct on every
        # iteration.
        usable_threads = nodes * threads

        # Disabled raptor config generation, kept for reference:
        # cores_per_node = nodes * threads - 2
        # print (cores_per_node)
        # config = readfile("raptor.in.cfg")
        # config = config.replace("CORES_PER_NODE", str(cores_per_node))
        # config = config.replace("NO_OF_ROWS", str(rows))
        # print (config)
        # cfg_filename = f"raptor-{nodes}-{threads}.cfg"
        # writefile(cfg_filename, config)

        banner(f"SLURM {nodes} {threads} {counter}/{total}")
        # dedent() + strip() remove the source indentation and the leading
        # newline so that "#!/bin/bash" is the first line of the file.
        script = dedent(f"""
        #!/bin/bash
        #SBATCH --job-name=h-n={nodes:02d}-t={threads:02d}-e={e}
        #SBATCH --nodes={nodes}
        #SBATCH --ntasks-per-node={threads}
        #SBATCH --time=15:00
        #SBATCH --output=out-{nodes:02d}-{threads:02d}{jobid}.log
        #SBATCH --error=out-{nodes:02d}-{threads:02d}{jobid}.err
        #SBATCH --partition=parallel
        #SBATCH -A bii_dsc_community
        {exclusive_line}
        echo "..............................................................."
        module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
        echo "..............................................................."
        source /project/bii_dsc_community/djy8hg/cylon_rp_venv/bin/activate
        echo "..............................................................."
        BUILD_PATH=/project/bii_dsc_community/djy8hg/Project/cylon/build
        echo "..............................................................."
        export LD_LIBRARY_PATH=$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH
        echo "..............................................................."
        which python gcc g++
        echo "..............................................................."
        lscpu
        echo "..............................................................."
        time srun --exact --nodes {nodes} --ntasks {usable_threads} python cylon_scaling.py -n {rows}
        echo "..............................................................."
        """).strip()

        print(script)
        filename = f"script-{nodes:02d}-{threads:02d}.slurm"
        writefile(filename, script)

        if not debug:
            # os.system returns 0 when sbatch accepted the job.
            r = os.system(f"sbatch {filename}")
            if r == 0:
                msg = f"{counter} submitted: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.ok(msg)
            else:
                msg = f"{counter} failed: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.error(msg)
            # Only written when a submission was attempted, so `msg` is
            # always defined here.
            f.writelines([msg, "\n"])
Loading

0 comments on commit 03e0338

Please sign in to comment.