-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Radical-Cylon] Summit build and test (#685)
* [Radical-Cylon] Summit build and test Signed-off-by: Arup Sarker <[email protected]> * Update README.md * Update README.md * Update README.md * Update rp-experiment-setup.py * Update rp-experiment-setup.py * Update bkp_script-02-42.sh * Update rp-experiment-setup.py * Update rp-experiment-setup.py --------- Signed-off-by: Arup Sarker <[email protected]>
- Loading branch information
Showing
83 changed files
with
230,147 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,7 +85,7 @@ | |
echo "..............................................................." | ||
export RADICAL_LOG_LVL="DEBUG" | ||
export RADICAL_PROFILE="TRUE" | ||
export RADICAL_PILOT_DBURL="mongodb://rct-tutorial:[email protected]:27017/rct-tutorial" | ||
export RADICAL_PILOT_DBURL="mongodb://Your_mongo_db_url" | ||
echo "..............................................................." | ||
lscpu | ||
echo "..............................................................." | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
# Running Cylon on Summit | ||
|
||
Gregor von Laszewski ([email protected]), and Niranda Perera | ||
Gregor von Laszewski ([email protected]), Niranda Perera, and Arup Sarker ([email protected], [email protected]) | ||
|
||
## Issues on Summit | ||
|
||
|
@@ -51,7 +51,8 @@ cd cylon | |
git checkout summit | ||
cd ~ | ||
|
||
module load python/3.7.7 | ||
module load python/3.7.7 | ||
module load gcc/9.3.0 | ||
python -m venv $HOME/CYLON | ||
source $HOME/CYLON/bin/activate | ||
|
||
|
@@ -65,7 +66,6 @@ pip install -U pytest-mpi | |
pip install cmake | ||
pip install numpy | ||
|
||
module load gcc/9.3.0 | ||
|
||
cd ~/cylon | ||
rm -rf build | ||
|
@@ -161,4 +161,4 @@ A script that changes the parameter of n is located in the scripts dir and can b | |
|
||
```bash | ||
sh ./benchmark-summit.sh 2>&1 | tee -a summit.log | ||
``` | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
SHELL=/bin/bash

# Every target below is a command alias rather than a file to build, so all
# of them are declared phony. The names from the original list that have no
# rule in this file are kept so that invoking them behaves as before.
.PHONY: all login load clean rp cy q a qq i cancel \
	image-singularity image-docker project

# Default goal. EXECS is not defined in this file, so this does nothing
# unless EXECS is supplied on the command line or via the environment.
all: ${EXECS}

# Open an interactive two-node allocation on rivanna through ijob.
login:
	ssh -tt rivanna "/opt/rci/bin/ijob --partition=parallel --account=bii_dsc_community --time=30:00 --ntasks-per-node=4 --nodes=2"

load:
	./load.sh

# Remove generated logs, batch scripts and Radical-Pilot artifacts.
# -f on the second rm keeps `make clean` from failing when no raptor-*.cfg
# or rp.session.* entries exist (an unmatched glob is passed through literally).
clean:
	rm -f *.log *.err script-*.slurm cylonrun-* script-*.sh script-*.lsf
	rm -rf raptor-*.cfg rp.session.*

# Generate and submit the Radical-Pilot experiments.
rp:
	python rp-experiment-setup.py

# Generate and submit the plain Cylon experiments.
cy: load
	python cylon-experiment-setup.py

# Show the current user's LSF jobs in a compact format.
q:
	bjobs --format="%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R" --me

# Show the current user's LSF jobs with every available field.
a:
	bjobs --format="%all" --me

# Continuously watch the invoking user's jobs. The escaped quotes reach
# `watch` as literal characters, so the format string is re-quoted when
# watch hands the command line to its own shell.
qq:
	watch bjobs -u ${USER} --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\"

# Dump the job output and count the lines carrying a "###" marker.
# The pipeline ends in wc so the recipe still succeeds when the count is 0.
i:
	cat out.log
	cat out.err
	grep -F "###" out.log | wc -l

# Cancel the user's jobs; the leading '-' tells make to ignore failures.
cancel:
	- ./cancel.sh
	- bjobs -u ${USER}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
1. Load the modules and activate the Python virtual environment | ||
|
||
``` | ||
module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7 | ||
source /path_to_virtual_environment/cylon_rp_venv/bin/activate | ||
``` | ||
|
||
2. Install Cloudmesh and Radical-Pilot | ||
|
||
``` | ||
pip install cloudmesh-common | ||
pip install openssl-python | ||
python3 -m pip install urllib3==1.26.6 | ||
pip install radical.pilot | ||
``` | ||
|
||
3. Edit the `combination` list in ```cylon-experiment-setup.py``` or ```rp-experiment-setup.py``` to change the experiment configuration. | ||
|
||
``` | ||
combination = [\ | ||
# (1,4, 5000, "parallel", "exclusive"), # always pending | ||
(2,37, 1000000, "parallel", ""), | ||
(4,37, 35000000, "parallel", ""), | ||
(6,37, 35000000, "parallel", ""), | ||
(8,37, 35000000, "parallel", ""), | ||
(10,37, 35000000, "parallel", ""), | ||
(12,37, 35000000, "parallel", ""), | ||
(14,37, 35000000, "parallel", ""), | ||
] | ||
``` | ||
|
||
|
||
4. Run the scripts as follows. | ||
|
||
```bash | ||
make clean # For cleaning | ||
make rp # For radical pilot | ||
make cy # For bare-metal Cylon | ||
|
||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/bin/bash
#
# Runs the Radical-Pilot Cylon scaling benchmark (rp_scaling.py) with the
# raptor-2-42.cfg configuration, printing CPU details and timing to stdout.
#
# Before running, replace the RADICAL_PILOT_DBURL placeholder below with a
# real MongoDB URL.

# NOTE(review): the directives below use a doubled '#', so they read as plain
# comments; presumably LSF ignores them in this form - confirm before relying
# on any of these settings.
##BSUB -P gen150_bench
##BSUB -W 0:45
##BSUB -nnodes 2
##BSUB -alloc_flags smt1
##BSUB -J cylonrun-w-2
##BSUB -o cylonrun-w-2.%J
##BSUB -e cylonrun-w-2.%J

#module load python/3.7.7
#source $HOME/CYLON/bin/activate

#module load gcc/9.3.0
echo "..............................................................."
export RADICAL_LOG_LVL="DEBUG"
export RADICAL_PROFILE="TRUE"
export RADICAL_PILOT_DBURL="mongodb://You_Mongodb_url"
echo "..............................................................."
lscpu
echo "..............................................................."
BUILD_PATH="$HOME/project/dev/cylon/build"
# Put the locally built Arrow, glog and Cylon libraries ahead of whatever is
# already on the loader path.
export LD_LIBRARY_PATH="$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH"

# The marker must be quoted: an unquoted '#' begins a comment, which would
# make echo print an empty line instead of the marker.
echo "################## 2 case"
time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
#-n 50000000 -s w

# Disabled second run; change ((0)) to ((1)) to execute it.
if ((0)); then
  time python "$HOME/project/dev/cylon/summit/rp/rp_scaling.py" raptor-2-42.cfg
  #-n 50000000 -s w
fi
#time python rp_scaling.py raptor-2-42.cfg
echo "..............................................................."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#! /bin/sh
# Cancel every SLURM job owned by the invoking user.
#
# squeue -h suppresses the header row and -o '%i' prints only the job ID, so
# no awk/tail post-processing is needed. xargs -r skips scancel entirely when
# the queue is empty instead of invoking it with no arguments.
squeue -h -u "$USER" -o '%i' | xargs -r scancel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import os | ||
import sys | ||
from textwrap import dedent | ||
from cloudmesh.common.util import writefile | ||
from cloudmesh.common.util import readfile | ||
from cloudmesh.common.util import banner | ||
from cloudmesh.common.console import Console | ||
|
||
# Generates one SLURM batch script per entry in `combination`, writes it to
# script-<nodes>-<threads>.slurm, submits it with sbatch (unless `debug` is
# set) and records the outcome of every submission in submit.log.

counter = 0

# Set to True to only generate and print the batch scripts without submitting.
debug = False

partition = "parallel"

# (nodes, threads, rows, partition, "exclusive")
# NOTE(review): the partition field is unpacked in the loop below but the
# generated script hard-codes --partition=parallel; confirm whether the
# per-entry value is meant to be used.
combination = [
    # (1, 4, 5000, "parallel", "exclusive"),  # always pending
    # (2, 37, 1000000, "parallel", ""),
    (4, 37, 35000000, "parallel", ""),
    # (6, 37, 35000000, "parallel", ""),
    # (8, 37, 35000000, "parallel", ""),
    # (10, 37, 35000000, "parallel", ""),
    # (12, 37, 35000000, "parallel", ""),
    # (14, 37, 35000000, "parallel", ""),
]

# Disabled full sweep, kept for reference. Its tuples have four fields and
# would need a rows value before they could be unpacked by the loop below.
# combination = []
# for nodes in range(0, 50):
#     for threads in range(0, 37):
#         combination.append((nodes + 1, threads + 1, "parallel", ""))

total = len(combination)
jobid = "-%j"
# jobid = ""

# The context manager closes submit.log even if generation or submission raises.
with open("submit.log", "w") as f:
    for nodes, threads, rows, partition, exclusive in combination:
        counter = counter + 1

        # Translate the tuple flag into an SBATCH directive and a job-name tag.
        if exclusive == "exclusive":
            exclusive = "#SBATCH --exclusive"
            e = "e1"
        else:
            exclusive = ""
            e = "e0"

        # Total number of tasks across all nodes; passed to srun and logged.
        usable_threads = nodes * threads

        # Disabled raptor configuration generation, kept for reference.
        # cores_per_node = nodes * threads - 2
        # print(cores_per_node)
        # config = readfile("raptor.in.cfg")
        # config = config.replace("CORES_PER_NODE", str(cores_per_node))
        # config = config.replace("NO_OF_ROWS", str(rows))
        # print(config)
        # cfg_filename = f"raptor-{nodes}-{threads}.cfg"
        # writefile(cfg_filename, config)

        banner(f"SLURM {nodes} {threads} {counter}/{total}")
        script = dedent(f"""
            #!/bin/bash
            #SBATCH --job-name=h-n={nodes:02d}-t={threads:02d}-e={e}
            #SBATCH --nodes={nodes}
            #SBATCH --ntasks-per-node={threads}
            #SBATCH --time=15:00
            #SBATCH --output=out-{nodes:02d}-{threads:02d}{jobid}.log
            #SBATCH --error=out-{nodes:02d}-{threads:02d}{jobid}.err
            #SBATCH --partition=parallel
            #SBATCH -A bii_dsc_community
            {exclusive}
            echo "..............................................................."
            module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
            echo "..............................................................."
            source /project/bii_dsc_community/djy8hg/cylon_rp_venv/bin/activate
            echo "..............................................................."
            BUILD_PATH=/project/bii_dsc_community/djy8hg/Project/cylon/build
            echo "..............................................................."
            export LD_LIBRARY_PATH=$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH
            echo "..............................................................."
            which python gcc g++
            echo "..............................................................."
            lscpu
            echo "..............................................................."
            time srun --exact --nodes {nodes} --ntasks {usable_threads} python cylon_scaling.py -n {rows}
            echo "..............................................................."
            """).strip()

        print(script)
        filename = f"script-{nodes:02d}-{threads:02d}.slurm"
        writefile(filename, script)

        if not debug:
            r = os.system(f"sbatch {filename}")
            # The task count is reported through usable_threads rather than by
            # reassigning `total`, which holds the number of combinations shown
            # in the banner.
            if r == 0:
                msg = f"{counter} submitted: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.ok(msg)
            else:
                msg = f"{counter} failed: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.error(msg)
            f.write(msg + "\n")
Oops, something went wrong.