Skip to content

Commit

Permalink
[Cylon] Scripts for scaling test (#670)
Browse files Browse the repository at this point in the history
* [Cylon] Scripts for scaling test

Signed-off-by: Arup Sarker <[email protected]>

* [Cylon] Update scripts and documentation

Signed-off-by: Arup Sarker <[email protected]>

---------

Signed-off-by: Arup Sarker <[email protected]>
  • Loading branch information
arupcsedu authored Aug 5, 2023
1 parent be9c5f6 commit d9c655c
Show file tree
Hide file tree
Showing 13 changed files with 1,168 additions and 32 deletions.
42 changes: 42 additions & 0 deletions rivanna/scripts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Helper targets for the Cylon / RADICAL-Pilot scaling experiments on Rivanna.
#
# bash is required because the rp/cy recipes use `source`.
SHELL=/bin/bash

# Every target below is a command, not a file to be built.
.PHONY: all login load clean rp cy q a qq i cancel

# EXECS is not defined in this file, so `all` is a no-op unless EXECS is
# supplied on the command line or from the environment.
all: ${EXECS}

# Request an interactive allocation (2 nodes x 4 tasks, 30 minutes).
login:
	ssh -tt rivanna "/opt/rci/bin/ijob --partition=parallel --account=bii_dsc_community --time=30:00 --ntasks-per-node=4 --nodes=2"

# Kept for backward compatibility. Each recipe line runs in its own shell, so
# this target cannot change the environment seen by any other recipe.
load:
	./load.sh

# Remove logs, generated SLURM scripts and RADICAL-Pilot leftovers.
# -f keeps the target from failing when nothing matches the globs.
clean:
	rm -f *.log *.err script-*.slurm
	rm -rf raptor-*.cfg rp.session.*

# Run the RADICAL-Pilot experiments. load.sh is sourced on the same recipe
# line so the modules and virtual environment are active for python.
rp:
	source ./load.sh && python rp-experiment-setup.py

# Run the bare-metal Cylon experiments (same environment handling as rp).
cy:
	source ./load.sh && python cylon-experiment-setup.py

# Show my queued/running jobs in a compact format.
q:
	squeue --format="%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R" --me

# Show all squeue fields for my jobs.
a:
	squeue --format="%all" --me

# Like q, refreshed continuously. The quotes are escaped so they survive
# until watch hands the command line to its own shell.
qq:
	watch squeue --format=\"%.18i %.9P %.50j %.8u %.8T %.10M %.9l %.6D %R\" --me

# Dump the output/error logs and count the "###" result lines.
i:
	cat out.log
	cat out.err
	grep -F "###" out.log | wc -l

# Cancel all of my jobs, then show what is left. The leading '-' tells make
# to continue even if a command fails.
cancel:
	-./cancel.sh
	-squeue -u ${USER}

51 changes: 23 additions & 28 deletions rivanna/scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,33 @@ pip install openssl-python
python3 -m pip install urllib3==1.26.6
```

2. Run the scripts in set of **compute nodes** as follows.
2. Edit the `combination` list in ```cylon-experiment-setup.py``` or ```rp-experiment-setup.py``` to change the experiment configuration.

```bash
#!/bin/bash
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=40
#SBATCH --exclusive
#SBATCH --time=1:00:00
#SBATCH --partition=bii
#SBATCH -A bii_dsc_community
#SBATCH --output=rivanna/scripts/cylogs/mpirun-96t-4n-160w-35m-%x-%j.out
#SBATCH --error=rivanna/scripts/cylogs/mpirun-96t-4n-160w-35m-%x-%j.err


module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7

#module load gcc/11.2.0
#module load openmpi/4.1.4
#module load python/3.11.1

#source $HOME/CYLON/bin/activate
source $HOME/cylon_rp_venv/bin/activate

BUILD_PATH=$PWD/build

export LD_LIBRARY_PATH=$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH
```
combination = [\
# (1,4, 5000, "parallel", "exclusive"), # always pending
(2,37, 1000000, "parallel", ""),
(4,37, 35000000, "parallel", ""),
(6,37, 35000000, "parallel", ""),
(8,37, 35000000, "parallel", ""),
(10,37, 35000000, "parallel", ""),
(12,37, 35000000, "parallel", ""),
(14,37, 35000000, "parallel", ""),
]
```


which python gcc g++
3. Load the required modules and activate the Python virtual environment.

```
module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
source /path_to_virtual_environment/cylon_rp_venv/bin/activate
```
4. Run the scripts as follows.

#srun -n 160 python $PWD/rivanna/scripts/cylon_scaling.py -n 35000000
mpirun -np 160 python rivanna/scripts/cylon_scaling.py -n 35000000
```bash
make clean # For cleaning
make rp # For radical pilot
make cy # For bare-metal Cylon

```
2 changes: 2 additions & 0 deletions rivanna/scripts/cancel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#! /bin/sh
# Cancel every SLURM job that belongs to the current user.
#
# squeue is asked for bare job ids (--noheader drops the title row and
# --format='%i' prints only the id), so no awk/tail post-processing is needed.
# `xargs -r` skips scancel entirely when the queue is empty instead of calling
# it without arguments, and ${USER:?} aborts if USER is unset or empty.
squeue --noheader --format='%i' -u "${USER:?USER is not set}" | xargs -r scancel
117 changes: 117 additions & 0 deletions rivanna/scripts/cylon-experiment-setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import sys
from textwrap import dedent
from cloudmesh.common.util import writefile
from cloudmesh.common.util import readfile
from cloudmesh.common.util import banner
from cloudmesh.common.console import Console

# Generate one SLURM batch script per (nodes, threads, rows) combination and,
# unless `debug` is set, submit each script with sbatch. The outcome of every
# submission is appended to submit.log.

counter = 0

# Set to True to only write the script-*.slurm files without submitting them.
debug = False

# (nodes, threads, rows, partition, "exclusive")
combination = [
    # (1, 4, 5000, "parallel", "exclusive"),  # always pending
    (2, 37, 1000000, "parallel", ""),
    (4, 37, 35000000, "parallel", ""),
    (6, 37, 35000000, "parallel", ""),
    (8, 37, 35000000, "parallel", ""),
    (10, 37, 35000000, "parallel", ""),
    (12, 37, 35000000, "parallel", ""),
    (14, 37, 35000000, "parallel", ""),
]

# Disabled alternative: sweep every nodes/threads pair.
# NOTE(review): these tuples have no `rows` field, so they would not unpack
# in the loop below without adding one.
# combination = []
# for nodes in range(0, 50):
#     for threads in range(0, 37):
#         combination.append((nodes + 1, threads + 1, "parallel", ""))

total = len(combination)

# "%j" is expanded by SLURM to the job id in the output/error file names.
jobid = "-%j"
# jobid = ""

with open("submit.log", "w") as f:
    for nodes, threads, rows, partition, exclusive in combination:
        counter = counter + 1

        # Translate the tuple flag into an optional #SBATCH line and a short
        # tag (e1/e0) that is embedded in the job name.
        if exclusive == "exclusive":
            exclusive = "#SBATCH --exclusive"
            e = "e1"
        else:
            exclusive = ""
            e = "e0"

        # Total number of tasks requested across all nodes.
        usable_threads = nodes * threads

        # Disabled: per-run raptor configuration generated from raptor.in.cfg.
        # cores_per_node = nodes * threads - 2
        # print (cores_per_node)
        # config = readfile("raptor.in.cfg")
        # config = config.replace("CORES_PER_NODE", str(cores_per_node))
        # config = config.replace("NO_OF_ROWS", str(rows))
        # print (config)
        # cfg_filename = f"raptor-{nodes}-{threads}.cfg"
        # writefile(cfg_filename, config)

        banner(f"SLURM {nodes} {threads} {counter}/{total}")

        # The f-string is formatted first and dedented afterwards; strip()
        # removes the leading newline so the shebang is the first line.
        # The partition comes from the combination tuple rather than being
        # hard-coded, so the tuple field actually takes effect.
        script = dedent(f"""
            #!/bin/bash
            #SBATCH --job-name=h-n={nodes:02d}-t={threads:02d}-e={e}
            #SBATCH --nodes={nodes}
            #SBATCH --ntasks-per-node={threads}
            #SBATCH --time=15:00
            #SBATCH --output=out-{nodes:02d}-{threads:02d}{jobid}.log
            #SBATCH --error=out-{nodes:02d}-{threads:02d}{jobid}.err
            #SBATCH --partition={partition}
            #SBATCH -A bii_dsc_community
            {exclusive}
            echo "..............................................................."
            module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
            echo "..............................................................."
            source /project/bii_dsc_community/djy8hg/cylon_rp_venv/bin/activate
            echo "..............................................................."
            BUILD_PATH=/project/bii_dsc_community/djy8hg/Project/cylon/build
            echo "..............................................................."
            export LD_LIBRARY_PATH=$BUILD_PATH/arrow/install/lib64:$BUILD_PATH/glog/install/lib64:$BUILD_PATH/lib64:$BUILD_PATH/lib:$LD_LIBRARY_PATH
            echo "..............................................................."
            which python gcc g++
            echo "..............................................................."
            lscpu
            echo "..............................................................."
            time srun --exact --nodes {nodes} --ntasks {usable_threads} python cylon_scaling.py -n {rows}
            echo "..............................................................."
            """).strip()

        print(script)
        filename = f"script-{nodes:02d}-{threads:02d}.slurm"
        writefile(filename, script)

        if not debug:
            r = os.system(f"sbatch {filename}")
            # Report the task count via usable_threads; the original code
            # reassigned `total` here, which corrupted the "counter/total"
            # progress shown by the banner on every later iteration.
            if r == 0:
                msg = f"{counter} submitted: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.ok(msg)
            else:
                msg = f"{counter} failed: nodes={nodes:02d} threads={threads:02d} total={usable_threads}"
                Console.error(msg)
            f.write(msg + "\n")
60 changes: 56 additions & 4 deletions rivanna/scripts/cylon_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from cloudmesh.common.Shell import Shell


def join(data=None):
def cylon_join(data=None):
StopWatch.start(f"join_total_{data['host']}_{data['rows']}_{data['it']}")

comm = MPI.COMM_WORLD
Expand Down Expand Up @@ -60,8 +60,57 @@ def join(data=None):
StopWatch.benchmark(tag=str(data))

env.finalize()

def cylon_sort(data=None):
    """Time a distributed Cylon ``sort_values`` over randomly generated rows.

    ``data`` is a dict read with the keys ``host``, ``rows``, ``it``,
    ``unique``, ``scaling`` and ``task``. With ``scaling == 'w'`` every rank
    generates ``rows`` rows; otherwise ``rows`` is divided by the world size.
    Rank 0 prints one ``###`` line per iteration and the StopWatch benchmark.
    """
    overall_timer = f"sort_total_{data['host']}_{data['rows']}_{data['it']}"
    StopWatch.start(overall_timer)

    comm = MPI.COMM_WORLD
    env = CylonEnv(config=MPIConfig(comm), distributed=True)

    u = data['unique']
    weak_scaling = data['scaling'] == 'w'

    if not weak_scaling:
        # strong scaling: the requested rows are split across the ranks
        max_val = data['rows']
        num_rows = int(data['rows'] / env.world_size)
    else:
        # weak scaling: every rank holds the requested number of rows
        num_rows = data['rows']
        max_val = num_rows * env.world_size

    # Seed with the rank so each process generates its own values.
    generator = default_rng(seed=env.rank)
    values = generator.integers(0, int(max_val * u), size=(num_rows, 2))
    df1 = DataFrame(pd.DataFrame(values).add_prefix("col"))

    is_root = env.rank == 0
    if is_root:
        print("Task# ", data['task'])

    for i in range(data['it']):
        env.barrier()
        step_timer = f"sort_{i}_{data['host']}_{data['rows']}_{data['it']}"
        StopWatch.start(step_timer)

        began = time.time()
        df3 = df1.sort_values(by=[0], env=env)
        env.barrier()
        elapsed = time.time() - began

        sum_t = comm.reduce(elapsed)
        tot_l = comm.reduce(len(df3))

        if is_root:
            avg_t = sum_t / env.world_size
            print("### ", data['scaling'], env.world_size, num_rows, max_val, i, avg_t, tot_l)
            # NOTE(review): the per-iteration timer is stopped on rank 0 only;
            # confirm this nesting against the original file's indentation.
            StopWatch.stop(step_timer)

    StopWatch.stop(overall_timer)

    if is_root:
        StopWatch.benchmark(tag=str(data))

    env.finalize()


def cylon_slice(data=None):
StopWatch.start(f"slice_total_{data['host']}_{data['rows']}_{data['it']}")

comm = MPI.COMM_WORLD
Expand Down Expand Up @@ -113,6 +162,7 @@ def slice(data=None):

env.finalize()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="weak scaling")
parser.add_argument('-n', dest='rows', type=int, required=True)
Expand All @@ -123,9 +173,11 @@ def slice(data=None):

args = vars(parser.parse_args())
args['host'] = "rivanna"
for i in range(160):
for i in range(1):
args['task'] = i
join(args)
#cylon_slice(args)
#cylon_join(args)
cylon_sort(args)

# os.system(f"{git} branch | fgrep '*' ")
# os.system(f"{git} rev-parse HEAD")
2 changes: 2 additions & 0 deletions rivanna/scripts/load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Environment for the Cylon scaling experiments: compiler/MPI/Python modules
# plus the Python virtual environment.
#
# Source this file (". ./load.sh"); running it as a separate process cannot
# change the environment of the calling shell.
#
# Set CYLON_VENV to use a different virtual environment; it defaults to the
# shared project environment.
module load gcc/9.2.0 openmpi/3.1.6 cmake/3.23.3 python/3.7.7
source "${CYLON_VENV:-/project/bii_dsc_community/djy8hg/cylon_rp_venv}/bin/activate"
55 changes: 55 additions & 0 deletions rivanna/scripts/raptor.in.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
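# resource configuration (CORES_PER_NODE and NO_OF_ROWS are template placeholders that must be replaced with numbers before this file is used)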
"cores_per_node" : CORES_PER_NODE,
"gpus_per_node" : 0,
"no_of_rows" : NO_OF_ROWS,
# raptor configuration
"n_masters" : 1,
"n_workers" : 1,
"masters_per_node" : 1,
"nodes_per_worker" : 1,

# extra nodes for non-raptor rp tasks
"nodes_rp" : 1,
# extra resources for the rp agent (optional)
"nodes_agent" : 0,

# pilot runtime in min
"runtime" : 60,

# task configuration
"cores_per_task" : 1,
"sleep" : 3,
# These are used as the range of the for loops for defining and submitting
# non-raptor and raptor tasks, respectively.
"tasks_rp" : 1,
"tasks_raptor" : 1,

"pilot_descr": {
"resource" : "uva.rivanna",
"runtime" : 60,
"access_schema": "interactive",
"queue" : "parallel",
"project" : "bii_dsc_community"
},

"master_descr": {
"mode" : "raptor.master",
"named_env" : "cylon_rp_venv",
"executable" : "./raptor_master.py"
},

"worker_descr": {
"mode" : "raptor.worker",
"named_env" : "cylon_rp_venv",
"pre_exec" : ["module load gcc/9.2.0",
"module load openmpi/3.1.6",
"module load python/3.7.7",
"export LD_LIBRARY_PATH=$HOME/rc_arup/cylon/build/arrow/install/lib64:$HOME/rc_arup/cylon/build/glog/install/lib64:$HOME/rc_arup/cylon/build/lib64:$HOME/rc_arup/cylon/build/lib:$LD_LIBRARY_PATH"
],

# custom worker class
"raptor_class" : "MyWorker",
"raptor_file" : "./raptor_worker.py"
}
}
Loading

0 comments on commit d9c655c

Please sign in to comment.