diff --git a/nersc/README.md b/nersc/README.md new file mode 100644 index 0000000..1883948 --- /dev/null +++ b/nersc/README.md @@ -0,0 +1,84 @@ +# Run QML Benchmarks on Perlmutter + +## Setup Podman + +All of the following commands are to be executed on Perlmutter. + +Build podman image from dockerfile: +``` +podman-hpc build -f Dockerfile.ubu22-PennyLane -t tgermain/ubu22-pennylane > podman_build.out +``` + +**TODO:** Add command to install Ray in dockerfile + +... or install in new image +``` +podman-hpc run -it --name ray tgermain/ubu22-pennylane +# in the container +pip install ray +exit +# +podman-hpc commit ray tgermain/ubu22-pennylane-ray +``` + +Locally install `qml_benchmarks` with dependencies for development: +``` +mkdir qml-benchmarks-devel/nersc/local + +IMG=tgermain/ubu22-pennylane-ray +CFSH=/global/cfs/cdirs/m4693 # CFS home +REPO_DIR=$CFSH/qml-benchmarks-devel # qml-benchmark repo +LOCAL_DIR=$REPO_DIR/nersc/local # to store local python files +WORK_DIR=$REPO_DIR/nersc/ray/workdir # to store output files + +podman-hpc run -it \ + --volume $LOCAL_DIR:/root \ + --volume $REPO_DIR:/qml-benchmarks \ + --volume $WORK_DIR:/work_dir \ + -e HDF5_USE_FILE_LOCKING='FALSE' \ + --workdir /work_dir \ + $IMG bash + +# in the container +cd /qml-benchmarks +pip3 install --user . # install in /root/.local +``` + +**Note:** `pip3 install --user .` installs into `/root/.local`; `/root` is the host directory `$LOCAL_DIR` mounted into the container. 
+ +To make the image available on CPU/GPU nodes, migrate your image onto the `$SCRATCH` filesystem with: +``` +podman-hpc migrate tgermain/ubu22-pennylane-ray[:version] +``` +or make it available to everyone in the project: +``` +IMG=tgermain/ubu22-pennylane-ray +POD_PUB=$CFS/m4693/podman/ +podman-hpc --squash-dir $POD_PUB migrate $IMG +chmod -R a+rx $POD_PUB # to allow anyone to use this image +``` + +**TODO:** Check and update instructions about migrate for project + +## Test Podman + +``` bash + +IMG=tgermain/ubu22-pennylane-ray +CMD="python3 -u performance_indicators/perf_ind_variational.py --numFeatures 4 --inputPath performance_indicators/linearly_separable/" + +# Run container interactively with wrapper +./wrap_podman.sh $IMG "$CMD" +``` + +## Run jobs + +Run performance indicator (1 model: 100 training steps and prediction) +``` bash +sbatch submit_job_shared.slr +``` + +Run hyperparameter search +``` bash +sbatch submit_job_single.slr +``` diff --git a/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY.csv b/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY.csv new file mode 100644 index 0000000..b1e5fe5 --- /dev/null +++ b/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY.csv @@ -0,0 +1,2 @@ +construct_kernel_time,training_time,predict_time,hyperparameters +127.64588618278503,127.65117883682251,23.83331537246704,"{'repeats': 10, 'use_jax': False, 'vmap': True, 'jit': False, 'use_ray': True}" diff --git a/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY_packages.txt b/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY_packages.txt new file mode 100644 index 0000000..272679a --- /dev/null +++ b/nersc/performance_indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY_packages.txt 
@@ -0,0 +1,82 @@ +Package Version Editable project location +-------------------------- ----------- -------------------------------------------- +absl-py 2.1.0 +aiosignal 1.3.1 +appdirs 1.4.4 +astunparse 1.6.3 +attrs 23.2.0 +autograd 1.6.2 +autoray 0.6.12 +cachetools 5.4.0 +certifi 2024.7.4 +charset-normalizer 3.3.2 +chex 0.1.86 +click 8.1.7 +contourpy 1.2.1 +cycler 0.12.1 +diastatic-malt 2.15.2 +etils 1.9.2 +filelock 3.15.4 +flax 0.8.5 +fonttools 4.53.1 +frozenlist 1.4.1 +fsspec 2024.6.1 +future 1.0.0 +gast 0.6.0 +idna 3.7 +importlib_resources 6.4.0 +jax 0.4.23 +jaxlib 0.4.23 +joblib 1.4.2 +jsonschema 4.23.0 +jsonschema-specifications 2023.12.1 +kiwisolver 1.4.5 +markdown-it-py 3.0.0 +matplotlib 3.9.1 +mdurl 0.1.2 +ml-dtypes 0.4.0 +msgpack 1.0.8 +nersc-pymon 0.2.1 +nest-asyncio 1.6.0 +networkx 3.3 +numpy 1.26.4 +opt-einsum 3.3.0 +optax 0.2.3 +orbax-checkpoint 0.5.23 +packaging 24.1 +pandas 2.2.2 +PennyLane 0.37.0 +PennyLane-Catalyst 0.7.0 +PennyLane_Lightning 0.37.0 +PennyLane_Lightning_Kokkos 0.37.0 +pillow 10.4.0 +pip 23.2.1 +protobuf 5.27.2 +Pygments 2.18.0 +pyparsing 3.1.2 +python-dateutil 2.9.0.post0 +pytz 2024.1 +PyYAML 6.0.1 +qml_benchmarks 0.1 /global/cfs/cdirs/m4693/qml-benchmarks-devel +ray 2.33.0 +referencing 0.35.1 +requests 2.32.3 +rich 13.7.1 +rpds-py 0.19.1 +rustworkx 0.15.1 +scikit-learn 1.5.1 +scipy 1.12.0 +seaborn 0.13.2 +semantic-version 2.10.0 +setuptools 65.5.0 +six 1.16.0 +tensorstore 0.1.63 +termcolor 2.4.0 +threadpoolctl 3.5.0 +toml 0.10.2 +toolz 0.12.1 +typing_extensions 4.12.2 +tzdata 2024.1 +urllib3 2.2.2 +wheel 0.43.0 +zipp 3.19.2 diff --git a/nersc/performance_indicators/README_ray.md b/nersc/performance_indicators/README_ray.md new file mode 100644 index 0000000..96c4467 --- /dev/null +++ b/nersc/performance_indicators/README_ray.md @@ -0,0 +1,49 @@ + + +## Run with Python `venv` + +### `lightning-kokkos` from pypi wheels + +NOTE: `venv/qml_LK` is described in `single_circuits/README.md` + +Start interactive job on CPU node for 
testing +``` bash +salloc -q interactive -C cpu -t 0:30:00 -A m4693 + +# and execute in this interactive session: + +source /global/common/software/m4693/venv/qml_LK/bin/activate +cd nersc/ + +python3 -u performance_indicators/perf_ind_kernel.py --numFeatures 4 --inputPath performance_indicators/linearly_separable/ +``` + +Runtimes with Ray on interactive CPU node, dataset with 240x240 = 57,600 kernels +``` +> 57600 / 128 = 450 kernels per core +> default.qubit.jax +qubits 4 10 15 +real 1m33s 2m42 11m27 +user 0m46s 1m00 4m20 +sys 0m22s 0m28 2m21 +``` + +Start batch job on CPU node +``` bash +sbatch submit_job_single.slr +``` + +Runtime with Ray on batch CPU node, 15 qubits +``` +Job ID: 28820822 +Cluster: perlmutter +User/Group: tgermain/tgermain +State: COMPLETED (exit code 0) +Nodes: 1 +Cores per node: 256 +CPU Utilized: 1-19:42:40 +CPU Efficiency: 86.94% of 2-02:16:32 core-walltime +Job Wall-clock time: 00:11:47 +Memory Utilized: 314.08 GB +Memory Efficiency: 65.90% of 476.56 GB +``` \ No newline at end of file diff --git a/nersc/performance_indicators/generate_linearly_separable.py b/nersc/performance_indicators/generate_linearly_separable.py index 477f327..18c9d14 100644 --- a/nersc/performance_indicators/generate_linearly_separable.py +++ b/nersc/performance_indicators/generate_linearly_separable.py @@ -25,7 +25,7 @@ n_samples = 300 -for n_features in range(2, 21): +for n_features in range(2, 31): margin = 0.02 * n_features X, y = generate_linearly_separable(n_samples, n_features, margin) diff --git a/nersc/performance_indicators/perf_ind_kernel.py b/nersc/performance_indicators/perf_ind_kernel.py index e633af2..6dfb0db 100644 --- a/nersc/performance_indicators/perf_ind_kernel.py +++ b/nersc/performance_indicators/perf_ind_kernel.py @@ -8,6 +8,9 @@ import os import yaml import subprocess + +import ray + from qml_benchmarks.hyperparam_search_utils import read_data import argparse @@ -37,15 +40,20 @@ def get_parser(): # You only need to change this to make a 
different performance indicator #define the model - from qml_benchmarks.models.projected_quantum_kernel import ProjectedQuantumKernel as Model + from qml_benchmarks.models.iqp_kernel import IQPKernelClassifier as Model #implementation attributes of model use_jax = False vmap = True - jit = True - model_settings = {'use_jax': use_jax, 'vmap': vmap, 'jit': jit} + jit = False + use_ray = True + model_settings = {'use_jax': use_jax, 'vmap': vmap, 'jit': jit, + 'use_ray': use_ray} + + perf_ind_name = 'RAY' #a name for the performance indicator used for naming files - perf_ind_name = 'CAT_CPU' #a name for the performance indicator used for naming files + if use_ray: + ray.init() ################################# diff --git a/nersc/run_hyperparam_search.sh b/nersc/run_hyperparam_search.sh new file mode 100755 index 0000000..e53e043 --- /dev/null +++ b/nersc/run_hyperparam_search.sh @@ -0,0 +1,27 @@ +#!/bin/bash -e + +export RAY_DEDUP_LOGS=0 + +REPO=/qml-benchmarks +DATA=${REPO}/nersc/performance_indicators/linearly_separable + +GENERATE_DATA=0 +# Set GENERATE_DATA=1 to run paper/benchmarks/generate_linearly_separable.py, which will generate a folder linearly_separable/ (in the current directory). +if [[ "$GENERATE_DATA" == 1 ]]; then + python "${REPO}/paper/benchmarks/generate_linearly_separable.py" +fi + +# You can then use any of the *.csv from this folder to start training. e.g. +#python ${REPO}/scripts/run_hyperparameter_search.py\ +# --classifier-name IQPVariationalClassifier\ +# --dataset-path linearly_separable/linearly_separable_10d_train.csv + +# I reduced the grid space and the input size for a faster turn around. 
This was my command: +python3 -u "${REPO}/scripts/run_hyperparameter_search.py"\ + --classifier-name IQPVariationalClassifier\ + --dataset-path "${DATA}/linearly_separable_4d_train.csv"\ + --clean True +# --n-jobs 256 # 4 +# --n_layers 1 2\ +# --learning_rate 0.001\ +# --repeats 1\ diff --git a/nersc/run_performance_indicator.sh b/nersc/run_performance_indicator.sh new file mode 100755 index 0000000..c2abeb4 --- /dev/null +++ b/nersc/run_performance_indicator.sh @@ -0,0 +1,7 @@ +#!/bin/bash -e + +export RAY_DEDUP_LOGS=0 + +NUM_FEATURES=20 + +python3 -u performance_indicators/perf_ind_kernel.py --numFeatures "$NUM_FEATURES" --inputPath performance_indicators/linearly_separable/ diff --git a/nersc/start-head.sh b/nersc/start-head.sh new file mode 100755 index 0000000..11c0362 --- /dev/null +++ b/nersc/start-head.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray head node" +# Launch the head node on the IP passed as $1, then block (sleep) so the job step stays alive +ray start --head --node-ip-address="${1:?head node IP required}" --port=6379 # --redis-password=$2 +sleep infinity diff --git a/nersc/start-worker.sh b/nersc/start-worker.sh new file mode 100755 index 0000000..52a8104 --- /dev/null +++ b/nersc/start-worker.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 + +echo "starting ray worker node" +ray start --address "${1:?head address (ip:port) required}" # --redis-password=$2 +sleep infinity diff --git a/nersc/submit_job_multinode.slr b/nersc/submit_job_multinode.slr new file mode 100644 index 0000000..c646dbd --- /dev/null +++ b/nersc/submit_job_multinode.slr @@ -0,0 +1,71 @@ +#!/bin/bash +#SBATCH -A m4693 +#SBATCH -C cpu +#SBATCH -t 0:10:00 +#SBATCH --job-name=qml_multinode +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 # 1 ray worker runtime per node +#SBATCH --cpus-per-task=128 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=thomas.germain@xanadu.ai +#SBATCH --output=out/%j.%x.out +#SBATCH --error=out/%j.%x.err + +#------------------------------------------------------------------- +# SBATCH -q shared +# SBATCH 
--gpus-per-task=4 # 4 GPUs per node + +set -u ; # exit if you try to use an uninitialized variable + +echo "S:starting" +echo "S:PWD=$PWD" + +################################################################ +# Define image and command +################################################################ + +IMG=tgermain/ubu22-pennylane-ray +CMD="./run_hyperparam_search.sh" + +echo "S:IMG=$IMG" +echo "S:CMD=$CMD" + +################################################################ +# Initialize Ray cluster environment +################################################################ + +head_node=$(hostname) +head_node_ip=$(hostname --ip-address) +# if we detect a space character in the head node IP, we'll +# convert it to an ipv4 address. This step is optional. +if [[ "$head_node_ip" == *" "* ]]; then +IFS=' ' read -ra ADDR <<<"$head_node_ip" +if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} +else + head_node_ip=${ADDR[0]} +fi +fi +port=6379 + +echo "STARTING HEAD at $head_node" +echo "Head node IP: $head_node_ip" +srun --nodes=1 --ntasks=1 -w "$head_node" ./wrap_podman.sh "$IMG" "./start-head.sh $head_node_ip" & +sleep 10 + +worker_num=$((SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node +srun -n "$worker_num" --nodes="$worker_num" --ntasks-per-node=1 --exclude "$head_node" ./wrap_podman.sh "$IMG" "./start-worker.sh $head_node_ip:$port" & +sleep 5 + +################################################################ +# Run Python scripts in container +################################################################ + +echo "S:ready to run" + +./wrap_podman.sh "$IMG" "$CMD" "$head_node_ip:$port" + +sleep 1 +echo S:done + +# EOF diff --git a/nersc/submit_job_shared.slr b/nersc/submit_job_shared.slr new file mode 100644 index 0000000..17ce938 --- /dev/null +++ b/nersc/submit_job_shared.slr @@ -0,0 +1,51 @@ +#!/bin/bash +#SBATCH -A m4693 +#SBATCH -C cpu +#SBATCH -t 0:10:00 +#SBATCH --job-name=qml_shared +#SBATCH -q shared +#SBATCH --nodes=1 +#SBATCH 
--ntasks-per-node=1 # 1 ray worker runtime per node +#SBATCH --cpus-per-task=8 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=thomas.germain@xanadu.ai +#SBATCH --output=out/%j.%x.out +#SBATCH --error=out/%j.%x.err + +#------------------------------------------------------------------- +# SBATCH --gpus-per-task=4 # 4 GPUs per node + +set -u ; # exit if you try to use an uninitialized variable + +echo "S:starting" +echo "S:PWD=$PWD" + +################################################################ +# Define image and command +################################################################ + +IMG=tgermain/ubu22-pennylane-ray +CMD="./run_performance_indicator.sh" + +echo "S:IMG=$IMG" +echo "S:CMD=$CMD" + +################################################################ +# Initialize Ray cluster environment +################################################################ + +# ... + +################################################################ +# Run Python scripts in container +################################################################ + +N=1 + +echo "S:ready to run" +srun -n "$N" ./wrap_podman.sh "$IMG" "$CMD" + +sleep 1 +echo S:done + +# EOF diff --git a/nersc/submit_job_single.slr b/nersc/submit_job_single.slr new file mode 100644 index 0000000..4df2728 --- /dev/null +++ b/nersc/submit_job_single.slr @@ -0,0 +1,51 @@ +#!/bin/bash +#SBATCH -A m4693 +#SBATCH -C cpu +#SBATCH -t 0:30:00 +#SBATCH --job-name=qml_single +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # 1 ray worker runtime per node +#SBATCH --cpus-per-task=256 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=thomas.germain@xanadu.ai +#SBATCH --output=out/%j.%x.out +#SBATCH --error=out/%j.%x.err + +#------------------------------------------------------------------- +# SBATCH -q shared +# SBATCH --gpus-per-task=4 # 4 GPUs per node + +set -u ; # exit if you try to use an uninitialized variable + +echo "S:starting" +echo "S:PWD=$PWD" + +################################################################ +# 
Define image and command +################################################################ + +IMG=tgermain/ubu22-pennylane-ray +CMD="./run_performance_indicator.sh" + +echo "S:IMG=$IMG" +echo "S:CMD=$CMD" + +################################################################ +# Initialize Ray cluster environment +################################################################ + +# ... + +################################################################ +# Run Python scripts in container +################################################################ + +N=1 + +echo "S:ready to run" +srun -n "$N" ./wrap_podman.sh "$IMG" "$CMD" + +sleep 1 +echo S:done + +# EOF diff --git a/nersc/submit_job_venv.slr b/nersc/submit_job_venv.slr new file mode 100644 index 0000000..3cc02fd --- /dev/null +++ b/nersc/submit_job_venv.slr @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH -A m4693 +#SBATCH -C cpu +#SBATCH -t 00:30:00 +#SBATCH --job-name=qml_single +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # 1 ray worker runtime per node +#SBATCH --cpus-per-task=256 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=thomas.germain@xanadu.ai +#SBATCH --output=out/%j.%x.out +#SBATCH --error=out/%j.%x.err + +#------------------------------------------------------------------- +# SBATCH -q shared +# SBATCH --gpus-per-task=4 # 4 GPUs per node + +set -u ; # exit if you try to use an uninitialized variable + +echo "S:starting" + +################################################################ +# Define venv and command +################################################################ + +module load python +source /global/common/software/m4693/venv/qml_LK/bin/activate + +cd /global/cfs/cdirs/m4693/qml-benchmarks-devel/nersc/ || exit 1 + +CMD="./run_performance_indicator.sh" + +echo "S:PWD=$PWD" +echo "S:CMD=$CMD" + +################################################################ +# Run Python scripts in the venv +################################################################ + +N=1 + +echo "S:ready to run" +srun -n $N 
$CMD + +sleep 1 +echo S:done + +# EOF diff --git a/nersc/wrap_podman.sh b/nersc/wrap_podman.sh index ee8ec5d..c352bb3 100755 --- a/nersc/wrap_podman.sh +++ b/nersc/wrap_podman.sh @@ -2,37 +2,46 @@ echo W:myRank is $SLURM_PROCID IMG=$1 CMD=$2 -outPath=$3 -CFSH=$4 -BASE_DIR=$5 -WORK_DIR=$6 +RAY_ADDRESS=$3 -if [ $SLURM_PROCID -eq 0 ] ; then +if [[ $SLURM_PROCID -eq 0 ]]; then echo W:IMG=$IMG echo W:CMD=$CMD - #echo Q:fire $ fi -echo W:BASE_DIR=$BASE_DIR -echo 'W:start podman' +CFSH=/global/cfs/cdirs/m4693 # CFS home +REPO_DIR=$CFSH/qml-benchmarks-devel # qml-benchmark repo +ROOT_DIR=$REPO_DIR/nersc/root # to access local python packages +WORK_DIR=$REPO_DIR/nersc # to store output files + +# Mount /tmp to avoid following error with Ray: +# ValueError: Can't find a `node_ip_address.json` file + +PORT=6379 + +# Script will run in the workdir mounted in the container, +# this will allow us to access the output files easily. + podman-hpc run -it \ - --volume $CFSH/$BASE_DIR:/root \ - --volume $CFSH/$BASE_DIR:$BASE_DIR \ - --volume $CFSH/$BASE_DIR/nersc/performance_indicators/linearly_separable:/linearly_separable \ - --volume $CFSH/$WORK_DIR:$WORK_DIR \ + --net host \ + -p $PORT:$PORT \ + --volume /tmp:/tmp \ + --volume $ROOT_DIR:/root \ + --volume $REPO_DIR:/qml-benchmarks \ + --volume $WORK_DIR:/work_dir \ + --workdir /work_dir \ -e HDF5_USE_FILE_LOCKING='FALSE' \ - --workdir $WORK_DIR \ + --shm-size=10.24gb \ $IMG <