XanaduAI · tomlqc · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/nersc/README.md b/nersc/README.md
@@ -0,0 +1,84 @@
+# Run QML Benchmarks on Perlmutter
+
+## Setup Podman
+
+All of the following commands are to be executed on Perlmutter.
+
+Build podman image from dockerfile:
+```
+podman-hpc build -f Dockerfile.ubu22-PennyLane -t tgermain/ubu22-pennylane > podman_build.out
+```
+
+**TODO:** Add command to install Ray in dockerfile
+
+... or install in new image
+```
+podman-hpc run -it --name ray tgermain/ubu22-pennylane
+# in the container
+pip install ray
+exit
+# 
+podman-hpc commit ray tgermain/ubu22-pennylane-ray
+```
+
+Locally install `qml_benchmarks` with dependencies for development:
+```
+mkdir qml-benchmarks-devel/nersc/local
+
+IMG=tgermain/ubu22-pennylane-ray
+CFSH=/global/cfs/cdirs/m4693  # CFS home
+REPO_DIR=$CFSH/qml-benchmarks-devel  # qml-benchmark repo
+LOCAL_DIR=$REPO_DIR/nersc/local  # to store local python files
+WORK_DIR=$REPO_DIR/nersc/ray/workdir  # to store output files
+
+podman-hpc run -it \
+    --volume $LOCAL_DIR:/root \
+    --volume $REPO_DIR:/qml-benchmarks \
+    --volume $WORK_DIR:/work_dir \
+    -e HDF5_USE_FILE_LOCKING='FALSE' \
+    --workdir /work_dir \
+    $IMG bash
+
+# in the container
+cd /qml-benchmarks
+pip3 install --user .  # install in /root/.local
+```
+
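+**Note:** `pip3 install --user .` installs into `/root/.local` inside the container; `/root` is the host directory `$LOCAL_DIR` mounted with `--volume $LOCAL_DIR:/root`, so the installation persists after the container exits.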
+
+To make image available on CPU/GPU nodes, migrate your image onto the `$SCRATCH` filesystem with:
+```
+podman-hpc migrate tgermain/ubu22-pennylane-ray[:version]
+```
+or make it available to everyone in the project:
+```
+IMG=tgermain/ubu22-pennylane-ray
+POD_PUB=$CFS/m4693/podman/
+podman-hpc --squash-dir $POD_PUB migrate $IMG
+chmod -R a+rx $POD_PUB   # to allow anyone to use this image
+```
+
+**TODO:** Check and update instructions about migrate for project
+
+## Test Podman
+
+``` bash
+
+IMG=tgermain/ubu22-pennylane-ray
+CMD="python3 -u performance_indicators/perf_ind_variational.py --numFeatures 4 --inputPath performance_indicators/linearly_separable/"
+
+# Run container interactively with wrapper
+./wrap_podman.sh $IMG "$CMD"
+```
+
+## Run jobs
+
+Run performance indicator (1 model: 100 training steps and prediction)
+``` bash
+sbatch submit_job_shared.slr
+```
+
+Run hyperparameter search
+``` bash
+sbatch submit_job_single.slr
+```
diff --git a/..._indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY.csv b/..._indicators/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY.csv
@@ -0,0 +1,2 @@
+construct_kernel_time,training_time,predict_time,hyperparameters
+127.64588618278503,127.65117883682251,23.83331537246704,"{'repeats': 10, 'use_jax': False, 'vmap': True, 'jit': False, 'use_ray': True}"
diff --git a/...rs/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY_packages.txt b/...rs/RAY/IQPKernelClassifier_linearly_separable_15d_performance_indicators_RAY_packages.txt
@@ -0,0 +1,82 @@
+Package                    Version     Editable project location
+-------------------------- ----------- --------------------------------------------
+absl-py                    2.1.0
+aiosignal                  1.3.1
+appdirs                    1.4.4
+astunparse                 1.6.3
+attrs                      23.2.0
+autograd                   1.6.2
+autoray                    0.6.12
+cachetools                 5.4.0
+certifi                    2024.7.4
+charset-normalizer         3.3.2
+chex                       0.1.86
+click                      8.1.7
+contourpy                  1.2.1
+cycler                     0.12.1
+diastatic-malt             2.15.2
+etils                      1.9.2
+filelock                   3.15.4
+flax                       0.8.5
+fonttools                  4.53.1
+frozenlist                 1.4.1
+fsspec                     2024.6.1
+future                     1.0.0
+gast                       0.6.0
+idna                       3.7
+importlib_resources        6.4.0
+jax                        0.4.23
+jaxlib                     0.4.23
+joblib                     1.4.2
+jsonschema                 4.23.0
+jsonschema-specifications  2023.12.1
+kiwisolver                 1.4.5
+markdown-it-py             3.0.0
+matplotlib                 3.9.1
+mdurl                      0.1.2
+ml-dtypes                  0.4.0
+msgpack                    1.0.8
+nersc-pymon                0.2.1
+nest-asyncio               1.6.0
+networkx                   3.3
+numpy                      1.26.4
+opt-einsum                 3.3.0
+optax                      0.2.3
+orbax-checkpoint           0.5.23
+packaging                  24.1
+pandas                     2.2.2
+PennyLane                  0.37.0
+PennyLane-Catalyst         0.7.0
+PennyLane_Lightning        0.37.0
+PennyLane_Lightning_Kokkos 0.37.0
+pillow                     10.4.0
+pip                        23.2.1
+protobuf                   5.27.2
+Pygments                   2.18.0
+pyparsing                  3.1.2
+python-dateutil            2.9.0.post0
+pytz                       2024.1
+PyYAML                     6.0.1
+qml_benchmarks             0.1         /global/cfs/cdirs/m4693/qml-benchmarks-devel
+ray                        2.33.0
+referencing                0.35.1
+requests                   2.32.3
+rich                       13.7.1
+rpds-py                    0.19.1
+rustworkx                  0.15.1
+scikit-learn               1.5.1
+scipy                      1.12.0
+seaborn                    0.13.2
+semantic-version           2.10.0
+setuptools                 65.5.0
+six                        1.16.0
+tensorstore                0.1.63
+termcolor                  2.4.0
+threadpoolctl              3.5.0
+toml                       0.10.2
+toolz                      0.12.1
+typing_extensions          4.12.2
+tzdata                     2024.1
+urllib3                    2.2.2
+wheel                      0.43.0
+zipp                       3.19.2
diff --git a/nersc/performance_indicators/README_ray.md b/nersc/performance_indicators/README_ray.md
@@ -0,0 +1,49 @@
+
+
+## Run with Python `venv`
+
+### `lightning-kokkos` from pypi wheels
+
+NOTE: `venv/qml_LK` is described in `single_circuits/README.md`
+
+Start interactive job on CPU node for testing
+``` bash
+salloc -q interactive -C cpu -t 0:30:00 -A m4693
+
+# and execute in this interactive session:
+
+source /global/common/software/m4693/venv/qml_LK/bin/activate
+cd nersc/
+
+python3 -u performance_indicators/perf_ind_kernel.py --numFeatures 4 --inputPath performance_indicators/linearly_separable/
+```
+
+Runtimes with Ray on interactive CPU node, dataset with 240x240 = 57,600 kernels
+```
+> 57600 / 128 = 450 kernels per core
+> default.qubit.jax
+qubits     4     10     15
+real    1m33s  2m42  11m27
+user    0m46s  1m00   4m20
+sys     0m22s  0m28   2m21
+```
+
+Start batch job on CPU node
+``` bash
+sbatch submit_job_single.slr
+```
+
+Runtime with Ray on batch CPU node, 15 qubits
+```
+Job ID: 28820822
+Cluster: perlmutter
+User/Group: tgermain/tgermain
+State: COMPLETED (exit code 0)
+Nodes: 1
+Cores per node: 256
+CPU Utilized: 1-19:42:40
+CPU Efficiency: 86.94% of 2-02:16:32 core-walltime
+Job Wall-clock time: 00:11:47
+Memory Utilized: 314.08 GB
+Memory Efficiency: 65.90% of 476.56 GB
+```
diff --git a/nersc/performance_indicators/generate_linearly_separable.py b/nersc/performance_indicators/generate_linearly_separable.py
@@ -25,7 +25,7 @@
 
 n_samples = 300
 
-for n_features in range(2, 21):
+for n_features in range(2, 31):
     margin = 0.02 * n_features
 
     X, y = generate_linearly_separable(n_samples, n_features, margin)

diff --git a/nersc/performance_indicators/perf_ind_kernel.py b/nersc/performance_indicators/perf_ind_kernel.py
@@ -8,6 +8,9 @@
 import os
 import yaml
 import subprocess
+
+import ray
+
 from qml_benchmarks.hyperparam_search_utils import read_data
 
 import argparse
@@ -37,15 +40,20 @@ def get_parser():
     # You only need to change this to make a different performance indicator
 
     #define the model
-    from qml_benchmarks.models.projected_quantum_kernel import ProjectedQuantumKernel as Model
+    from qml_benchmarks.models.iqp_kernel import IQPKernelClassifier as Model
 
     #implementation attributes of model
     use_jax = False
     vmap = True
-    jit = True
-    model_settings = {'use_jax': use_jax, 'vmap': vmap, 'jit': jit}
+    jit = False
+    use_ray = True
+    model_settings = {'use_jax': use_jax, 'vmap': vmap, 'jit': jit,
+                      'use_ray': use_ray}
+
+    perf_ind_name = 'RAY'  #a name for the performance indicator used for naming files
 
-    perf_ind_name = 'CAT_CPU'  #a name for the performance indicator used for naming files
+    if use_ray:
+        ray.init()
 
     #################################
 

diff --git a/nersc/run_hyperparam_search.sh b/nersc/run_hyperparam_search.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+#
+# Run a reduced hyperparameter search for IQPVariationalClassifier on the
+# 4-feature linearly separable training set.
+# All paths are rooted at /qml-benchmarks (see REPO below).
+
+# Enable errexit here as well: options on the shebang line are lost when the
+# script is started as `bash run_hyperparam_search.sh`.
+set -e
+
+# Disable Ray's deduplication of repeated log lines.
+export RAY_DEDUP_LOGS=0
+
+REPO=/qml-benchmarks
+DATA=${REPO}/nersc/performance_indicators/linearly_separable
+
+# Set to 1 to generate the datasets before the search.
+# Running python paper/benchmarks/generate_linearly_separable.py will generate
+# a folder linearly_separable/ (in the current directory).
+GENERATE_DATA=0
+# The variable must be expanded: a bare GENERATE_DATA is compared as a literal
+# word and never equals 1.
+if [[ "${GENERATE_DATA}" == 1 ]]; then
+    python "${REPO}/paper/benchmarks/generate_linearly_separable.py"
+fi
+
+# You can then use any of the *.csv from this folder to start training. e.g.
+#python ${REPO}/scripts/run_hyperparameter_search.py\
+# --classifier-name IQPVariationalClassifier\
+# --dataset-path linearly_separable/linearly_separable_10d_train.csv
+
+# Reduced grid space and input size for a faster turnaround:
+python3 -u "${REPO}/scripts/run_hyperparameter_search.py" \
+ --classifier-name IQPVariationalClassifier \
+ --dataset-path "${DATA}/linearly_separable_4d_train.csv" \
+ --clean True
+# Further options, currently disabled:
+# --n-jobs 256  # 4
+# --n_layers 1 2\
+# --learning_rate 0.001\
+# --repeats 1\
diff --git a/nersc/run_performance_indicator.sh b/nersc/run_performance_indicator.sh
@@ -0,0 +1,7 @@
+#!/bin/bash -e
+
+export RAY_DEDUP_LOGS=0
+
+NUM_FEATURES=20
+
+python3 -u performance_indicators/perf_ind_kernel.py --numFeatures $NUM_FEATURES --inputPath performance_indicators/linearly_separable/
diff --git a/nersc/start-head.sh b/nersc/start-head.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+echo "starting ray head node"
+# Launch the head node
+ray start --head --node-ip-address=$1 --port=6379  # --redis-password=$2 
+sleep infinity
diff --git a/nersc/start-worker.sh b/nersc/start-worker.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+export LANG=C.UTF-8
+
+echo "starting ray worker node"
+ray start --address $1  # --redis-password=$2
+sleep infinity
diff --git a/nersc/submit_job_multinode.slr b/nersc/submit_job_multinode.slr
@@ -0,0 +1,71 @@
+#!/bin/bash
+#SBATCH -A m4693
+#SBATCH -C cpu
+#SBATCH -t 0:10:00
+#SBATCH --job-name=qml_multinode
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1 # 1 ray worker runtime per node
+#SBATCH --cpus-per-task=128
+#SBATCH --mail-type=ALL
+#SBATCH [email protected]
+#SBATCH --output=out/%j.%x.out
+#SBATCH --error=out/%j.%x.err
+
+#-------------------------------------------------------------------
+# SBATCH -q shared 
+# SBATCH --gpus-per-task=4 # 4 GPUs per node
+
+set -u ;  # exit if you try to use an uninitialized variable
+
+echo "S:starting"
+echo "S:PWD=$PWD"
+
+################################################################
+# Define image and command
+################################################################
+
+IMG=tgermain/ubu22-pennylane-ray
+CMD="./run_hyperparam_search.sh"
+
+echo "S:IMG=$IMG"
+echo "S:CMD=$CMD"
+
+################################################################
+# Initialize Ray cluster environment
+################################################################
+
+head_node=$(hostname)
+head_node_ip=$(hostname --ip-address)
+# if we detect a space character in the head node IP, we'll
+# convert it to an ipv4 address. This step is optional.
+if [[ "$head_node_ip" == *" "* ]]; then
+IFS=' ' read -ra ADDR <<<"$head_node_ip"
+if [[ ${#ADDR[0]} -gt 16 ]]; then
+  head_node_ip=${ADDR[1]}
+else
+  head_node_ip=${ADDR[0]}
+fi
+fi
+port=6379
+
+echo "STARTING HEAD at $head_node"
+echo "Head node IP: $head_node_ip"
+srun --nodes=1 --ntasks=1 -w $head_node ./wrap_podman.sh $IMG "./start-head.sh $head_node_ip" &
+sleep 10
+
+worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node
+srun -n $worker_num --nodes=$worker_num --ntasks-per-node=1 --exclude $head_node ./wrap_podman.sh $IMG "./start-worker.sh $head_node_ip:$port" &
+sleep 5
+
+################################################################
+# Run Python scripts in container
+################################################################
+
+echo "S:ready to run"
+
+./wrap_podman.sh $IMG "$CMD" "$head_node_ip:$port"
+
+sleep 1
+echo S:done
+
+# EOF