Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slurm on GKE - Guide #864

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
6 changes: 6 additions & 0 deletions slurm-on-gke/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

danielmarzini marked this conversation as resolved.
Show resolved Hide resolved
# Terraform working directories, lock files and other ".terraform*" artifacts.
.terraform*
# Local Terraform state and its automatic backup.
terraform.tfstate
terraform.tfstate.backup
# Local variable values.
terraform.tfvars
# ".terraform" directories one level below this file.
# NOTE(review): probably already covered by ".terraform*" above - confirm.
*/.terraform
danielmarzini marked this conversation as resolved.
Show resolved Hide resolved
448 changes: 448 additions & 0 deletions slurm-on-gke/README.md

Large diffs are not rendered by default.

121 changes: 121 additions & 0 deletions slurm-on-gke/image/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres
danielmarzini marked this conversation as resolved.
Show resolved Hide resolved

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Base image for every Slurm role served by this image.
FROM ubuntu:22.04

# Build-time arguments (override with `docker build --build-arg NAME=value`):
#   SLURM_TAG     branch/tag of github.com/SchedMD/slurm that is cloned and built
#   GOSU_VERSION  gosu release downloaded from the tianon/gosu GitHub releases
ARG SLURM_TAG=slurm-23.02
ARG GOSU_VERSION=1.11


# Install the build toolchain, the libraries Slurm is configured against
# (munge, hwloc, pmix, http-parser, json-c, MySQL client, dbus), the NVIDIA
# 535 user-space packages, MariaDB, OpenMPI and a few utilities.
#
# DEBIAN_FRONTEND=noninteractive keeps package configuration from waiting on
# a prompt during `docker build`; it is set only for this command so it does
# not leak into the runtime environment.
# The apt package lists are removed in the same layer so they do not bloat
# the image.
# Recommended packages are intentionally still installed: later steps call
# tools (e.g. gpg) that are not listed explicitly here.
RUN set -x \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        wget \
        gcc \
        git \
        make \
        munge \
        libmunge-dev \
        python3-dev \
        python3-pip \
        python3 \
        hwloc \
        libhwloc-dev \
        libpmix-dev \
        libhttp-parser-dev \
        libmysqlclient-dev \
        libjson-c-dev \
        psmisc \
        bzip2 \
        python3-http-parser \
        nvidia-utils-535 \
        nvidia-cuda-toolkit-gcc \
        nvidia-cuda-dev \
        libnvidia-compute-535 \
        mariadb-server \
        libdbus-1-dev \
        openmpi-common \
        openmpi-bin \
        vim \
    && rm -rf /var/lib/apt/lists/*

# Python build/test helpers. --no-cache-dir keeps pip's download cache out of
# the image layer.
RUN pip3 install --no-cache-dir Cython nose

# Download the gosu binary for amd64 together with its detached signature,
# verify the signature against the gosu release key, then smoke-test the
# binary with `gosu nobody true`.
#
# `gpgconf --kill all` stops the gpg-agent/dirmngr helpers started by the
# key fetch so that the temporary GNUPGHOME can be deleted reliably.
RUN set -ex \
    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
    && export GNUPGHOME="$(mktemp -d)" \
    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
    && gpgconf --kill all \
    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
    && chmod +x /usr/local/bin/gosu \
    && gosu nobody true

# NOTE(review): SHELL is exported for every later build step and for the
# running container; which consumer needs it is not visible here - confirm.
ENV SHELL=/bin/bash
# Build Slurm from source:
#   1. shallow-clone only the ${SLURM_TAG} branch of SchedMD/slurm
#   2. configure with debug enabled, prefix /usr, config dir /etc/slurm,
#      libraries in /usr/lib64, and mysql_config looked up under /usr/bin
#   3. `make install`
#   4. install the bash completion script into /etc/profile.d
#   5. delete the source tree so it does not stay in this layer
RUN set -x \
&& git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
&& cd slurm \
&& ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
--with-mysql_config=/usr/bin --libdir=/usr/lib64 \
&& make install \
&& install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
&& cd .. \
&& rm -rf slurm

# Create the directories Slurm uses for spool, runtime, state, logs, data and
# configuration, pre-create empty state files under /var/lib/slurmd, add the
# `slurm` system user (uid 990), and hand every /var/*/slurm* path to it.
#
# `mkdir -p` is used so the step does not fail if an earlier layer (package
# install or `make install`) has already created one of these directories;
# this also matches the `mkdir -p` calls in docker-entrypoint.sh.
RUN mkdir -p /var/spool/slurmd \
        /var/run/slurmd \
        /var/run/slurmdbd \
        /var/lib/slurmd \
        /var/log/slurm \
        /data \
        /etc/slurm \
    && touch /var/lib/slurmd/node_state \
        /var/lib/slurmd/front_end_state \
        /var/lib/slurmd/job_state \
        /var/lib/slurmd/resv_state \
        /var/lib/slurmd/trigger_state \
        /var/lib/slurmd/assoc_mgr_state \
        /var/lib/slurmd/assoc_usage \
        /var/lib/slurmd/qos_usage \
        /var/lib/slurmd/fed_mgr_state \
    && useradd -r --uid=990 slurm \
    && chown -R slurm:slurm /var/*/slurm*

WORKDIR /opt
# Build prometheus-slurm-exporter with a throw-away Go toolchain and install
# only the resulting binary into /usr/local/bin.
#
# Download, build, install and cleanup all happen in ONE layer: files removed
# in a later layer would still occupy space in the image, so the Go tarball,
# the unpacked toolchain and the exporter source tree are deleted here.
# (The toolchain was never added to the image-wide PATH, so nothing at
# runtime could have invoked it by name.)
RUN export VERSION=1.18 OS=linux ARCH=amd64 \
    && wget "https://dl.google.com/go/go${VERSION}.${OS}-${ARCH}.tar.gz" \
    && tar -xzf "go${VERSION}.${OS}-${ARCH}.tar.gz" \
    && export PATH="$PWD/go/bin:$PATH" \
    && git clone https://github.com/vpenso/prometheus-slurm-exporter.git \
    && (cd prometheus-slurm-exporter && go build) \
    && mv /opt/prometheus-slurm-exporter/prometheus-slurm-exporter /usr/local/bin \
    && rm -rf "go${VERSION}.${OS}-${ARCH}.tar.gz" /opt/go /opt/prometheus-slurm-exporter

# Runtime directory for munged, owned by the munge user and group.
# OWNER:GROUP uses ':' - the '.' separator is an obsolescent chown syntax and
# the entrypoint script already uses `munge:munge`.
RUN mkdir -p /run/munge \
    && chown munge:munge /run/munge

# Slurm configuration directory is exposed as a volume.
VOLUME /etc/slurm
# Install the role-dispatching entrypoint script and make it executable.
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod 755 /usr/local/bin/docker-entrypoint.sh
# Exec-form ENTRYPOINT: the script is executed directly, not through a shell.
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

# Default role when the container is started without arguments; the entrypoint
# dispatches on this first argument.
CMD ["slurmdbd"]
146 changes: 146 additions & 0 deletions slurm-on-gke/image/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/bin/bash
# The shebang must be the very first line of the file: the image runs this
# script through an exec-form ENTRYPOINT, so the kernel (not a shell) has to
# find the interpreter from the first two bytes.

# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

#######################################
# Install the MUNGE key and launch munged as the munge user.
# Arguments:
#   $@ - extra options forwarded verbatim to /usr/sbin/munged
#        (the "debug" role passes --foreground).
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Status of the last command (the gosu/munged invocation).
#######################################
start_munge() {
  local key_source=/tmp/munge.key
  local key_target=/etc/munge/munge.key

  printf '%s\n' "---> Copying MUNGE key ..."
  cp "${key_source}" "${key_target}"
  chown munge:munge "${key_target}"

  printf '%s\n' "---> Starting the MUNGE Authentication service (munged) ..."
  gosu munge /usr/sbin/munged "$@"
}

# Role dispatch: the first argument selects what this container does.
#   slurmdbd            start munged, wait for the database, exec slurmdbd
#   slurmctld           start munged, wait for slurmdbd:6819, exec slurmctld
#   slurmd              start munged, prepare cgroup/spool dirs, exec slurmd
#   login               start munged, then idle forever
#   check-queue-hook    drain all nodes; exit 0 only if no job is active
#   undrain-nodes-hook  undrain all nodes
#   generate-keys-hook  generate SSH host keys and store them in a Secret
#   debug               run munged in the foreground
#   anything else       exec the arguments as a command
# Arguments after the role are forwarded to slurmdbd / slurmctld.
case "$1" in
  slurmdbd)
    start_munge

    echo "---> Starting the Slurm Database Daemon (slurmdbd) ..."

    # Build the effective config: the provided template plus the database
    # password taken from the environment; readable only by the slurm user.
    cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf
    echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf
    chown slurm:slurm /etc/slurm/slurmdbd.conf
    chmod 600 /etc/slurm/slurmdbd.conf

    # The Key=Value lines of slurmdbd.conf are sourced as shell assignments to
    # obtain the connection settings used below.
    # NOTE(review): presumably StorageHost and StorageUser are defined in the
    # template; values containing spaces or shell metacharacters would break
    # this - confirm against the shipped slurmdbd.conf.
    # shellcheck source=/dev/null
    . /etc/slurm/slurmdbd.conf

    # Poll until the database answers a trivial query.
    #  - The password is handed over via MYSQL_PWD instead of -p<pass> so it
    #    does not show up in the process list.
    #  - ">/dev/null 2>&1" silences both streams; the previous
    #    "2>&1 > /dev/null" order left stderr attached to the log.
    until MYSQL_PWD="${StoragePass}" \
      mysql -h "${StorageHost}" -u "${StorageUser}" -e 'SELECT 1' >/dev/null 2>&1; do
      echo "-- Waiting for database to become active ..."
      sleep 2
    done
    echo "-- Database is now active ..."

    exec gosu slurm /usr/sbin/slurmdbd -D "${@:2}"
    ;;

  slurmctld)
    start_munge

    echo "---> Waiting for slurmdbd to become active before starting slurmctld ..."

    # Redirecting to bash's /dev/tcp pseudo-path attempts a TCP connection;
    # the redirection fails until host "slurmdbd" accepts on port 6819.
    until 2>/dev/null >/dev/tcp/slurmdbd/6819; do
      echo "-- slurmdbd is not available. Sleeping ..."
      sleep 2
    done
    echo "-- slurmdbd is now active ..."

    echo "---> Setting permissions for state directory ..."
    chown slurm:slurm /var/spool/slurmctld

    echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
    # Slurm 17.02 is started without -i; every other version gets -i.
    # -F makes "17.02" a literal string rather than a regex where '.' matches
    # any character.
    if /usr/sbin/slurmctld -V | grep -qF '17.02'; then
      exec gosu slurm /usr/sbin/slurmctld -D "${@:2}"
    else
      exec gosu slurm /usr/sbin/slurmctld -i -D "${@:2}"
    fi
    ;;

  slurmd)
    echo "---> Set shell resource limits ..."
    #ulimit -l unlimited
    #ulimit -s unlimited
    #ulimit -n 131072
    #ulimit -a

    start_munge

    # Locate the pod-level cgroup directory and pre-create the nested
    # slurmstepd scope path beneath it.
    # NOTE(review): if find returns more than one directory the result is a
    # newline-separated list and the mkdir path below is not meaningful -
    # confirm only one "kubepods-pod*" directory is visible in the container.
    cgroup_dir="$(find /sys/fs/cgroup/kubepods.slice -type d -name "kubepods-pod*")"
    mkdir -p "${cgroup_dir}/system.slice/slurmstepd.scope/system.slice/slurmstepd.scope"
    mkdir -p "/var/spool/slurmd"

    echo "---> Starting the Slurm Node Daemon (slurmd) ..."
    echo "$@"
    # POD_NAME must be provided through the environment; it is used as the
    # Slurm node name. Quoted so an unexpected value cannot split into
    # several arguments.
    exec slurmd -D -s -vvv --conf-server="slurmctld-0:6820-6830" -Z -N "${POD_NAME}"
    ;;

  login)
    start_munge
    # Keep the container alive; there is no foreground daemon for this role.
    while true; do
      sleep 30
    done
    ;;

  check-queue-hook)
    start_munge

    scontrol update NodeName=all State=DRAIN Reason="Preventing new jobs running before upgrade"

    # Count jobs that are still active in any of the listed states.
    RUNNING_JOBS="$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines)"

    # Succeed only when the queue is empty so the caller can gate on it.
    if [[ "${RUNNING_JOBS}" -eq 0 ]]; then
      exit 0
    else
      exit 1
    fi
    ;;

  undrain-nodes-hook)
    start_munge
    scontrol update NodeName=all State=UNDRAIN
    exit 0
    ;;

  generate-keys-hook)
    # Generate host keys under ./temphostkeys/etc/ssh and publish them as the
    # "host-keys-secret" Secret (create-or-update via dry-run + apply).
    mkdir -p ./temphostkeys/etc/ssh
    ssh-keygen -A -f ./temphostkeys
    kubectl create secret generic host-keys-secret \
      --dry-run=client \
      --from-file=./temphostkeys/etc/ssh \
      -o yaml \
      | kubectl apply -f -

    exit 0
    ;;

  debug)
    start_munge --foreground
    ;;

  *)
    # Unknown role: treat the arguments as a command line and run it.
    exec "$@"
    ;;
esac
Loading