-
-
Notifications
You must be signed in to change notification settings - Fork 3
ClearML Agent Setup
mshannon-sil edited this page Sep 19, 2024
·
5 revisions
- Log in as root
- Install the GPU driver
ubuntu-drivers install
- Install Docker
- Install NVIDIA Container Toolkit and configure NVIDIA Container Runtime for Docker
- If using MIG partitions:
- Install nvidia-mig-manager.service
- Configure
/etc/nvidia-mig-manager/config.yaml
with your MIG configuration, e.g.
version: v1
mig-configs:
  sil-config:
    - devices: [0]
      mig-enabled: true
      mig-devices:
        3g.47gb: 2
    - devices: [1]
      mig-enabled: false
      mig-devices: {}
- Configure
/etc/systemd/system/nvidia-mig-manager.service.d/override.conf
to use your mig-config, e.g.
[Service]
Environment="MIG_PARTED_SELECTED_CONFIG=sil-config"
- Run
nvidia-mig-parted apply
or reboot the server
- Create clearml user
adduser clearml
- Add clearml user to docker group: https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user
- Log in as clearml user. IMPORTANT: Do not create/modify any files in the clearml user directory as root.
su - clearml
- Install clearml-agent
pip install clearml-agent
- Add a clearml.conf file
- Either copy it from an existing setup, or use the skeleton provided in the SILNLP repo under scripts/clearml_agent/clearml.conf and fill out the clearml credentials, git credentials, worker id, and worker name sections. Also add the following lines to the extra_docker_arguments section and fill out the access key and secret access key sections.
extra_docker_arguments: [ "--env","SIL_NLP_DATA_PATH=/silnlp", "--env","AWS_REGION=us-east-1", "--env","AWS_ACCESS_KEY_ID=*your access key*", "--env","AWS_SECRET_ACCESS_KEY=*your secret key*", "--env","TOKENIZERS_PARALLELISM=false", "-v","/home/clearml/.clearml/hf-cache:/root/.cache/huggingface" ]
- Either copy it from an existing setup, or use the skeleton provided in the SILNLP repo under scripts/clearml_agent/clearml.conf and fill out the clearml credentials, git credentials, worker id, and worker name sections. Also add the following lines to the extra_docker_arguments section and fill out the access key and secret access key sections.
- Create a startup script called start-agents.sh, e.g.
#!/bin/sh
# start-agents.sh: kill any running clearml-agent processes, then launch one
# detached agent daemon per GPU slice. Run it as the clearml user.

# Kill all clearml-agents running.
# `ps -A` prints the PID in the first column; `xargs -r` skips `kill`
# entirely when no process matched, instead of running it with no PIDs.
ps -A | grep clearml-agent | awk '{print $1}' | xargs -r kill -9

# GPU 0
/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:0 --queue 47gb_queue
/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:1 --queue 47gb_queue

# GPU 1
/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 1 --queue 94gb_queue
- Start the agents
./start-agents.sh
- To make the agents start automatically after a reboot:
- Become the root user again
exit
- Create a file called clearml-agent in /etc/init.d/ directory, e.g.
- Become the root user again
#!/bin/sh
# Init script that starts, stops, restarts and reports on the ClearML agent
# daemons for this host.
# Abort the script as soon as any unguarded command returns a non-zero status.
set -e
# The INIT INFO block below appears to be an LSB-style init header; keep its
# tags intact (NOTE(review): confirm which tool consumes it on this host).
### BEGIN INIT INFO
# Provides: clearml-agents
# Required-Start: $syslog $remote_fs $local_fs $syslog mountall
# Required-Stop: $syslog $remote_fs $local_fs $syslog
# Should-Start:
# Should-Stop:
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: ClearML Agents and queues to service GPUs
# Description:
# "ClearML is an open source platform that automates and simplifies
# developing and managing machine learning solutions. ClearML Agent
# is a virtual environment and execution manager for DL/ML solutions
# on GPU machines." --https://clear.ml
### END INIT INFO
# Use a fixed PATH so command lookup does not depend on the caller's environment.
export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
# Service name used in the log messages below.
NAME="clearml-agents"
# Get lsb functions (presumably the source of the log_begin_msg, log_end_msg
# and log_failure_msg helpers used below).
. /lib/lsb/init-functions
# Guard for privileged actions: when the effective user id is not 0, log a
# failure message naming the service and terminate the script with status 1.
# Returns 0 without output when running as root.
fail_unless_root() {
  [ "$(id -u)" = '0' ] || {
    log_failure_msg "$NAME must be run as root"
    exit 1
  }
}
# Run one `clearml-agent daemon` invocation as the clearml user.
# Arguments: $1 - value for --gpus
#            $2 - queue name for --queue
#            $3 - extra trailing flag ("--stop"), or empty to start the agent
# Returns:   the exit status of su
_clearml_agent_daemon() {
  su --login --command "/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus $1 --queue $2 $3" clearml
}

# Start the ClearML agent daemons, or stop them when $1 is "stop".
# Arguments: $1 - "stop" to append --stop to every invocation; anything else
#                 (or nothing) starts the agents
# Returns:   0 if every invocation succeeded, otherwise the status of the
#            last invocation that failed
do_start_stop() {
  local stop_flag rc
  stop_flag=""
  if [ "${1:-}" = "stop" ]; then
    stop_flag="--stop"
  fi

  # Each call is guarded with `||` so that one failing agent does not abort
  # the script under `set -e` before the remaining agents have been handled.
  rc=0
  # Half GPUs 0:0 and 0:1 and Full GPU 1
  _clearml_agent_daemon 0:0 cheetah_47gb "$stop_flag" || rc=$?
  _clearml_agent_daemon 0:1 cheetah_47gb "$stop_flag" || rc=$?
  _clearml_agent_daemon 1 cheetah_94gb "$stop_flag" || rc=$?
  return "$rc"
}
# Dispatch on the requested action (first command-line argument).
case "$1" in
  start)
    fail_unless_root
    log_begin_msg "Starting $NAME"
    # Capture the status through `||`: with `set -e` active, an unguarded
    # failure would exit the script before log_end_msg could report it.
    rc=0
    do_start_stop || rc=$?
    log_end_msg "$rc" || true
    [ "$rc" -eq 0 ] || exit "$rc"
    ;;
  stop)
    fail_unless_root
    do_start_stop "stop"
    ;;
  restart)
    fail_unless_root
    # Stopping is best-effort here: a failed stop must not prevent the
    # agents from being started again.
    do_start_stop "stop" || true
    do_start_stop
    ;;
  status)
    # Print the ps column header, then every clearml-agent process line
    # (excluding the grep itself).
    ps -ef | head -1
    ps -ef | grep clearml-agent | grep -v grep
    ;;
  *)
    echo "Usage: service clearml-agents {start|stop|restart|status}"
    exit 1
    ;;
esac