Skip to content

ClearML Agent Setup

mshannon-sil edited this page Sep 19, 2024 · 5 revisions

On Linux

  1. Log in as root
  2. Install the GPU driver
    ubuntu-drivers install
    
  3. Install Docker
  4. Install NVIDIA Container Toolkit and configure NVIDIA Container Runtime for Docker
  5. If using MIG partitions:
    1. Install nvidia-mig-manager.service
    2. Configure /etc/nvidia-mig-manager/config.yaml with your MIG configuration, e.g.
      version: v1
      mig-configs:
        sil-config:
          - devices: [0]
            mig-enabled: true
            mig-devices:
              3g.47gb: 2
          - devices: [1]
            mig-enabled: false
            mig-devices: {}
      
    3. Configure /etc/systemd/system/nvidia-mig-manager.service.d/override.conf to use your mig-config, e.g.
      [Service]
      Environment="MIG_PARTED_SELECTED_CONFIG=sil-config"
      
    4. Run nvidia-mig-parted apply or reboot the server
  6. Create clearml user
    adduser clearml
    
  7. Add clearml user to docker group: https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user
  8. Log in as clearml user. IMPORTANT: Do not create/modify any files in the clearml user directory as root.
    su - clearml
    
  9. Install clearml-agent
    pip install clearml-agent
    
  10. Add a clearml.conf file
    • Either copy it from an existing setup, or use the skeleton provided in the SILNLP repo under scripts/clearml_agent/clearml.conf and fill out the clearml credentials, git credentials, worker id, and worker name sections. Also add the following lines to the extra_docker_arguments section and fill out the access key and secret access key sections.
      extra_docker_arguments: [
            "--env","SIL_NLP_DATA_PATH=/silnlp",
            "--env","AWS_REGION=us-east-1",
            "--env","AWS_ACCESS_KEY_ID=*your access key*",
            "--env","AWS_SECRET_ACCESS_KEY=*your secret key*",
            "--env","TOKENIZERS_PARALLELISM=false",
            "-v","/home/clearml/.clearml/hf-cache:/root/.cache/huggingface"
          ]
      
  11. Create a startup script called start-agents.sh, e.g.
    #!/bin/sh
    # Restart the ClearML agents: kill any agents that are already running, then
    # launch one detached, Docker-mode agent per GPU (or GPU partition).
    # Kill all clearml-agents running (-r: do not run kill when none are found)
    ps -A | grep clearml-agent | awk '{print $1}' | xargs -r kill -9
    # GPU 0 (two agents, one each for 0:0 and 0:1, both serving 47gb_queue)
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:0   --queue 47gb_queue
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:1   --queue 47gb_queue
    # GPU 1 (one agent for the whole GPU, serving 94gb_queue)
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 1   --queue 94gb_queue
    
  12. Start the agents
    ./start-agents.sh
    
  13. To make the agents start automatically after a reboot:
    1. Become the root user again
      exit
      
    2. Create a file called clearml-agents in the /etc/init.d/ directory, e.g.
#!/bin/sh
#
# SysV init script for the ClearML agent daemons that serve this machine's GPUs.
#
# Usage: service clearml-agents {start|stop|restart|status}
#
# NOTE(review): "mountall" in Required-Start looks like an Upstart-era
# facility name; confirm it still resolves on the target distribution.

# Abort as soon as any command fails.
set -e

### BEGIN INIT INFO
# Provides:           clearml-agents
# Required-Start:     $syslog $remote_fs $local_fs mountall
# Required-Stop:      $syslog $remote_fs $local_fs
# Should-Start:       docker
# Should-Stop:        docker
# Default-Start:      2 3 4 5
# Default-Stop:       0 1 6
# Short-Description:  ClearML Agents and queues to service GPUs
# Description:
#  "ClearML is an open source platform that automates and simplifies
#  developing and managing machine learning solutions.  ClearML Agent
#  is a virtual environment and execution manager for DL/ML solutions
#  on GPU machines."  --https://clear.ml
### END INIT INFO

# Use a fixed PATH so the script does not depend on the caller's environment.
export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin

# Service name used in log and error messages.
NAME="clearml-agents"

# Get lsb functions (log_begin_msg, log_end_msg, log_failure_msg)
. /lib/lsb/init-functions

# Guard for privileged actions: returns 0 when the effective uid is 0,
# otherwise logs a failure message and terminates the script with status 1.
# Globals: NAME (read)
fail_unless_root() {
        case "$(id -u)" in
                0) return 0 ;;
        esac

        log_failure_msg "$NAME must be run as root"
        exit 1
}

# Start or stop every ClearML agent daemon managed by this script.
#
# Each agent is launched through `su --login` as the "clearml" user. All
# agents are attempted even when one of them fails, so a single failure
# does not leave the remaining agents unhandled.
#
# Arguments: $1 - "stop" appends --stop to every agent command; any other
#                 value (or no argument) runs the commands without it.
# Globals:   STOP (written)
# Returns:   0 when every agent command succeeded, otherwise the exit
#            status of the last agent command that failed.
do_start_stop() {
        STOP=""
        if [ "${1:-}" = "stop" ]; then
                STOP="--stop"
        fi

        agent_rc=0

        # One "<gpus> <queue>" entry per agent:
        # Half GPUs 0:0 and 0:1 and Full GPU 1
        for agent_spec in "0:0 cheetah_47gb" "0:1 cheetah_47gb" "1 cheetah_94gb"; do
                agent_gpus=${agent_spec% *}
                agent_queue=${agent_spec#* }
                # `|| agent_rc=$?` records the failure without tripping `set -e`,
                # so the loop carries on with the next agent.
                su --login --command "/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus ${agent_gpus}   --queue ${agent_queue} ${STOP}" clearml || agent_rc=$?
        done

        return "$agent_rc"
}

# Dispatch on the requested action (first command-line argument).
case "$1" in
        start)
                fail_unless_root

                log_begin_msg "Starting $NAME"
                # Under `set -e` a failed start would abort the script before
                # log_end_msg could report it. Run the start in a subshell that
                # keeps fail-fast behavior, and capture its status instead.
                set +e
                ( set -e; do_start_stop )
                rc=$?
                set -e
                log_end_msg "$rc"
                ;;

        stop)
                fail_unless_root
                do_start_stop "stop"
                ;;

        restart)
                fail_unless_root
                # A failing stop must not prevent the agents from being started
                # again, so report it and carry on.
                do_start_stop "stop" || echo "$NAME: stop reported an error; starting anyway" >&2
                do_start_stop
                ;;

        status)
                # Print the ps column header, then every clearml-agent process
                # (excluding the grep process itself).
                ps -ef | head -1
                ps -ef | grep clearml-agent | grep -v grep
                ;;

        *)
                # Unknown or missing action: usage goes to stderr.
                echo "Usage: service clearml-agents {start|stop|restart|status}" >&2
                exit 1
                ;;
esac