ClearML Agent Setup


On Linux

  1. Log in as root
  2. Install the GPU driver
    ubuntu-drivers install
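    • A reboot may be needed for the new driver to load; afterwards, nvidia-smi should list the GPUs:
      nvidia-smi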
    
  3. Install Docker
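    • For example, using Docker's convenience script (see the Docker docs for distribution-specific packages):
      curl -fsSL https://get.docker.com | sh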
  4. Install NVIDIA Container Toolkit and configure NVIDIA Container Runtime for Docker
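    • A minimal sketch, assuming an apt-based system with the NVIDIA Container Toolkit apt repository already configured:
      apt-get install -y nvidia-container-toolkit
      nvidia-ctk runtime configure --runtime=docker
      systemctl restart docker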
  5. If using MIG partitions:
    1. Install nvidia-mig-manager.service
    2. Configure /etc/nvidia-mig-manager/config.yaml with your MIG configuration, e.g.
      version: v1
      mig-configs:
        sil-config:
          - devices: [0]
            mig-enabled: true
            mig-devices:
              3g.47gb: 2
          - devices: [1]
            mig-enabled: false
            mig-devices: {}
      
    3. Configure /etc/systemd/system/nvidia-mig-manager.service.d/override.conf to use your mig-config, e.g.
      [Service]
      Environment="MIG_PARTED_SELECTED_CONFIG=sil-config"
      
    4. Run nvidia-mig-parted apply or reboot the server
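      • To confirm the partitions took effect, list the visible GPU and MIG devices:
        nvidia-smi -L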
  6. Create clearml user
    adduser clearml
    
  7. Add clearml user to docker group: https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user
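    • Following the linked guide (run as root):
      usermod -aG docker clearml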
  8. Log in as clearml user. IMPORTANT: Do not create/modify any files in the clearml user directory as root.
    su - clearml
    
  9. Install clearml-agent
    pip install clearml-agent
    
  10. Add a clearml.conf file
• Either copy it from an existing setup, or use the skeleton provided in the SILNLP repo under scripts/clearml_agent/clearml.conf and fill in the ClearML credentials, git credentials, worker id, and worker name sections. Also add the following lines to the extra_docker_arguments section, filling in your AWS access key and secret access key.
      extra_docker_arguments: [
            "--env","SIL_NLP_DATA_PATH=/silnlp",
            "--env","AWS_REGION=us-east-1",
            "--env","AWS_ACCESS_KEY_ID=***your access key***",
            "--env","AWS_SECRET_ACCESS_KEY=***your secret key***",
            "--env","TOKENIZERS_PARALLELISM=false",
            "-v","/home/clearml/.clearml/hf-cache:/root/.cache/huggingface"
          ]
      
  11. Create a startup script called start-agents.sh, e.g.
    #!/bin/sh
    # Kill any clearml-agent daemons that are already running
    ps -A | grep clearml-agent | awk '{print $1}' | xargs -r kill -9
    # GPU 0
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:0   --queue 47gb_queue
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:1   --queue 47gb_queue
    # GPU 1
    /home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 1   --queue 94gb_queue
    
  12. Make the script executable and start the agents
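    chmod +x start-agents.sh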
    ./start-agents.sh
    
  13. To configure the agents to restart automatically after a reboot:
    1. Become the root user again
      exit
      
    2. Create a file called clearml-agent in the /etc/init.d/ directory, e.g.
#!/bin/sh
set -e

### BEGIN INIT INFO
# Provides:           clearml-agents
# Required-Start:     $syslog $remote_fs $local_fs mountall
# Required-Stop:      $syslog $remote_fs $local_fs
# Should-Start:
# Should-Stop:
# Default-Start:      2 3 4 5
# Default-Stop:       0 1 6
# Short-Description:  ClearML Agents and queues to service GPUs
# Description:
#  "ClearML is an open source platform that automates and simplifies
#  developing and managing machine learning solutions.  ClearML Agent
#  is a virtual environment and execution manager for DL/ML solutions
#  on GPU machines."  --https://clear.ml
### END INIT INFO

export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin

NAME="clearml-agents"

# Get lsb functions
. /lib/lsb/init-functions

fail_unless_root() {
        if [ "$(id -u)" != '0' ]; then
                log_failure_msg "$NAME must be run as root"
                exit 1
        fi
}

do_start_stop() {
        STOP=""
        if [ "$1" = "stop" ]; then
                STOP="--stop"
        fi

        # Two MIG slices on GPU 0 (0:0 and 0:1) and full GPU 1
        su --login --command "/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:0   --queue 47gb_queue ${STOP}" clearml
        su --login --command "/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 0:1   --queue 47gb_queue ${STOP}" clearml
        su --login --command "/home/clearml/.local/bin/clearml-agent daemon --use-owner-token --detached --docker --force-current-version --create-queue --gpus 1   --queue 94gb_queue ${STOP}" clearml
}

case "$1" in
        start)
                fail_unless_root

                log_begin_msg "Starting $NAME"
                do_start_stop
                log_end_msg $?
                ;;

        stop)
                fail_unless_root
                do_start_stop "stop"
                ;;

        restart)
                fail_unless_root
                do_start_stop "stop"
                do_start_stop
                ;;

        status)
                ps -ef | head -1
                ps -ef | grep clearml-agent | grep -v grep
                ;;

        *)
                echo "Usage: service clearml-agent {start|stop|restart|status}"
                exit 1
                ;;
esac
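
    3. Make the script executable and register it with the init system so the agents start at boot (a minimal sketch for a Debian/Ubuntu-style init):
      chmod +x /etc/init.d/clearml-agent
      update-rc.d clearml-agent defaults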

On SLURM servers

  1. Install Miniconda
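    • For example, using Anaconda's standard installer for Linux x86_64 (adjust for your platform):
      wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
      bash Miniconda3-latest-Linux-x86_64.sh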
  2. Clone and enter the silnlp repo
    git clone https://github.com/sillsdev/silnlp.git
    cd silnlp
    
  3. Create a new conda environment using the environment.yml file in the repo
    conda env create --file environment.yml
    
  4. Activate the conda environment
    conda activate silnlp
    
  5. Install Poetry with the official installer, not pipx
    • Make sure to install the version that matches the one listed at the top of the poetry.lock file in SILNLP
    • Poetry must be installed after the conda environment is activated so that it uses the correct Python version
    • Double check that Poetry has been added to the path. You may need to restart the terminal.
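    • A sketch of the official installer invocation, pinning the version to match poetry.lock (the installer accepts a --version flag):
      curl -sSL https://install.python-poetry.org | python3 - --version ***version from poetry.lock***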
  6. Install clearml-agent-slurm
    pip3 install -U --extra-index-url https://*****@*****.allegro.ai/repository/clearml_agent_slurm/simple clearml-agent-slurm
    
    • The credentials can be found by clicking on the question mark in the upper right corner of the ClearML dashboard, then clicking ClearML Python Package setup and copying the credentials in step 1.
  7. Add a clearml.conf file
    • Either copy it from an existing setup, or use the skeleton provided in the silnlp repo under scripts/clearml_agent/clearml.conf and fill out the clearml credentials, git credentials, worker id, and worker name.
  8. Set environment variables in .bashrc
    export PYTHONPATH=
    export AWS_REGION="us-east-1"
    export AWS_ACCESS_KEY_ID=***your access key***
    export AWS_SECRET_ACCESS_KEY=***your secret key***
    export SIL_NLP_DATA_PATH="/silnlp"
    export TOKENIZERS_PARALLELISM=false
    
  9. Create a batch template file called slurm.clearml.template
    • You'll need to update the --account and --partition parameters for your use case in the example below

#!/bin/bash
# available template variables (default value separator ":")
# ${CLEARML_QUEUE_NAME}
# ${CLEARML_QUEUE_ID}
# ${CLEARML_WORKER_ID}
# complex template variables  (default value separator ":")
# ${CLEARML_TASK.id}
# ${CLEARML_TASK.name}
# ${CLEARML_TASK.project.id}
# ${CLEARML_TASK.hyperparams.properties.user_key.value}


# example
#SBATCH --job-name=clearml_task_${CLEARML_TASK.id}       # Job name DO NOT CHANGE
#SBATCH --output=task-${CLEARML_TASK.id}-%j.log
#SBATCH --account ***your account name***
#SBATCH --partition ***partition to use***
#SBATCH --time=${CLEARML_TASK.hyperparams.properties.time_limit.value:18:00:00}             # Time limit hrs:min:sec
#SBATCH --nodes=1


conda activate silnlp

${CLEARML_PRE_SETUP}

echo whoami $(whoami)

${CLEARML_AGENT_EXECUTE}

${CLEARML_POST_SETUP}

  10. Start the agent
    nohup clearml-agent-slurm --template-files slurm.clearml.template --queue ***queue_name***
    
  11. Press Ctrl + Z to suspend the process
  12. Move the process to the background
    bg
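    • With nohup, the agent's output is written to nohup.out in the current directory, and jobs -l should show the process running in the background:
      jobs -l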