diff --git a/README.md b/README.md index 46e3b65..ed928ad 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,18 @@ Slurm ===== Install and configure a Slurm cluster on RHEL/CentOS or Debian/Ubuntu servers +To configure a custom Debian repository, define `slurm_configure_repos: true`. + +Then define the APT repository line and the URL of its GPG key. + + # Example apt repository + slurm_apt_repository: "deb [trusted=yes] http://127.0.0.1/ubuntu/22.04/amd64/ ./" + # Example GPG key + slurm_gpg_key: 'http://127.0.0.1/ubuntu/22.04/amd64/GPG-KEY-slurm' + +Define `slurm_apt_priority` to pin the priority of the repository (APT only). This is optional. + + slurm_apt_priority: 900 Role Variables -------------- @@ -23,9 +35,10 @@ Partitions and nodes go in `slurm_partitions` and `slurm_nodes`, lists of hashes of that partition or node. Options for the additional configuration files [acct_gather.conf](https://slurm.schedmd.com/acct_gather.conf.html), -[cgroup.conf](https://slurm.schedmd.com/cgroup.conf.html) and [gres.conf](https://slurm.schedmd.com/gres.conf.html) -may be specified in the `slurm_acct_gather_config`, `slurm_cgroup_config` (both of them hashes) and -`slurm_gres_config` (list of hashes) respectively. +[cgroup.conf](https://slurm.schedmd.com/cgroup.conf.html), [gres.conf](https://slurm.schedmd.com/gres.conf.html) +and [job_container.conf](https://slurm.schedmd.com/job_container.conf.html) may be specified in the +`slurm_acct_gather_config`, `slurm_cgroup_config` (both of them hashes), `slurm_gres_config` (list of hashes) and +`slurm_job_container_config` (a hash) respectively. Set `slurm_upgrade` to true to upgrade the installed Slurm packages.
@@ -88,6 +101,8 @@ More extensive example:
       SelectType: "select/cons_res"
       SelectTypeParameters: "CR_Core"
       SlurmctldHost: "slurmctl"
+      # Use a list to configure primary and backup slurmctld hosts
+      # SlurmctldHost: ['slurmctl1', 'slurmctl2']
       SlurmctldLogFile: "/var/log/slurm/slurmctld.log"
       SlurmctldPidFile: "/var/run/slurmctld.pid"
       SlurmdLogFile: "/var/log/slurm/slurmd.log"
diff --git a/defaults/main.yml b/defaults/main.yml
index 71e593b..3996598 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -20,9 +20,16 @@ slurmdbd_service_name: slurmdbd
 #Cluster name for slurm config. This is required to correctly setup slurmdbd and attune it to the slurm config.
 __slurm_cluster_name: cluster
 __cluster_not_setup: true #Default value. Is modified if cluster already exists.
+slurm_setup_cluster: false
 
 slurm_start_services: true
 
+
+# Install Slurm from a custom Debian (APT) repository
+slurm_configure_repos: false
+# Install the systemd unit files shipped with this role
+slurm_configure_systemd: false
+
 __slurm_user_name: "{{ (slurm_user | default({})).name | default('slurm') }}"
 # TODO: this could be incorrect, use the group collection from galaxyproject.galaxy
 __slurm_group_name: "{{ (slurm_user | default({})).group | default(omit) }}"
@@ -91,6 +98,5 @@ __slurmdbd_config_default:
   AuthType: auth/munge
   DbdPort: 6819
   SlurmUser: "{{ __slurm_user_name }}"
-  SlurmctldPidFile: "{{ __slurm_run_dir ~ '/slurmdbd.pid' if __slurm_debian else omit }}"
   LogFile: "{{ __slurm_log_dir ~ '/slurmdbd.log' if __slurm_debian else omit }}"
 __slurmdbd_config_merged: "{{ __slurmdbd_config_default | combine(slurmdbd_config | default({})) }}"
diff --git a/files/slurmctld.service b/files/slurmctld.service
new file mode 100644
index 0000000..94381fc
--- /dev/null
+++ b/files/slurmctld.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Slurm controller daemon
+After=network.target slurmdbd.service munge.service
+ConditionPathExists=/etc/slurm/slurm.conf
+
+[Service]
+Type=simple
+EnvironmentFile=-/etc/sysconfig/slurmctld
+ExecStart=/opt/slurm/sbin/slurmctld -D $SLURMCTLD_OPTIONS
+ExecReload=/bin/kill -HUP $MAINPID
+PIDFile=/var/run/slurm/slurmctld.pid
+RuntimeDirectory=slurm
+
+[Install]
+WantedBy=multi-user.target
diff --git a/files/slurmd.service b/files/slurmd.service
new file mode 100644
index 0000000..99238a9
--- /dev/null
+++ b/files/slurmd.service
@@ -0,0 +1,23 @@
+[Unit]
+Description=Slurm node daemon
+After=network.target munge.service
+Wants=network-online.target
+ConditionPathExists=/etc/slurm/slurm.conf
+
+[Service]
+Type=simple
+EnvironmentFile=-/etc/default/slurmd
+ExecStartPre=/bin/mkdir -p /var/run/slurm
+ExecStart=/opt/slurm/sbin/slurmd -D -s $SLURMD_OPTIONS
+ExecReload=/bin/kill -HUP $MAINPID
+PIDFile=/var/run/slurm/slurmd.pid
+KillMode=process
+LimitNOFILE=131072
+LimitMEMLOCK=infinity
+LimitSTACK=infinity
+Delegate=yes
+TasksMax=20000
+
+
+[Install]
+WantedBy=multi-user.target
diff --git a/files/slurmdbd.service b/files/slurmdbd.service
new file mode 100644
index 0000000..d3e82cf
--- /dev/null
+++ b/files/slurmdbd.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Slurm DBD accounting daemon
+After=network.target munge.service mysql.service mysqld.service mariadb.service
+Wants=network-online.target
+ConditionPathExists=/etc/slurm/slurmdbd.conf
+
+[Service]
+Type=simple
+EnvironmentFile=-/etc/default/slurmdbd
+ExecStartPre=-/usr/bin/ls /var/lib/slurm/slurmctld
+ExecStart=/opt/slurm/sbin/slurmdbd -D -s $SLURMDBD_OPTIONS
+ExecReload=/bin/kill -HUP $MAINPID
+PIDFile=/var/run/slurm/slurmdbd.pid
+LimitNOFILE=65536
+RuntimeDirectory=slurm
+
+[Install]
+WantedBy=multi-user.target
diff --git a/handlers/main.yml b/handlers/main.yml
index 805a33c..5fa944a 100644
--- a/handlers/main.yml
+++ b/handlers/main.yml
@@ -4,32 +4,57 @@
     name: munge
     state: restarted
 
+- name: Restart slurmdbd
+  ansible.builtin.systemd:
+    name: "{{ slurmdbd_service_name }}"
+    state: restarted
+    masked: false
+    enabled: true
+    daemon_reload: true
+  when: "(slurm_start_services | bool) and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
+  register: slurmdbd_restart
+
 - name: Reload slurmdbd
   ansible.builtin.service:
     name: "{{ slurmdbd_service_name }}"
     state: reloaded
-  when: "slurm_start_services and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
+  when:
+    - slurm_start_services | bool
+    - ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)
+    - slurmdbd_restart is not defined
+
+- name: Restart slurmctld
+  ansible.builtin.systemd:
+    name: "{{ slurmctld_service_name }}"
+    state: restarted
+    masked: false
+    enabled: true
+    daemon_reload: true
+  when: "(slurm_start_services | bool) and ('slurmservers' in group_names or 'controller' in slurm_roles)"
+  register: slurmctld_restart
 
 - name: Reload slurmctld
   ansible.builtin.service:
     name: "{{ slurmctld_service_name }}"
     state: reloaded
-  when: "slurm_start_services and ('slurmservers' in group_names or 'controller' in slurm_roles)"
+  when:
+    - slurm_start_services | bool
+    - ('slurmservers' in group_names or 'controller' in slurm_roles)
+    - slurmctld_restart is not defined
 
-- name: Restart slurmctld
-  ansible.builtin.service:
-    name: "{{ slurmctld_service_name }}"
+- name: Restart slurmd
+  ansible.builtin.systemd:
+    name: "{{ slurmd_service_name }}"
     state: restarted
-  when: "slurm_start_services and ('slurmservers' in group_names or 'controller' in slurm_roles)"
+    daemon_reload: true
+  when: "(slurm_start_services | bool) and ('slurmexechosts' in group_names or 'exec' in slurm_roles)"
+  register: slurmd_restart
 
 - name: Reload slurmd
   ansible.builtin.service:
     name: "{{ slurmd_service_name }}"
     state: reloaded
-  when: "slurm_start_services and ('slurmexechosts' in group_names or 'exec' in slurm_roles)"
-
-- name: Restart slurmd
-  ansible.builtin.service:
-    name: "{{ slurmd_service_name }}"
-    state: restarted
-  when: "slurm_start_services and ('slurmexechosts' in group_names or 'exec' in slurm_roles)"
+  when:
+    - slurm_start_services | bool
+    - ('slurmexechosts' in group_names or 'exec' in slurm_roles)
+    - slurmd_restart is not defined
diff --git a/meta/main.yml
b/meta/main.yml
index 15efb41..3b73630 100644
--- a/meta/main.yml
+++ b/meta/main.yml
@@ -1,11 +1,11 @@
 galaxy_info:
   role_name: slurm
-  namespace: galaxyproject
+  namespace: mila
   author: The Galaxy Project
   description: Install and manage the Slurm Workload Manager
-  company: The Galaxy Project
+  company: Mila
   license: MIT
-  min_ansible_version: 2.5
+  min_ansible_version: '2.14'
   github_branch: main
   platforms:
     - name: EL
diff --git a/tasks/_inc_extra_configs.yml b/tasks/_inc_extra_configs.yml
index 9a3dded..e436c70 100644
--- a/tasks/_inc_extra_configs.yml
+++ b/tasks/_inc_extra_configs.yml
@@ -16,6 +16,9 @@
     - name: gres.conf
       config: slurm_gres_config
       template: gres.conf.j2
+    - name: job_container.conf
+      config: slurm_job_container_config
+      template: generic.conf.j2
   loop_control:
     label: "{{ item.name }}"
   when: item.config in vars
diff --git a/tasks/common.yml b/tasks/common.yml
index 621d7f0..87d4357 100644
--- a/tasks/common.yml
+++ b/tasks/common.yml
@@ -16,6 +16,25 @@
     mode: 0644
   when: slurm_rotate_logs
 
+- name: Install plugstack.conf
+  ansible.builtin.template:
+    src: "plugstack.conf.j2"
+    dest: "{{ slurm_config_dir }}/plugstack.conf"
+    owner: root
+    group: root
+    mode: "0444"
+  notify:
+    - Restart slurmd
+    - Restart slurmctld
+
+- name: Check that slurm plugin dir exists
+  ansible.builtin.file:
+    path: "{{ slurm_config_dir }}/plugstack.conf.d/"
+    state: directory
+  notify:
+    - Restart slurmd
+    - Restart slurmctld
+
 - name: Install slurm.conf
   ansible.builtin.template:
     src: "slurm.conf.j2"
diff --git a/tasks/main.yml b/tasks/main.yml
index 977d7b3..04faa6d 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -3,6 +3,10 @@
 - name: Include user creation tasks
   ansible.builtin.include_tasks: user.yml
   when: slurm_create_user
+
+- name: Include Configure custom repositories
+  ansible.builtin.include_tasks: repositories-Debian.yml
+  when: (slurm_configure_repos | bool) and (__slurm_debian | bool)
 
 - name: Include controller installation tasks
   ansible.builtin.include_tasks: slurmctld.yml
@@ -24,22 +28,22 @@
     name: "{{ slurmdbd_service_name }}"
     enabled: true
     state: started
-  when: "slurm_start_services and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
+  when: "(slurm_start_services | bool) and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
 
 - name: Ensure slurmctld is enabled and running
   ansible.builtin.service:
     name: "{{ slurmctld_service_name }}"
     enabled: true
     state: started
-  when: "slurm_start_services and ('slurmservers' in group_names or 'controller' in slurm_roles)"
+  when: "(slurm_start_services | bool) and ('slurmservers' in group_names or 'controller' in slurm_roles)"
 
 - name: Ensure slurmd is enabled and running
   ansible.builtin.service:
     name: "{{ slurmd_service_name }}"
     enabled: true
     state: started
-  when: "slurm_start_services and ('slurmexechosts' in group_names or 'exec' in slurm_roles)"
+  when: "(slurm_start_services | bool) and ('slurmexechosts' in group_names or 'exec' in slurm_roles)"
 
 - name: Setup cluster on slurmdb
   include_tasks: slurmdbd_cluster.yml
-  when: "slurm_start_services and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
+  when: "(slurm_setup_cluster | bool) and (slurm_start_services | bool) and ('slurmdbdservers' in group_names or 'dbd' in slurm_roles)"
diff --git a/tasks/repositories-Debian.yml b/tasks/repositories-Debian.yml
new file mode 100644
index 0000000..87ced47
--- /dev/null
+++ b/tasks/repositories-Debian.yml
@@ -0,0 +1,26 @@
+---
+- name: Install GPG-KEY
+  ansible.builtin.apt_key:
+    url: "{{ slurm_gpg_key }}"
+    keyring: /etc/apt/trusted.gpg.d/slurm.gpg
+  when: slurm_gpg_key is defined
+
+- name: Configure Slurm repository
+  ansible.builtin.copy:
+    content: "{{ slurm_apt_repository }}\n"
+    dest: /etc/apt/sources.list.d/slurm.list
+    mode: "0644"
+
+- name: Configure APT preferences for Slurm repository
+  ansible.builtin.copy:
+    content: |
+      Package: *
+      Pin: release o=SLURM
+      Pin-Priority: {{ slurm_apt_priority }}
+    dest: /etc/apt/preferences.d/priority-slurm
+    mode: "0644"
+  when: slurm_apt_priority is defined
+
+- name: "Update repository cache"
+  ansible.builtin.apt:
+    update_cache: true
diff --git a/tasks/slurmctld.yml b/tasks/slurmctld.yml
index b15152c..d4d462c 100644
--- a/tasks/slurmctld.yml
+++ b/tasks/slurmctld.yml
@@ -4,6 +4,8 @@
   ansible.builtin.package:
     name: "{{ __slurm_packages.slurmctld }}"
     state: "{{ 'latest' if slurm_upgrade else 'present' }}"
+  notify:
+    - Restart slurmctld
 
 - name: Create slurm state directory
   ansible.builtin.file:
@@ -25,6 +27,17 @@
     state: directory
   when: slurm_create_dirs and __slurm_config_merged.SlurmctldLogFile != omit
 
+- name: Add slurmctld service
+  ansible.builtin.copy:
+    src: slurmctld.service
+    dest: /etc/systemd/system/slurmctld.service
+    owner: root
+    group: root
+    mode: "0644"
+  when: slurm_configure_systemd
+  notify:
+    - Restart slurmctld
+
 - name: Include config dir creation tasks
   ansible.builtin.include_tasks: _inc_create_config_dir.yml
   when: slurm_create_dirs
diff --git a/tasks/slurmd.yml b/tasks/slurmd.yml
index 1ed0899..298d55d 100644
--- a/tasks/slurmd.yml
+++ b/tasks/slurmd.yml
@@ -4,6 +4,8 @@
   ansible.builtin.package:
     name: "{{ __slurm_packages.slurmd }}"
     state: "{{ 'latest' if slurm_upgrade else 'present' }}"
+  notify:
+    - Restart slurmd
 
 - name: Create slurm spool directory
   ansible.builtin.file:
@@ -25,6 +27,17 @@
     state: directory
   when: slurm_create_dirs and __slurm_config_merged.SlurmdLogFile != omit
 
+- name: Add slurmd service
+  ansible.builtin.copy:
+    src: slurmd.service
+    dest: /etc/systemd/system/slurmd.service
+    owner: root
+    group: root
+    mode: "0644"
+  when: slurm_configure_systemd
+  notify:
+    - Restart slurmd
+
 - name: Include config dir creation tasks
   ansible.builtin.include_tasks: _inc_create_config_dir.yml
   when: slurm_create_dirs
diff --git a/tasks/slurmdbd.yml b/tasks/slurmdbd.yml
index 138349a..d9061da 100644
--- a/tasks/slurmdbd.yml
+++ b/tasks/slurmdbd.yml
@@ -4,6 +4,8 @@
   ansible.builtin.package:
     name: "{{ __slurm_packages.slurmdbd }}"
     state: "{{ 'latest' if slurm_upgrade else 'present' }}"
+  notify:
+    - Restart slurmdbd
 
 - name: Create slurm log directory
   ansible.builtin.file:
@@ -17,6 +19,17 @@
 - name: Include config dir creation tasks
   ansible.builtin.include_tasks: _inc_create_config_dir.yml
   when: slurm_create_dirs
+
+- name: Add slurmdbd service
+  ansible.builtin.copy:
+    src: slurmdbd.service
+    dest: /etc/systemd/system/slurmdbd.service
+    owner: root
+    group: root
+    mode: "0644"
+  when: slurm_configure_systemd
+  notify:
+    - Restart slurmdbd
 
 - name: Install slurmdbd.conf
   ansible.builtin.template:
diff --git a/tasks/slurmdbd_cluster.yml b/tasks/slurmdbd_cluster.yml
index 08032a2..66dda91 100644
--- a/tasks/slurmdbd_cluster.yml
+++ b/tasks/slurmdbd_cluster.yml
@@ -16,5 +16,5 @@
   become: yes
   become_user: root
   notify:
-    - reload slurmdbd
+    - Reload slurmdbd
   when: __cluster_not_setup
diff --git a/templates/logrotate.j2 b/templates/logrotate.j2
index ceb7a4d..659af19 100644
--- a/templates/logrotate.j2
+++ b/templates/logrotate.j2
@@ -2,7 +2,7 @@
 # Slurm Logrotate Configuration
 ##
 # TODO: this ignores the actual *LogFile values
-{{ '/var/log/slurm-llnl' if __slurm_debian else '/var/log/slurm' }}/*.log {
+{{ '/var/log/slurm-llnl' if (not slurm_configure_repos and __slurm_debian) else '/var/log/slurm' }}/*.log {
     compress
     missingok
     nocopytruncate
diff --git a/templates/plugstack.conf.j2 b/templates/plugstack.conf.j2
new file mode 100644
index 0000000..391ccf5
--- /dev/null
+++ b/templates/plugstack.conf.j2
@@ -0,0 +1,5 @@
+##
+## This file is maintained by Ansible - ALL MODIFICATIONS WILL BE REVERTED
+##
+
+include {{ slurm_config_dir }}/plugstack.conf.d/*.conf
diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2
index e1bc114..f0761cc 100644
--- a/templates/slurm.conf.j2
+++ b/templates/slurm.conf.j2
@@ -11,8 +11,14 @@ ControlMachine=localhost
 {% for key in __slurm_config_merged | sort %}
 {% set val = __slurm_config_merged[key] %}
 {% if val is not none and val != omit %}
+{% if key == 'SlurmctldHost' and val is iterable and val is not string and val is not mapping %}
+{% for slurmctldhost in val %}
+SlurmctldHost={{ slurmctldhost }}
+{% endfor %}
+{% else %}
 {{ key }}={{ 'YES' if val is sameas true else ('NO' if val is sameas false else val) }}
 {% endif %}
+{% endif %}
 {% endfor %}
 
 # Nodes