Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Elasticsearch restarts always rolling #349

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions roles/elasticsearch/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
# handlers file for elasticsearch
# Handler: restart Elasticsearch by including restart_elasticsearch.yml
# rather than doing a plain service restart.
#
# A task can only carry one action and one `name`. The former
# `ansible.builtin.service` action (name/state/daemon_reload) conflicted with
# the `ansible.builtin.include_tasks` action, so only the include remains.
- name: Restart Elasticsearch
  ansible.builtin.include_tasks: restart_elasticsearch.yml
  # Iterate over every host of the Elasticsearch group; the first `when`
  # condition lets only the entry matching the current host run the include.
  with_items: "{{ groups[elasticstack_elasticsearch_group_name] }}"
  when:
    - "hostvars[item].inventory_hostname == inventory_hostname"
    - elasticsearch_enable | bool
    # Skip when either freshstart result reports a change.
    - not elasticsearch_freshstart.changed | bool
    - not elasticsearch_freshstart_security.changed | bool
Expand Down
12 changes: 12 additions & 0 deletions roles/elasticsearch/handlers/restart_elasticsearch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---

# Query the systemd unit without changing it and keep the result in
# `elasticsearch_running`; later tasks read `status.ActiveState` from it.
- name: Check for running Elasticsearch service
  register: elasticsearch_running
  ansible.builtin.systemd:
    name: elasticsearch

# Pull in the stop half of the rolling procedure from the role's tasks
# directory (absolute path built from `role_path`).
- name: Include rolling stop
  ansible.builtin.include_tasks:
    file: "{{ role_path }}/tasks/elasticsearch-rolling-stop.yml"

# Pull in the start half of the rolling procedure from the role's tasks
# directory (absolute path built from `role_path`).
- name: Include rolling start
  ansible.builtin.include_tasks:
    file: "{{ role_path }}/tasks/elasticsearch-rolling-start.yml"
83 changes: 83 additions & 0 deletions roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Ansible
#
# Rolling Upgrade of Elasticsearch with security on
# Source from: author: Jeff Steinmetz, @jeffsteinmetz; Bin Li, @holysoros
# Modifications: author: Daniel Neuberger @netways.de
# More modifications: NETWAYS Professional Services GmbH
# Last tested with Ansible 2.9 and later

---

# For now, upgrades are only supported for clusters with security enabled.
# If you need support for safely upgrading clusters without security,
# feel free to open an issue at https://github.com/NETWAYS/ansible-collection-elasticstack/issues

# Regular (safe) path: start the service and keep it enabled.
# Runs only if the previously registered `elasticsearch_running` reported the
# unit as "active" and the unsafe restart option is off.
- name: Start elasticsearch
  when: >-
    (elasticsearch_running.status.ActiveState == "active")
    and (not elasticsearch_unsafe_upgrade_restart | bool)
  ansible.builtin.service:
    name: elasticsearch
    state: started
    enabled: true

# Fast path: restart the service in one step and keep it enabled.
# Runs only if the previously registered `elasticsearch_running` reported the
# unit as "active" and the unsafe restart option is on.
- name: Restart elasticsearch (fast, for non-prod)
  when: >-
    (elasticsearch_running.status.ActiveState == "active")
    and (elasticsearch_unsafe_upgrade_restart | bool)
  ansible.builtin.service:
    name: elasticsearch
    state: restarted
    enabled: true

# Block until the HTTP port of the API host accepts connections.
# `delay: 30` waits 30 seconds before the first check.
- name: Wait for elasticsearch node to come back up if it was stopped
  ansible.builtin.wait_for:
    delay: 30
    port: "{{ elasticstack_elasticsearch_http_port }}"
    host: "{{ elasticsearch_api_host }}"

# Poll `_cat/nodes?h=name` with curl and grep for this node's name until it
# shows up (up to 200 tries, 3 seconds apart). curl runs with `-k` (no
# certificate verification), `-s` (silent) and `-m 2` (2 second timeout).
# `set -o pipefail` is only enabled when the running shell is bash.
#
# The credentials go through the `quote` filter so a password containing
# shell metacharacters cannot break or alter the command. `no_log` keeps the
# rendered command line, which contains the password, out of the task output.
- name: Confirm the node joins the cluster  # noqa: risky-shell-pipe
  ansible.builtin.shell: >
    if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
    curl
    -k
    -u {{ ('elastic:' ~ elasticstack_password.stdout) | quote }}
    -s
    -m 2
    '{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name'
    | grep
    -E
    '^{{ elasticsearch_nodename }}$'
  register: result
  until: result.rc == 0
  retries: 200
  delay: 3
  changed_when: false
  no_log: true

# Re-enable shard allocation by sending `null` for the persistent
# `cluster.routing.allocation.enable` setting.
- name: Enable shard allocation for the cluster
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    method: PUT
    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
    body_format: json
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails (for example the
  # node is not answering yet). Guard it so a failed attempt is retried
  # instead of aborting on an undefined variable.
  # `true` is the Jinja boolean literal, not a string.
  until: "response.json is defined and response.json.acknowledged == true"
  retries: 5
  delay: 30

# Poll `_cluster/health` until the reported status is yellow or green
# (up to 5 tries, 30 seconds apart).
- name: Wait for cluster health to return to yellow or green
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
    method: GET
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails; guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  until: "response.json is defined and response.json.status in ['yellow', 'green']"
  retries: 5
  delay: 30
76 changes: 76 additions & 0 deletions roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Ansible
#
# Rolling Upgrade of Elasticsearch with security on
# Source from: author: Jeff Steinmetz, @jeffsteinmetz; Bin Li, @holysoros
# Modifications: author: Daniel Neuberger @netways.de
# More modifications: NETWAYS Professional Services GmbH
# Last tested with Ansible 2.9 and later

---

# For now, upgrades are only supported for clusters with security enabled.
# If you need support for safely upgrading clusters without security,
# feel free to open an issue at https://github.com/NETWAYS/ansible-collection-elasticstack/issues
# Force the protocol used to build the API URLs in the following tasks.
- name: Set connection protocol to https
  ansible.builtin.set_fact:
    elasticsearch_http_protocol: https

# Normally this step is not needed. It exists to recover from a previously
# broken upgrade play; without it the cluster would never recover and the
# play would always fail.
- name: Enable shard allocation for the cluster
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    method: PUT
    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
    body_format: json
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails. Guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  # `true` is the Jinja boolean literal, not a string.
  until: "response.json is defined and response.json.acknowledged == true"
  retries: 5
  delay: 30

# This step is key: do not restart more nodes until all shards have
# completed recovery. Polls `_cluster/health` until the status is green
# (up to 50 tries, 30 seconds apart).
- name: Wait for cluster health to return to green
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
    method: GET
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails; guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  until: "response.json is defined and response.json.status == 'green'"
  retries: 50
  delay: 30

# Allocation was enabled earlier in this file only as a recovery measure for
# broken plays, so turning it off again here is intentional: set the
# persistent `cluster.routing.allocation.enable` setting to "none".
- name: Disable shard allocation for the cluster
  ansible.builtin.uri:
    method: PUT
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
    body_format: json
    body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'

# Best-effort POST to the `_flush` endpoint; `failed_when: false` means a
# failing request never fails the play.
- name: Stop non essential indexing to speed up shard recovery
  failed_when: false
  ansible.builtin.uri:
    method: POST
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_flush"
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false

# Stop the service (it stays enabled) unless the unsafe restart option is
# on, in which case this task is skipped.
- name: Shutdown elasticsearch service
  when: not elasticsearch_unsafe_upgrade_restart | bool
  ansible.builtin.service:
    name: elasticsearch
    state: stopped
    enabled: true
132 changes: 4 additions & 128 deletions roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,65 +71,8 @@
- groups[elasticstack_elasticsearch_group_name] | length > 1
block:

# Normally this step is not needed. It exists to recover from a previously
# broken upgrade play; without it the cluster would never recover and the
# play would always fail.
- name: Enable shard allocation for the cluster
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    method: PUT
    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
    body_format: json
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails. Guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  # `true` is the Jinja boolean literal, not a string.
  until: "response.json is defined and response.json.acknowledged == true"
  retries: 5
  delay: 30

# This step is key: do not restart more nodes until all shards have
# completed recovery. Polls `_cluster/health` until the status is green
# (up to 50 tries, 30 seconds apart).
- name: Wait for cluster health to return to green
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
    method: GET
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails; guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  until: "response.json is defined and response.json.status == 'green'"
  retries: 50
  delay: 30

# Allocation was enabled earlier in this block only as a recovery measure for
# broken plays, so turning it off again here is intentional: set the
# persistent `cluster.routing.allocation.enable` setting to "none".
- name: Disable shard allocation for the cluster
  ansible.builtin.uri:
    method: PUT
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
    body_format: json
    body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'

# Best-effort POST to the `_flush` endpoint; `failed_when: false` means a
# failing request never fails the play.
- name: Stop non essential indexing to speed up shard recovery
  failed_when: false
  ansible.builtin.uri:
    method: POST
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_flush"
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false

# Stop the service (it stays enabled) unless the unsafe restart option is
# on, in which case this task is skipped.
- name: Shutdown elasticsearch service
  when: not elasticsearch_unsafe_upgrade_restart | bool
  ansible.builtin.service:
    name: elasticsearch
    state: stopped
    enabled: true
# Run the shared stop sequence from the sibling task file.
- name: Include rolling stop
  ansible.builtin.include_tasks:
    file: elasticsearch-rolling-stop.yml

- name: Update Elasticsearch - rpm with managed repositories
ansible.builtin.package:
Expand All @@ -147,72 +90,5 @@
- ansible_os_family == "Debian" or
not elasticstack_full_stack | bool

# Regular (safe) path: start the service and keep it enabled.
# Runs only if the previously registered `elasticsearch_running` reported the
# unit as "active" and the unsafe restart option is off.
- name: Start elasticsearch
  when: >-
    (elasticsearch_running.status.ActiveState == "active")
    and (not elasticsearch_unsafe_upgrade_restart | bool)
  ansible.builtin.service:
    name: elasticsearch
    state: started
    enabled: true

# Fast path: restart the service in one step and keep it enabled.
# Runs only if the previously registered `elasticsearch_running` reported the
# unit as "active" and the unsafe restart option is on.
- name: Restart elasticsearch (fast, for non-prod)
  when: >-
    (elasticsearch_running.status.ActiveState == "active")
    and (elasticsearch_unsafe_upgrade_restart | bool)
  ansible.builtin.service:
    name: elasticsearch
    state: restarted
    enabled: true

# Block until the HTTP port of the API host accepts connections.
# `delay: 30` waits 30 seconds before the first check.
- name: Wait for elasticsearch node to come back up if it was stopped
  ansible.builtin.wait_for:
    delay: 30
    port: "{{ elasticstack_elasticsearch_http_port }}"
    host: "{{ elasticsearch_api_host }}"

# Poll `_cat/nodes?h=name` with curl and grep for this node's name until it
# shows up (up to 200 tries, 3 seconds apart). curl runs with `-k` (no
# certificate verification), `-s` (silent) and `-m 2` (2 second timeout).
# `set -o pipefail` is only enabled when the running shell is bash.
#
# The credentials go through the `quote` filter so a password containing
# shell metacharacters cannot break or alter the command. `no_log` keeps the
# rendered command line, which contains the password, out of the task output.
- name: Confirm the node joins the cluster  # noqa: risky-shell-pipe
  ansible.builtin.shell: >
    if test -n "$(ps -p $$ | grep bash)"; then set -o pipefail; fi;
    curl
    -k
    -u {{ ('elastic:' ~ elasticstack_password.stdout) | quote }}
    -s
    -m 2
    '{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cat/nodes?h=name'
    | grep
    -E
    '^{{ elasticsearch_nodename }}$'
  register: result
  until: result.rc == 0
  retries: 200
  delay: 3
  changed_when: false
  no_log: true

# Re-enable shard allocation by sending `null` for the persistent
# `cluster.routing.allocation.enable` setting.
- name: Enable shard allocation for the cluster
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
    method: PUT
    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
    body_format: json
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails (for example the
  # node is not answering yet). Guard it so a failed attempt is retried
  # instead of aborting on an undefined variable.
  # `true` is the Jinja boolean literal, not a string.
  until: "response.json is defined and response.json.acknowledged == true"
  retries: 5
  delay: 30

# Poll `_cluster/health` until the reported status is yellow or green
# (up to 5 tries, 30 seconds apart).
- name: Wait for cluster health to return to yellow or green
  ansible.builtin.uri:
    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
    method: GET
    user: elastic
    password: "{{ elasticstack_password.stdout }}"
    validate_certs: false
  register: response
  # `response.json` is missing when the request itself fails; guard it so a
  # failed attempt is retried instead of aborting on an undefined variable.
  until: "response.json is defined and response.json.status in ['yellow', 'green']"
  retries: 5
  delay: 30
# Run the shared start sequence from the sibling task file.
- name: Include rolling start
  ansible.builtin.include_tasks:
    file: elasticsearch-rolling-start.yml