Skip to content

Commit

Permalink
[DPE-4935] no allocation exclusions on restart (#391)
Browse files Browse the repository at this point in the history
## Issue
Currently, when an OpenSearch node is restarted by the operator, the
shards assigned to this node are first moved away from it and then
moved back once the restart has completed. For a simple restart, this
relocation is a lot of unnecessary work.

To avoid this behavior, no allocation exclusion should be added for the
departing node when a restart operation is performed.

## Solution
Check for the `restart` flag in `_stop_opensearch()` and pass it through
to `opensearch_exclusions.add_current()`. If the flag is set to `True`,
only the voting exclusion is added for the departing node; the
allocation exclusion is skipped.
  • Loading branch information
reneradoi authored Aug 13, 2024
1 parent 5e1f46e commit 088db63
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 9 deletions.
6 changes: 1 addition & 5 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1011,10 +1011,8 @@ def _stop_opensearch(self, *, restart=False) -> None:
# otherwise cluster manager election will be blocked when starting up again
# and re-using storage
if len(nodes) > 1:
# TODO: we should probably NOT have any exclusion on restart
# https://chat.canonical.com/canonical/pl/bgndmrfxr7fbpgmwpdk3hin93c
# 1. Add current node to the voting + alloc exclusions
self.opensearch_exclusions.add_current()
self.opensearch_exclusions.add_current(restart=restart)
except OpenSearchHttpError:
logger.debug("Failed to get online nodes, voting and alloc exclusions not added")

Expand All @@ -1027,8 +1025,6 @@ def _stop_opensearch(self, *, restart=False) -> None:
self.status.set(WaitingStatus(ServiceStopped))

# 3. Remove the exclusions
# TODO: we should probably NOT have any exclusion on restart
# https://chat.canonical.com/canonical/pl/bgndmrfxr7fbpgmwpdk3hin93c
if not restart:
try:
self.opensearch_exclusions.delete_current()
Expand Down
5 changes: 4 additions & 1 deletion lib/charms/opensearch/v0/opensearch_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Base class for the OpenSearch Health management."""
import logging
import time
from typing import Dict, Optional

from charms.opensearch.v0.constants_charm import (
Expand Down Expand Up @@ -130,9 +131,11 @@ def get( # noqa: C901

return status

@retry(stop=stop_after_attempt(90), wait=wait_fixed(10), reraise=True)
@retry(stop=stop_after_attempt(90), wait=wait_fixed(5), reraise=True)
def wait_for_shards_relocation(self) -> None:
"""Blocking function until the shards relocation completes in the cluster."""
time.sleep(5)

health = self.get(local_app_only=False)

if health == HealthColors.YELLOW_TEMP:
Expand Down
7 changes: 4 additions & 3 deletions lib/charms/opensearch/v0/opensearch_nodes_exclusions.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,14 @@ def __init__(self, charm):

self._scope = Scope.APP if self._charm.unit.is_leader() else Scope.UNIT

def add_current(self) -> None:
def add_current(self, restart: bool = False) -> None:
"""Add Voting and alloc exclusions."""
if (self._node.is_cm_eligible() or self._node.is_voting_only()) and not self._add_voting():
logger.error(f"Failed to add voting exclusion: {self._node.name}.")

if self._node.is_data() and not self._add_allocations():
logger.error(f"Failed to add shard allocation exclusion: {self._node.name}.")
if not restart:
if self._node.is_data() and not self._add_allocations():
logger.error(f"Failed to add shard allocation exclusion: {self._node.name}.")

def delete_current(self) -> None:
"""Delete Voting and alloc exclusions."""
Expand Down

0 comments on commit 088db63

Please sign in to comment.