diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index b51fd5b7f..25a4c27dc 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -1022,7 +1022,8 @@ def _stop_opensearch(self, *, restart=False) -> None: except OpenSearchHttpError: logger.debug("Failed to get online nodes, voting and alloc exclusions not added") - # TODO: should block until all shards move addressed in PR DPE-2234 + # block until all primary shards are moved away from the unit that is stopping + self.health.wait_for_shards_relocation() # 2. stop the service self.opensearch.stop() diff --git a/lib/charms/opensearch/v0/opensearch_health.py b/lib/charms/opensearch/v0/opensearch_health.py index ecb4d2980..4175866e2 100644 --- a/lib/charms/opensearch/v0/opensearch_health.py +++ b/lib/charms/opensearch/v0/opensearch_health.py @@ -112,34 +112,34 @@ def get( # noqa: C901 logger.error(e) # means the status was reported as an int (i.e: 503) return HealthColors.UNKNOWN - if status != HealthColors.YELLOW: - return status - - try: - logger.debug( - f"\n\nHealth: {status} -- Shards: {ClusterState.shards(self._opensearch, host, verbose=True)}\n\n" - ) - logger.debug( - f"Allocation explanations: {ClusterState.allocation_explain(self._opensearch, host)}\n\n" - ) - except OpenSearchHttpError: - pass - # we differentiate between a temp yellow (moving shards) and a permanent # one (such as: missing replicas) - if response["initializing_shards"] > 0 or response["relocating_shards"] > 0: + if status in [HealthColors.GREEN, HealthColors.YELLOW] and ( + response["initializing_shards"] > 0 or response["relocating_shards"] > 0 + ): + try: + logger.debug( + f"\n\nHealth: {status} -- Shards: {ClusterState.shards(self._opensearch, host, verbose=True)}\n\n" + ) + logger.debug( + f"Allocation explanations: {ClusterState.allocation_explain(self._opensearch, host)}\n\n" + ) + except OpenSearchHttpError: + pass return HealthColors.YELLOW_TEMP - return HealthColors.YELLOW - @retry(stop=stop_after_attempt(15), wait=wait_fixed(5), reraise=True) + return status + + @retry(stop=stop_after_attempt(90), wait=wait_fixed(10), reraise=True) def wait_for_shards_relocation(self) -> None: """Blocking function until the shards relocation completes in the cluster.""" - if self.get(wait_for_green_first=True) != HealthColors.YELLOW_TEMP: - return + health = self.get(local_app_only=False) - # we throw an error because various operations should NOT start while data - # is being relocated. Examples are: simple stop, unit removal, upgrade - raise OpenSearchHAError("Shards haven't completed relocating.") + if health == HealthColors.YELLOW_TEMP: + logger.info("Shards still moving before stopping Opensearch.") + # we throw an error because various operations should NOT start while data + # is being relocated. Examples are: simple stop, unit removal, upgrade + raise OpenSearchHAError("Shards haven't completed relocating.") def _apply_for_app(self, status: str) -> None: """Cluster wide / app status."""