From 5b460e774daccae5215663d6e3998be01f0e362b Mon Sep 17 00:00:00 2001
From: Pierre Ricadat
Date: Tue, 5 Nov 2024 10:32:42 +0900
Subject: [PATCH] Don't increment podHealthChecked metric on internal check (do it only for unresponsive pods) (#149)

* Don't increment podHealthChecked metric on internal check (do it only for unresponsive pods)

* Fix
---
 .../scala/com/devsisters/shardcake/ShardManager.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala b/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala
index 2cbbf50..e4e6201 100644
--- a/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala
+++ b/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala
@@ -48,10 +48,10 @@ class ShardManager(
         ZIO.fail(new RuntimeException(s"Pod $pod is not healthy, refusing to register"))
     )
 
-  def notifyUnhealthyPod(podAddress: PodAddress): UIO[Unit] =
+  def notifyUnhealthyPod(podAddress: PodAddress, ignoreMetric: Boolean = false): UIO[Unit] =
     ZIO
       .whenZIODiscard(stateRef.get.map(_.pods.contains(podAddress))) {
-        ManagerMetrics.podHealthChecked.tagged("pod_address", podAddress.toString).increment *>
+        ManagerMetrics.podHealthChecked.tagged("pod_address", podAddress.toString).increment.unless(ignoreMetric) *>
           eventsHub.publish(ShardingEvent.PodHealthChecked(podAddress)) *>
           ZIO.unlessZIO(healthApi.isAlive(podAddress))(
             ZIO.logWarning(s"Pod $podAddress is not alive, unregistering") *> unregister(podAddress)
@@ -61,7 +61,7 @@ class ShardManager(
   def checkAllPodsHealth: UIO[Unit] =
     for {
       pods <- stateRef.get.map(_.pods.keySet)
-      _ <- ZIO.foreachParDiscard(pods)(notifyUnhealthyPod).withParallelism(4)
+      _ <- ZIO.foreachParDiscard(pods)(notifyUnhealthyPod(_, ignoreMetric = true)).withParallelism(4)
     } yield ()
 
   def unregister(podAddress: PodAddress): UIO[Unit] =
@@ -154,7 +154,7 @@ class ShardManager(
             .map(_.flatten[PodAddress].toSet)
           failedPods = failedPingedPods ++ failedUnassignedPods ++ failedAssignedPods
           // check if failing pods are still up
-          _ <- ZIO.foreachDiscard(failedPods)(notifyUnhealthyPod).forkDaemon
+          _ <- ZIO.foreachDiscard(failedPods)(notifyUnhealthyPod(_)).forkDaemon
           _ <- ZIO.logWarning(s"Failed to rebalance pods: $failedPods").when(failedPods.nonEmpty)
           // retry rebalancing later if there was any failure
           _ <- (Clock.sleep(config.rebalanceRetryInterval) *> rebalance(rebalanceImmediately)).forkDaemon