Check the health of all pods at regular intervals (#140)

* Check the health of all pods at regular intervals * Update comments * update documents * address review
devsisters · Aug 12, 2024 · ac1b90b · ac1b90b
1 parent 7ec093d
commit ac1b90b
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 5 deletions.
diff --git a/manager/src/main/scala/com/devsisters/shardcake/ManagerConfig.scala b/manager/src/main/scala/com/devsisters/shardcake/ManagerConfig.scala
@@ -12,6 +12,7 @@ import zio._
  * @param persistRetryInterval retry interval for persistence of pods and shard assignments
  * @param persistRetryCount max retry count for persistence of pods and shard assignments
  * @param rebalanceRate max ratio of shards to rebalance at once
+ * @param podHealthCheckInterval interval for checking pod health
  */
 case class ManagerConfig(
   numberOfShards: Int,
@@ -21,7 +22,8 @@ case class ManagerConfig(
   pingTimeout: Duration,
   persistRetryInterval: Duration,
   persistRetryCount: Int,
-  rebalanceRate: Double
+  rebalanceRate: Double,
+  podHealthCheckInterval: Duration
 )
 
 object ManagerConfig {
@@ -34,6 +36,7 @@ object ManagerConfig {
       pingTimeout = 3 seconds,
       persistRetryInterval = 3 seconds,
       persistRetryCount = 100,
-      rebalanceRate = 2 / 100d
+      rebalanceRate = 2 / 100d,
+      podHealthCheckInterval = 1 minute
     )
 }
diff --git a/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala b/manager/src/main/scala/com/devsisters/shardcake/ShardManager.scala
@@ -46,14 +46,13 @@ class ShardManager(
 
   def notifyUnhealthyPod(podAddress: PodAddress): UIO[Unit] =
     ZIO
-      .whenZIO(stateRef.get.map(_.pods.contains(podAddress))) {
+      .whenZIODiscard(stateRef.get.map(_.pods.contains(podAddress))) {
         ManagerMetrics.podHealthChecked.tagged("pod_address", podAddress.toString).increment *>
           eventsHub.publish(ShardingEvent.PodHealthChecked(podAddress)) *>
           ZIO.unlessZIO(healthApi.isAlive(podAddress))(
             ZIO.logWarning(s"Pod $podAddress is not alive, unregistering") *> unregister(podAddress)
           )
       }
-      .unit
 
   def checkAllPodsHealth: UIO[Unit] =
     for {
@@ -267,6 +266,7 @@ object ShardManager {
                                           .repeat(Schedule.spaced(config.rebalanceInterval))
                                           .forkDaemon
         _                            <- shardManager.getShardingEvents.mapZIO(event => ZIO.logInfo(event.toString)).runDrain.forkDaemon
+        _                            <- shardManager.checkAllPodsHealth.repeat(Schedule.spaced(config.podHealthCheckInterval)).forkDaemon
         _                            <- ZIO.logInfo("Shard Manager loaded")
       } yield shardManager
     }

diff --git a/vuepress/docs/docs/config.md b/vuepress/docs/docs/config.md
@@ -49,10 +49,11 @@ Here's the list of thing you can configure on the Shard Manager side:
 - `persistRetryInterval`: retry interval for persistence of pods and shard assignments
 - `persistRetryCount`: max retry count for persistence of pods and shard assignments
 - `rebalanceRate`: max ratio of shards to rebalance in a single iteration
+- `podHealthCheckInterval`: interval for checking pod health
 ::: tip Rebalance Rate
 The rebalance rate is there to prevent too many shards being assigned immediately to new pods.
 Instead of reaching a perfect spread right away, we can use several iterations to make sure new pods are able to handle the new shards. 
 This is particularly useful if starting entities has a performance cost (e.g. loading state) and we don't want to start too many at once.
 
 When a pod is leaving, its shards need to be immediately rebalanced so the ratio is not used in that case.
-:::
+:::