Skip to content

Commit

Permalink
Check the health of all pods at regular intervals (#140)
Browse files Browse the repository at this point in the history
* Check the health of all pods at regular intervals

* Update comments

* update documents

* address review
  • Loading branch information
nox213 authored Aug 12, 2024
1 parent 7ec093d commit ac1b90b
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import zio._
* @param persistRetryInterval retry interval for persistence of pods and shard assignments
* @param persistRetryCount max retry count for persistence of pods and shard assignments
* @param rebalanceRate max ratio of shards to rebalance at once
* @param podHealthCheckInterval interval for checking pod health
*/
case class ManagerConfig(
numberOfShards: Int,
Expand All @@ -21,7 +22,8 @@ case class ManagerConfig(
pingTimeout: Duration,
persistRetryInterval: Duration,
persistRetryCount: Int,
rebalanceRate: Double
rebalanceRate: Double,
podHealthCheckInterval: Duration
)

object ManagerConfig {
Expand All @@ -34,6 +36,7 @@ object ManagerConfig {
pingTimeout = 3 seconds,
persistRetryInterval = 3 seconds,
persistRetryCount = 100,
rebalanceRate = 2 / 100d
rebalanceRate = 2 / 100d,
podHealthCheckInterval = 1 minute
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,13 @@ class ShardManager(

def notifyUnhealthyPod(podAddress: PodAddress): UIO[Unit] =
ZIO
.whenZIO(stateRef.get.map(_.pods.contains(podAddress))) {
.whenZIODiscard(stateRef.get.map(_.pods.contains(podAddress))) {
ManagerMetrics.podHealthChecked.tagged("pod_address", podAddress.toString).increment *>
eventsHub.publish(ShardingEvent.PodHealthChecked(podAddress)) *>
ZIO.unlessZIO(healthApi.isAlive(podAddress))(
ZIO.logWarning(s"Pod $podAddress is not alive, unregistering") *> unregister(podAddress)
)
}
.unit

def checkAllPodsHealth: UIO[Unit] =
for {
Expand Down Expand Up @@ -267,6 +266,7 @@ object ShardManager {
.repeat(Schedule.spaced(config.rebalanceInterval))
.forkDaemon
_ <- shardManager.getShardingEvents.mapZIO(event => ZIO.logInfo(event.toString)).runDrain.forkDaemon
_ <- shardManager.checkAllPodsHealth.repeat(Schedule.spaced(config.podHealthCheckInterval)).forkDaemon
_ <- ZIO.logInfo("Shard Manager loaded")
} yield shardManager
}
Expand Down
3 changes: 2 additions & 1 deletion vuepress/docs/docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ Here's the list of thing you can configure on the Shard Manager side:
- `persistRetryInterval`: retry interval for persistence of pods and shard assignments
- `persistRetryCount`: max retry count for persistence of pods and shard assignments
- `rebalanceRate`: max ratio of shards to rebalance in a single iteration
- `podHealthCheckInterval`: interval for checking pod health
::: tip Rebalance Rate
The rebalance rate is there to prevent too many shards being assigned immediately to new pods.
Instead of reaching a perfect spread right away, we can use several iterations to make sure new pods are able to handle the new shards.
This is particularly useful if starting entities has a performance cost (e.g. loading state) and we don't want to start too many at once.

When a pod is leaving, its shards need to be immediately rebalanced so the ratio is not used in that case.
:::
:::

0 comments on commit ac1b90b

Please sign in to comment.