Skip to content

Commit

Permalink
Add PENDING type to healthchecks
Browse files Browse the repository at this point in the history
  • Loading branch information
andythsu committed Sep 20, 2024
1 parent a9dad62 commit 2a00b88
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public record ClusterStats(
int runningQueryCount,
int queuedQueryCount,
int numWorkerNodes,
boolean healthy,
TrinoHealthStateType healthState,
String proxyTo,
String externalUrl,
String routingGroup,
Expand All @@ -41,7 +41,7 @@ public static final class Builder
private int runningQueryCount;
private int queuedQueryCount;
private int numWorkerNodes;
private boolean healthy;
private TrinoHealthStateType healthState;
private String proxyTo;
private String externalUrl;
private String routingGroup;
Expand Down Expand Up @@ -70,9 +70,9 @@ public Builder numWorkerNodes(int numWorkerNodes)
return this;
}

public Builder healthy(boolean healthy)
public Builder healthy(TrinoHealthStateType healthState)
{
this.healthy = healthy;
this.healthState = healthState;
return this;
}

Expand Down Expand Up @@ -107,7 +107,7 @@ public ClusterStats build()
runningQueryCount,
queuedQueryCount,
numWorkerNodes,
healthy,
healthState,
proxyTo,
externalUrl,
routingGroup,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
.numWorkerNodes(activeWorkers)
.queuedQueryCount((int) result.get("queuedQueries"))
.runningQueryCount((int) result.get("runningQueries"))
.healthy(activeWorkers > 0)
.healthy(activeWorkers > 0 ? TrinoHealthStateType.HEALTHY : TrinoHealthStateType.UNHEALTHY)
.proxyTo(backend.getProxyTo())
.externalUrl(backend.getExternalUrl())
.routingGroup(backend.getRoutingGroup());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,19 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
.routingGroup(backend.getRoutingGroup()).build();
}

private boolean isReadyStatus(String baseUrl)
private TrinoHealthStateType isReadyStatus(String baseUrl)
{
return isReadyStatus(baseUrl, retries);
}

private boolean isReadyStatus(String baseUrl, int retriesRemaining)
private TrinoHealthStateType isReadyStatus(String baseUrl, int retriesRemaining)
{
Request request = prepareGet()
.setUri(uriBuilderFrom(URI.create(baseUrl)).appendPath("/v1/info").build())
.build();
try {
ServerInfo serverInfo = client.execute(request, SERVER_INFO_JSON_RESPONSE_HANDLER);
return !serverInfo.isStarting();
return serverInfo.isStarting() ? TrinoHealthStateType.PENDING : TrinoHealthStateType.HEALTHY;
}
catch (UnexpectedResponseException e) {
if (shouldRetry(e.getStatusCode())) {
Expand All @@ -86,7 +86,7 @@ private boolean isReadyStatus(String baseUrl, int retriesRemaining)
catch (Exception e) {
log.error(e, "Exception checking %s for health", request.getUri());
}
return false;
return TrinoHealthStateType.UNHEALTHY;
}

public static boolean shouldRetry(int statusCode)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ public ClusterStats monitor(ProxyBackendConfiguration backend)
partialState.put(rs.getString("state"), rs.getInt("count"));
}
return clusterStats
.healthy(true)
// at this point we can mark the cluster as healthy because otherwise
// it wouldn't have gotten worker stats
.healthy(TrinoHealthStateType.HEALTHY)
.queuedQueryCount(partialState.getOrDefault("QUEUED", 0))
.runningQueryCount(partialState.getOrDefault("RUNNING", 0))
.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public HealthChecker(Notifier notifier)
public void observe(List<ClusterStats> clustersStats)
{
for (ClusterStats clusterStats : clustersStats) {
if (!clusterStats.healthy()) {
if (clusterStats.healthState() == TrinoHealthStateType.UNHEALTHY) {
notifyUnhealthyCluster(clusterStats);
}
else {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.gateway.ha.clustermonitor;

/**
 * Health state of a Trino cluster as reported by health checks.
 *
 * <p>{@code PENDING} exists for UI/observability purposes; functionally it is
 * unhealthy. It should be used while a Trino cluster is still spinning up.
 * {@code HEALTHY} means health checks report the cluster as up, and
 * {@code UNHEALTHY} means health checks report the cluster as down.
 */
public enum TrinoHealthStateType
{
// Cluster is still spinning up; functionally unhealthy, kept distinct for UI/observability.
PENDING,
// Health checks report the cluster as up.
HEALTHY,
// Health checks report the cluster as down.
UNHEALTHY
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
import com.google.common.collect.ImmutableList;
import com.google.inject.Inject;
import io.airlift.log.Logger;
import io.trino.gateway.ha.clustermonitor.ClusterStats;
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
import io.trino.gateway.ha.router.BackendStateManager;
import io.trino.gateway.ha.router.GatewayBackendManager;
import io.trino.gateway.ha.router.ResourceGroupsManager;
import io.trino.gateway.ha.router.RoutingManager;
Expand Down Expand Up @@ -52,13 +55,19 @@ public class EntityEditorResource
private final GatewayBackendManager gatewayBackendManager;
private final ResourceGroupsManager resourceGroupsManager;
private final RoutingManager routingManager;
private final BackendStateManager backendStateManager;

@Inject
public EntityEditorResource(GatewayBackendManager gatewayBackendManager, ResourceGroupsManager resourceGroupsManager, RoutingManager routingManager)
public EntityEditorResource(
GatewayBackendManager gatewayBackendManager,
ResourceGroupsManager resourceGroupsManager,
RoutingManager routingManager,
BackendStateManager backendStateManager)
{
this.gatewayBackendManager = requireNonNull(gatewayBackendManager, "gatewayBackendManager is null");
this.resourceGroupsManager = requireNonNull(resourceGroupsManager, "resourceGroupsManager is null");
this.routingManager = requireNonNull(routingManager, "routingManager is null");
this.backendStateManager = requireNonNull(backendStateManager, "backendStateManager is null");
}

@GET
Expand Down Expand Up @@ -87,7 +96,12 @@ public Response updateEntity(
OBJECT_MAPPER.readValue(jsonPayload, ProxyBackendConfiguration.class);
gatewayBackendManager.updateBackend(backend);
log.info("Turning cluster %s %s", backend.getName(), backend.isActive() ? "on" : "off");
routingManager.updateBackEndHealth(backend.getName(), backend.isActive());
routingManager.updateBackEndHealth(backend.getName(), backend.isActive() ? TrinoHealthStateType.PENDING : TrinoHealthStateType.UNHEALTHY);
backendStateManager.updateStates(
backend.getName(),
ClusterStats.builder(backend.getName())
.healthy(backend.isActive() ? TrinoHealthStateType.PENDING : TrinoHealthStateType.UNHEALTHY)
.build());
break;
case RESOURCE_GROUP:
ResourceGroupsDetail resourceGroupDetails = OBJECT_MAPPER.readValue(jsonPayload,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.google.common.collect.ImmutableList;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import io.trino.gateway.ha.clustermonitor.ClusterStats;
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -46,7 +47,7 @@ static class LocalStats
{
private int runningQueryCount;
private int queuedQueryCount;
private boolean healthy;
private TrinoHealthStateType healthState;
private String proxyTo;
private String routingGroup;
private String clusterId;
Expand All @@ -57,7 +58,7 @@ static class LocalStats
clusterId = stats.clusterId();
runningQueryCount = stats.runningQueryCount();
queuedQueryCount = stats.queuedQueryCount();
healthy = stats.healthy();
healthState = stats.healthState();
proxyTo = stats.proxyTo();
routingGroup = stats.routingGroup();
if (stats.userQueuedCount() != null) {
Expand Down Expand Up @@ -93,14 +94,14 @@ public void queuedQueryCount(int queuedQueryCount)
this.queuedQueryCount = queuedQueryCount;
}

public boolean healthy()
public TrinoHealthStateType healthState()
{
return this.healthy;
return this.healthState;
}

public void healthy(boolean healthy)
public void healthState(TrinoHealthStateType healthState)
{
this.healthy = healthy;
this.healthState = healthState;
}

public String proxyTo()
Expand Down Expand Up @@ -187,7 +188,7 @@ private synchronized Optional<LocalStats> getClusterToRoute(String user, String
{
log.debug("sorting cluster stats for {} {}", user, routingGroup);
List<LocalStats> filteredList = clusterStats.stream()
.filter(stats -> stats.healthy())
.filter(stats -> stats.healthState() == TrinoHealthStateType.HEALTHY)
.filter(stats -> routingGroup.equals(stats.routingGroup()))
.collect(Collectors.toList());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.google.common.cache.LoadingCache;
import io.airlift.log.Logger;
import io.trino.gateway.ha.clustermonitor.ClusterStats;
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
import io.trino.gateway.ha.config.ProxyBackendConfiguration;
import jakarta.ws.rs.HttpMethod;

Expand Down Expand Up @@ -45,7 +46,7 @@ public abstract class RoutingManager
private final LoadingCache<String, String> queryIdBackendCache;
private final ExecutorService executorService = Executors.newFixedThreadPool(5);
private final GatewayBackendManager gatewayBackendManager;
private final ConcurrentHashMap<String, Boolean> backendToHealth;
private final ConcurrentHashMap<String, TrinoHealthStateType> backendToHealth;

public RoutingManager(GatewayBackendManager gatewayBackendManager)
{
Expand All @@ -64,7 +65,7 @@ public String load(String queryId)
}
});

this.backendToHealth = new ConcurrentHashMap<String, Boolean>();
this.backendToHealth = new ConcurrentHashMap<>();
}

protected GatewayBackendManager getGatewayBackendManager()
Expand Down Expand Up @@ -123,7 +124,7 @@ public String findBackendForQueryId(String queryId)
return backendAddress;
}

public void updateBackEndHealth(String backendId, Boolean value)
public void updateBackEndHealth(String backendId, TrinoHealthStateType value)
{
log.info("backend %s isHealthy %s", backendId, value);
backendToHealth.put(backendId, value);
Expand All @@ -132,7 +133,7 @@ public void updateBackEndHealth(String backendId, Boolean value)
public void updateBackEndStats(List<ClusterStats> stats)
{
for (ClusterStats clusterStats : stats) {
updateBackEndHealth(clusterStats.clusterId(), clusterStats.healthy());
updateBackEndHealth(clusterStats.clusterId(), clusterStats.healthState());
}
}

Expand Down Expand Up @@ -187,10 +188,10 @@ private boolean isBackendNotHealthy(String backendId)
log.error("backends can not be empty");
return true;
}
Boolean isHealthy = backendToHealth.get(backendId);
if (isHealthy == null) {
TrinoHealthStateType healthType = backendToHealth.get(backendId);
if (healthType == null) {
return true;
}
return !isHealthy;
return healthType != TrinoHealthStateType.HEALTHY;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,6 @@ private void testClusterStatsMonitor(Function<BackendStateConfiguration, Cluster

ClusterStats stats = monitor.monitor(proxyBackend);
assertThat(stats.clusterId()).isEqualTo("test_cluster");
assertThat(stats.healthy()).isTrue();
assertThat(stats.healthState()).isEqualTo(TrinoHealthStateType.HEALTHY);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import com.google.common.collect.ImmutableList;
import io.trino.gateway.ha.clustermonitor.ClusterStats;
import io.trino.gateway.ha.clustermonitor.TrinoHealthStateType;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -52,7 +53,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
{
ClusterStats.Builder cluster = ClusterStats.builder("c1");
cluster.proxyTo(BACKEND_URL_1);
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
cluster.routingGroup(routingGroup);
cluster.runningQueryCount(50);
cluster.queuedQueryCount(SAME_QUERY_COUNT);
Expand All @@ -63,7 +64,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
{
ClusterStats.Builder cluster = ClusterStats.builder("c2");
cluster.proxyTo(BACKEND_URL_2);
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
cluster.routingGroup(routingGroup);
cluster.runningQueryCount(51);
cluster.queuedQueryCount(SAME_QUERY_COUNT);
Expand All @@ -78,7 +79,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
{
ClusterStats.Builder cluster = ClusterStats.builder("c3");
cluster.proxyTo(BACKEND_URL_3);
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
cluster.routingGroup(routingGroup);
cluster.runningQueryCount(5);
cluster.queuedQueryCount(SAME_QUERY_COUNT);
Expand All @@ -93,7 +94,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
{
ClusterStats.Builder cluster = ClusterStats.builder("c-unhealthy");
cluster.proxyTo("http://c-unhealthy");
cluster.healthy(false); //This cluster should never show up to route
cluster.healthy(TrinoHealthStateType.UNHEALTHY); //This cluster should never show up to route
cluster.routingGroup(routingGroup);
cluster.runningQueryCount(5);
cluster.queuedQueryCount(SAME_QUERY_COUNT);
Expand All @@ -105,7 +106,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
{
ClusterStats.Builder cluster = ClusterStats.builder("c-unhealthy2");
cluster.proxyTo("http://c-unhealthy2");
cluster.healthy(false); //This cluster should never show up to route
cluster.healthy(TrinoHealthStateType.UNHEALTHY); //This cluster should never show up to route

clustersBuilder.add(cluster.build());
}
Expand All @@ -115,7 +116,7 @@ private static List<ClusterStats> getClusterStatsList(String routingGroup)
cluster.proxyTo("http://c-messed-up");
//This is a scenario where something is really wrong
//We just get the cluster state as healthy but no stats
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
clustersBuilder.add(cluster.build());
}

Expand All @@ -126,7 +127,7 @@ static ClusterStats getClusterWithNoUserQueueAndMinQueueCount()
{
ClusterStats.Builder cluster = ClusterStats.builder("c-Minimal-Queue");
cluster.proxyTo(BACKEND_URL_4);
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
cluster.routingGroup("adhoc");
cluster.runningQueryCount(5);
cluster.queuedQueryCount(LEAST_QUEUED_COUNT);
Expand All @@ -137,7 +138,7 @@ static ClusterStats getClusterWithMinRunningQueries()
{
ClusterStats.Builder cluster = ClusterStats.builder("c-Minimal-Running");
cluster.proxyTo(BACKEND_URL_5);
cluster.healthy(true);
cluster.healthy(TrinoHealthStateType.HEALTHY);
cluster.routingGroup("adhoc");
cluster.runningQueryCount(1);
cluster.queuedQueryCount(LEAST_QUEUED_COUNT);
Expand Down
Loading

0 comments on commit 2a00b88

Please sign in to comment.