Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(discovery): retry failed target connections #1593

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions compose/compose-cryostat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ services:
user: '0'
stdin_open: true
tty: true
labels:
labels:
- io.cryostat.discovery=true
- io.cryostat.jmxUrl=service:jmx:rmi:///jndi/rmi://localhost:0/jmxrmi
deploy:
Expand Down Expand Up @@ -82,7 +82,7 @@ services:
cryostat-duplicate:
image: quay.io/cryostat/cryostat:latest
container_name: cryostat-duplicate
labels:
labels:
- io.cryostat.discovery=true
- io.cryostat.jmxUrl=service:jmx:rmi:///jndi/rmi://cryostat-duplicate:8282/jmxrmi
- io.cryostat.jmxPort=8282
Expand Down Expand Up @@ -154,9 +154,9 @@ services:

# Testing apps
scaled-app:
image: quay.io/andrewazores/vertx-fib-demo:0.12.2
image: quay.io/andrewazores/vertx-fib-demo:0.13.0
container_name: scaled-app
labels:
labels:
- io.cryostat.jmxUrl=service:jmx:rmi:///jndi/rmi://scaled-app:9093/jmxrmi
environment:
- JMX_PORT=9093
Expand All @@ -167,9 +167,9 @@ services:
profiles: ["scaled"]

stopped-app:
image: quay.io/andrewazores/vertx-fib-demo:0.12.2
image: quay.io/andrewazores/vertx-fib-demo:0.13.0
container_name: stopped-app
labels:
labels:
- io.cryostat.jmxUrl=service:jmx:rmi:///jndi/rmi://stopped-app:9093/jmxrmi
environment:
- JMX_PORT=9093
Expand All @@ -181,9 +181,9 @@ services:

# invalid targets:
invalid-podman:
image: quay.io/andrewazores/vertx-fib-demo:0.12.2
image: quay.io/andrewazores/vertx-fib-demo:0.13.0
container_name: invalid-jmx
labels:
labels:
- io.cryostat.jmxUrl=invalid:jmx:rmi:///jndi/rmi://invalid-podman:9093/jmxrmi # invalid serviceurl
environment:
- JMX_PORT=9093
Expand Down
2 changes: 1 addition & 1 deletion compose/compose-vertx-jmx.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
services:
vertx-jmx:
image: quay.io/andrewazores/vertx-fib-demo:0.12.4
image: quay.io/andrewazores/vertx-fib-demo:0.13.0
container_name: vertx-jmx
deploy:
mode: global
Expand Down
6 changes: 3 additions & 3 deletions smoketest-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ runDemoApps() {
--label io.cryostat.jmxPort="9093" \
--publish 8081:8081 \
--publish 9093:9093 \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.2
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0
CONTAINERS="${CONTAINERS:+${CONTAINERS} }vertx-fib-demo-1"

docker run \
Expand All @@ -119,7 +119,7 @@ runDemoApps() {
--label io.cryostat.jmxPort="9094" \
--publish 8082:8082 \
--publish 9094:9092 \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.2
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0
CONTAINERS="${CONTAINERS:+${CONTAINERS} }vertx-fib-demo-2"

docker run \
Expand All @@ -133,7 +133,7 @@ runDemoApps() {
--label io.cryostat.jmxPort="9095" \
--publish 8083:8083 \
--publish 9095:9095 \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.2
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0
CONTAINERS="${CONTAINERS:+${CONTAINERS} }vertx-fib-demo-3"

# this config is broken on purpose (missing required env vars) to test the agent's behaviour
Expand Down
17 changes: 14 additions & 3 deletions smoketest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,17 @@ runDemoApps() {
--env QUARKUS_HTTP_PORT=10012 \
--rm -d quay.io/roberttoyonaga/jmx:jmxquarkus@sha256:b067f29faa91312d20d43c55d194a2e076de7d0d094da3d43ee7d2b2b5a6f100

podman run \
--name vertx-fib-demo-0 \
--env HTTP_PORT=8079 \
--env JMX_PORT=9089 \
--env START_DELAY=60 \
--pod cryostat-pod \
--label io.cryostat.discovery="true" \
--label io.cryostat.jmxHost="localhost" \
--label io.cryostat.jmxPort="9089" \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0

podman run \
--name vertx-fib-demo-1 \
--env HTTP_PORT=8081 \
Expand All @@ -133,7 +144,7 @@ runDemoApps() {
--label io.cryostat.discovery="true" \
--label io.cryostat.jmxHost="localhost" \
--label io.cryostat.jmxPort="9093" \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.3
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0

podman run \
--name vertx-fib-demo-2 \
Expand All @@ -154,7 +165,7 @@ runDemoApps() {
--label io.cryostat.jmxHost="localhost" \
--label io.cryostat.jmxPort="9094" \
--label io.cryostat.jmxUrl="service:jmx:rmi:///jndi/rmi://localhost:9094/jmxrmi" \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.3
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0

podman run \
--name vertx-fib-demo-3 \
Expand All @@ -174,7 +185,7 @@ runDemoApps() {
--pod cryostat-pod \
--label io.cryostat.discovery="true" \
--label io.cryostat.jmxUrl="service:jmx:rmi:///jndi/rmi://localhost:9095/jmxrmi" \
--rm -d quay.io/andrewazores/vertx-fib-demo:0.12.3
--rm -d quay.io/andrewazores/vertx-fib-demo:0.13.0

# this config is broken on purpose (missing required env vars) to test the agent's behaviour
# when not properly set up
Expand Down
110 changes: 76 additions & 34 deletions src/main/java/io/cryostat/discovery/DiscoveryStorage.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,13 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ForkJoinPool;
import java.util.function.Predicate;

import javax.script.ScriptException;

Expand All @@ -57,14 +60,14 @@
import io.cryostat.configuration.StoredCredentials;
import io.cryostat.core.log.Logger;
import io.cryostat.core.net.discovery.JvmDiscoveryClient.EventKind;
import io.cryostat.net.web.http.AbstractAuthenticatedRequestHandler;
import io.cryostat.platform.ServiceRef;
import io.cryostat.platform.ServiceRef.AnnotationKey;
import io.cryostat.platform.discovery.AbstractNode;
import io.cryostat.platform.discovery.BaseNodeType;
import io.cryostat.platform.discovery.EnvironmentNode;
import io.cryostat.platform.discovery.TargetNode;
import io.cryostat.recordings.JvmIdHelper;
import io.cryostat.recordings.JvmIdHelper.JvmIdGetException;
import io.cryostat.rules.MatchExpressionEvaluator;
import io.cryostat.util.HttpStatusCodeIdentifier;

Expand Down Expand Up @@ -96,9 +99,10 @@ public class DiscoveryStorage extends AbstractPlatformClientVerticle {
private final Gson gson;
private final WebClient http;
private final Logger logger;
private long timerId = -1L;
private long pluginPruneTimerId = -1L;
private long targetRetryTimeId = -1L;

private final Map<TargetNode, UUID> targetsToUpdate = new HashMap<>();
private final Map<TargetNode, UUID> nonConnectableTargets = new HashMap<>();

public static final String DISCOVERY_STARTUP_ADDRESS = "discovery-startup";

Expand Down Expand Up @@ -143,34 +147,53 @@ public void start(Promise<Void> future) throws Exception {
+ " deployed")))
.onFailure(future::fail);

this.timerId = getVertx().setPeriodic(pingPeriod.toMillis(), i -> pingPrune());
this.pluginPruneTimerId = getVertx().setPeriodic(pingPeriod.toMillis(), i -> pingPrune());
this.targetRetryTimeId =
getVertx()
.setPeriodic(
// TODO make this configurable, use an exponential backoff, have a
// maximum retry policy, etc.
tthvo marked this conversation as resolved.
Show resolved Hide resolved
15_000,
i -> {
testNonConnectedTargets(
entry -> {
TargetNode targetNode = entry.getKey();
try {
String id =
jvmIdHelper
.get()
.resolveId(
targetNode.getTarget())
.getJvmId();
return StringUtils.isNotBlank(id);
} catch (JvmIdGetException e) {
logger.info(
"Retain null jvmId for node [{}]",
targetNode.getName());
logger.info(e);
return false;
}
});
});
this.credentialsManager
.get()
.addListener(
event -> {
switch (event.getEventType()) {
case ADDED:
Map<TargetNode, UUID> copy = new HashMap<>(targetsToUpdate);
for (var entry : copy.entrySet()) {
try {
if (matchExpressionEvaluator
.get()
.applies(
event.getPayload(),
entry.getKey().getTarget())) {
targetsToUpdate.remove(entry.getKey());
UUID id = entry.getValue();
PluginInfo plugin = getById(id).orElseThrow();
EnvironmentNode original =
gson.fromJson(
plugin.getSubtree(),
EnvironmentNode.class);
update(id, original.getChildren());
}
} catch (JsonSyntaxException | ScriptException e) {
throw new RuntimeException(e);
}
}
testNonConnectedTargets(
entry -> {
try {
return matchExpressionEvaluator
.get()
.applies(
event.getPayload(),
entry.getKey().getTarget());
} catch (ScriptException e) {
logger.error(e);
return false;
}
});
break;
case REMOVED:
break;
Expand All @@ -181,9 +204,33 @@ public void start(Promise<Void> future) throws Exception {
});
}

private void testNonConnectedTargets(Predicate<Entry<TargetNode, UUID>> predicate) {
ForkJoinPool.commonPool()
.execute(
() -> {
Map<TargetNode, UUID> copy = new HashMap<>(nonConnectableTargets);
for (var entry : copy.entrySet()) {
try {
if (predicate.test(entry)) {
nonConnectableTargets.remove(entry.getKey());
UUID id = entry.getValue();
PluginInfo plugin = getById(id).orElseThrow();
EnvironmentNode original =
gson.fromJson(
plugin.getSubtree(), EnvironmentNode.class);
update(id, original.getChildren());
}
} catch (JsonSyntaxException e) {
throw new RuntimeException(e);
}
}
});
}

@Override
public void stop() {
getVertx().cancelTimer(timerId);
getVertx().cancelTimer(pluginPruneTimerId);
getVertx().cancelTimer(targetRetryTimeId);
}

private CompositeFuture pingPrune() {
Expand Down Expand Up @@ -315,17 +362,12 @@ private List<AbstractNode> modifyChildrenWithJvmIds(
ServiceRef ref = ((TargetNode) child).getTarget();
try {
ref = jvmIdHelper.get().resolveId(ref);
child = new TargetNode(child.getNodeType(), ref, child.getLabels());
} catch (Exception e) {
// if Exception is of SSL or JMX Auth, ignore warning and use null jvmId
if (!(AbstractAuthenticatedRequestHandler.isJmxAuthFailure(e)
|| AbstractAuthenticatedRequestHandler.isJmxSslFailure(e))) {
logger.info("Ignoring target node [{}]", child.getName());
continue;
}
logger.info("Update node [{}] with null jvmId", child.getName());
targetsToUpdate.putIfAbsent((TargetNode) child, id);
logger.info(e);
nonConnectableTargets.putIfAbsent((TargetNode) child, id);
}
child = new TargetNode(child.getNodeType(), ref, child.getLabels());
modifiedChildren.add(child);
} else if (child instanceof EnvironmentNode) {
modifiedChildren.add(
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/io/cryostat/recordings/JvmIdHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ public class JvmIdHelper extends AbstractEventEmitter<JvmIdHelper.IdEvent, Strin
}

private boolean observe(ServiceRef sr) {
logger.info("Observing new target: {}", sr);
if (StringUtils.isBlank(sr.getJvmId())) {
return false;
}
Expand All @@ -133,6 +132,7 @@ public ServiceRef resolveId(ServiceRef sr) throws JvmIdGetException {
if (observe(sr)) {
return sr;
}
logger.info("Observing new target: {}", sr);
URI serviceUri = sr.getServiceUri();
String uriStr = serviceUri.toString();
try {
Expand Down
Loading
Loading