Skip to content

Commit

Permalink
Write reason to node history for wTF, failed and parked events
Browse files Browse the repository at this point in the history
  • Loading branch information
freva committed Dec 9, 2024
1 parent 825dcf2 commit 610b2e6
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,9 @@ public Optional<WireguardKeyWithTimestamp> wireguardPubKey() {
/**
* Returns a copy of this where wantToFail is set to true and history is updated to reflect this.
*/
public Node withWantToFail(boolean wantToFail, Agent agent, Instant at) {
Node node = this.with(status.withWantToFail(wantToFail));
if (wantToFail)
node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at)));
return node;

public Node withWantToFail(Agent agent, Instant at, String reason) {
return with(status.withWantToFail(true))
.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at, Optional.of(reason))));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ private void failActive(FailingNode failing) {

if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock.parent());
markWantToFail(failing.node(), true, failing.reason, lock.parent());
redeploy = true;
}
}
Expand All @@ -260,7 +260,7 @@ private void failActive(FailingNode failing) {
if (failing == null) return;

log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock);
markWantToFail(failing.node(), true, failing.reason, lock);
redeploy = true;
}
}
Expand Down Expand Up @@ -295,16 +295,19 @@ private void redeploy(Deployment deployment, FailingNode failing) {
Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
if (optionalNodeMutex.isEmpty()) return;
try (var nodeMutex = optionalNodeMutex.get()) {
markWantToFail(nodeMutex.node(), false, nodeMutex);
markWantToFail(nodeMutex.node(), false, failing.reason(), nodeMutex);
log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
" for " + failing.reason() + ": " + Exceptions.toMessageString(e));
}
}
}

private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
private void markWantToFail(Node node, boolean wantToFail, String reason, Mutex lock) {
if (node.status().wantToFail() != wantToFail) {
nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
node = wantToFail ?
node.withWantToFail(Agent.NodeFailer, clock().instant(), reason) :
node.with(node.status().withWantToFail(false));
nodeRepository().nodes().write(node, lock);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,19 +141,20 @@ private ImmutableMap.Builder<Event.Type, Event> builderWithout(Event.Type type)
}

/** Returns a copy of this history with a record of this state transition added, if applicable */
public History recordStateTransition(Node.State from, Node.State to, Agent agent, Instant at) {
public History recordStateTransition(Node.State from, Node.State to, Agent agent, Optional<String> reason, Instant at) {
// If the event is a re-reservation, allow the new one to override the older one.
if (from == to && from != Node.State.reserved) return this;
// Only keeping reason for errored states (failed, parked)
return switch (to) {
case provisioned -> this.with(new Event(Event.Type.provisioned, agent, at));
case deprovisioned -> this.with(new Event(Event.Type.deprovisioned, agent, at));
case ready -> this.withoutApplicationEvents().with(new Event(Event.Type.readied, agent, at));
case active -> this.with(new Event(Event.Type.activated, agent, at));
case inactive -> this.with(new Event(Event.Type.deactivated, agent, at));
case reserved -> this.with(new Event(Event.Type.reserved, agent, at));
case failed -> this.with(new Event(Event.Type.failed, agent, at));
case failed -> this.with(new Event(Event.Type.failed, agent, at, reason));
case dirty -> this.with(new Event(Event.Type.deallocated, agent, at));
case parked -> this.with(new Event(Event.Type.parked, agent, at));
case parked -> this.with(new Event(Event.Type.parked, agent, at, reason));
case breakfixed -> this.with(new Event(Event.Type.breakfixed, agent, at));
};
}
Expand All @@ -180,7 +181,10 @@ public String toString() {
}

/** An event which may happen to a node */
public record Event(Type type, Agent agent, Instant at) {
public record Event(Type type, Agent agent, Instant at, Optional<String> reason) {
public Event(Type type, Agent agent, Instant at) {
this(type, agent, at, Optional.empty());
}

public enum Type {
// State changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ public List<Node> deactivate(List<Node> nodes, ApplicationTransaction transactio
*/
public List<Node> fail(List<Node> nodes, ApplicationTransaction transaction) {
return db.writeTo(Node.State.failed,
nodes.stream().map(n -> n.withWantToFail(false, Agent.application, clock.instant())).toList(),
nodes.stream().map(n -> n.with(n.status().withWantToFail(false))).toList(),
Agent.application, Optional.of("Failed by application"), transaction);
}

Expand Down Expand Up @@ -380,7 +380,7 @@ public List<Node> failOrMarkRecursively(String hostname, Agent agent, String rea

private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) {
if (node.state() == Node.State.active) {
node = node.withWantToFail(true, agent, clock.instant());
node = node.withWantToFail(agent, clock.instant(), reason);
write(node, lock);
return node;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public List<Node> addNodesInState(LockedNodeList nodes, Node.State expectedState
if (node.state() != expectedState)
throw new IllegalArgumentException(node + " is not in the " + expectedState + " state");

node = node.with(node.history().recordStateTransition(null, expectedState, agent, clock.instant()));
node = node.with(node.history().recordStateTransition(null, expectedState, agent, Optional.empty(), clock.instant()));
byte[] serialized = nodeSerializer.toJson(node);
curatorTransaction.add(CuratorOperations.create(nodePath(node).getAbsolute(), serialized));
}
Expand Down Expand Up @@ -234,7 +234,7 @@ public List<Node> writeTo(Node.State toState, List<Node> nodes,
newNodeStatus(node, toState),
toState,
toState.isAllocated() ? node.allocation() : Optional.empty(),
node.history().recordStateTransition(node.state(), toState, agent, clock.instant()),
node.history().recordStateTransition(node.state(), toState, agent, reason, clock.instant()),
node.type(), node.reports(), node.modelName(), node.reservedTo(),
node.exclusiveToApplicationId(), node.provisionedForApplicationId(), node.hostTTL(), node.hostEmptyAt(),
node.exclusiveToClusterType(), node.switchHostname(), node.trustedCertificates(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ public class NodeSerializer {
// History event fields
private static final String historyEventTypeKey = "type";
private static final String atKey = "at";
private static final String agentKey = "agent"; // retired events only
private static final String agentKey = "agent";
private static final String reasonKey = "reason";

// Network port fields
private static final String networkPortsKey = "networkPorts";
Expand Down Expand Up @@ -238,6 +239,7 @@ private void toSlime(History.Event event, Cursor object) {
object.setString(historyEventTypeKey, toString(event.type()));
object.setLong(atKey, event.at().toEpochMilli());
object.setString(agentKey, toString(event.agent()));
event.reason().ifPresent(reason -> object.setString(reasonKey, reason));
}

private void toSlime(List<String> addresses, Cursor array, boolean dummyDueToErasure) {
Expand Down Expand Up @@ -365,7 +367,8 @@ private History.Event eventFromSlime(Inspector object) {
if (type == null) return null;
Instant at = Instant.ofEpochMilli(object.field(atKey).asLong());
Agent agent = eventAgentFromSlime(object.field(agentKey));
return new History.Event(type, agent, at);
Optional<String> reason = SlimeUtils.optionalString(object.field(reasonKey));
return new History.Event(type, agent, at, reason);
}

private Generation generationFromSlime(Inspector object, String wantedField, String currentField) {
Expand Down

0 comments on commit 610b2e6

Please sign in to comment.