Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IGNITE-21456 Use StopNodeFailureHandler by default #4933

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ protected String getNodeBootstrapConfigTemplate() {
+ " },\n"
+ " clientConnector.port: {},\n"
+ " rest.port: {},\n"
+ " compute.threadPoolSize: 1\n"
+ " compute.threadPoolSize: 1,\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ public static String restSslBootstrapConfig(@Nullable String ciphers) {
+ " },\n"
+ (nullOrBlank(ciphers) ? "" : " ciphers: \"" + ciphers + "\"")
+ " }\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down Expand Up @@ -108,7 +109,8 @@ public static String clientConnectorSslBootstrapConfig(@Nullable String ciphers)
+ " rest: {\n"
+ " port: {},\n"
+ " ssl.port: {}\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ class ItLogicalTopologyTest extends ClusterPerTestIntegrationTest {
+ " nodeAttributes.nodeAttributes: {region.attribute = US, storage.attribute = SSD},\n"
+ " storage.profiles: {lru_rocks.engine = rocksdb, segmented_aipersist.engine = aipersist},\n"
+ " clientConnector.port: {},\n"
+ " rest.port: {}\n"
+ " rest.port: {},\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";

private final LogicalTopologyEventListener listener = new LogicalTopologyEventListener() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ protected String getNodeBootstrapConfigTemplate() {
+ " compute: {"
+ " threadPoolSize: 1,\n"
+ " statesLifetimeMillis: 1000\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.ignite.internal.failure;

import org.apache.ignite.internal.failure.handlers.FailureHandler;

/**
* General failure processing API.
*/
Expand All @@ -25,7 +27,7 @@ public interface FailureProcessor {
* Processes a failure according to the configured {@link FailureHandler}.
*
* @param failureCtx Failure context.
* @return {@code True} If this very call led to Ignite node invalidation.
* @return {@code true} if this call leads to Ignite node invalidation, {@code false} otherwise.
*/
boolean process(FailureContext failureCtx);
}
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,52 @@ public static String getFullStackTrace(Throwable throwable) {
return sw.getBuffer().toString();
}

/**
 * Checks whether the passed in {@code Throwable} has one of the given classes in its {@code cause} hierarchy,
 * <b>including</b> that throwable itself.
 * Note that this method also includes {@link Throwable#getSuppressed()} throwables (and their own causes)
 * into the check.
 *
 * <p>If {@code msg} is not {@code null}, a throwable matches only when it is an instance of one of the classes
 * <b>and</b> its message contains {@code msg}.
 *
 * <p>Every throwable is inspected at most once, so cyclic cause chains or mutually suppressed throwables
 * cannot make this method loop forever or overflow the stack.
 *
 * @param t Throwable to check (if {@code null}, {@code false} is returned).
 * @param msg Message text that should be in cause (if {@code null}, messages are not checked).
 * @param cls Cause classes to check (if {@code null} or empty, {@code false} is returned).
 * @return {@code True} if one of the causing exception is an instance of passed in classes,
 *      {@code false} otherwise.
 */
public static boolean hasCause(@Nullable Throwable t, @Nullable String msg, Class<?> @Nullable... cls) {
    if (t == null || cls == null || cls.length == 0) {
        return false;
    }

    // Identity semantics on purpose: exceptions may override equals/hashCode.
    java.util.Set<Throwable> visited = java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<>());

    return hasCause0(t, msg, cls, visited);
}

/**
 * Walks the cause chain of {@code t}, descending into suppressed throwables, and skips anything already visited.
 *
 * @param t Throwable to start from.
 * @param msg Message text that should be in cause, or {@code null} to skip the message check.
 * @param cls Cause classes to check.
 * @param visited Throwables that have already been inspected (compared by identity).
 * @return {@code True} if a matching throwable was found, {@code false} otherwise.
 */
private static boolean hasCause0(Throwable t, @Nullable String msg, Class<?>[] cls, java.util.Set<Throwable> visited) {
    // 'visited.add' returns false for an already seen throwable, which terminates cyclic chains.
    for (Throwable th = t; th != null && visited.add(th); th = th.getCause()) {
        for (Class<?> c : cls) {
            if (!c.isAssignableFrom(th.getClass())) {
                continue;
            }

            if (msg == null || (th.getMessage() != null && th.getMessage().contains(msg))) {
                return true;
            }
        }

        for (Throwable suppressed : th.getSuppressed()) {
            if (hasCause0(suppressed, msg, cls, visited)) {
                return true;
            }
        }
    }

    return false;
}

/**
* Unwraps exception cause from wrappers like CompletionException and ExecutionException.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
import static org.apache.ignite.lang.ErrorGroups.Common.NODE_STOPPING_ERR;

import java.io.IOException;
import java.lang.management.LockInfo;
import java.lang.management.ManagementFactory;
import java.lang.management.MonitorInfo;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.nio.ByteBuffer;
Expand All @@ -38,18 +42,23 @@
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
Expand Down Expand Up @@ -127,6 +136,16 @@ public static long monotonicMs() {
*/
private static final String JMX_MBEAN_PACKAGE = "org.apache";

/** Prefix of the thread dump header; the formatted dump timestamp is appended right after it. */
public static final String THREAD_DUMP_MSG = "Thread dump at ";

/** Date format for the thread dump timestamp, rendered in the system default time zone. */
private static final DateTimeFormatter THREAD_DUMP_FMT =
DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss z").withZone(ZoneId.systemDefault());

/** System line separator, obtained once from {@link System#lineSeparator()}. */
private static final String NL = System.lineSeparator();

/**
* Get JDK version.
*
Expand Down Expand Up @@ -1108,7 +1127,7 @@ public static boolean startsWith(byte[] key, byte[] prefix) {
* Serializes collection to bytes.
*
* @param collection Collection.
* @param transform Tranform function for the collection element.
* @param transform Transform function for the collection element.
* @return Byte array.
*/
public static <T> byte[] collectionToBytes(Collection<T> collection, Function<T, byte[]> transform) {
Expand Down Expand Up @@ -1297,4 +1316,165 @@ public static boolean shouldSwitchToRequestsExecutor(ThreadOperation... required
return true;
}
}

/**
 * Performs a thread dump and writes it to the given log with WARN logging level.
 * Shorthand for {@link #dumpThreads(IgniteLogger, boolean)} with {@code isErrorLevel} set to {@code false}.
 *
 * @param log Logger.
 */
public static void dumpThreads(IgniteLogger log) {
    boolean isErrorLevel = false;

    dumpThreads(log, isErrorLevel);
}

/**
 * Performs a thread dump and writes it to the given log. First a one-line deadlock summary is logged,
 * then a single message with the dump header and the info of every dumped thread.
 * Both messages use WARN or ERROR logging level depending on {@code isErrorLevel} parameter.
 *
 * @param log Logger.
 * @param isErrorLevel {@code true} to print with ERROR logging level, {@code false} to print with WARN logging level.
 */
public static void dumpThreads(IgniteLogger log, boolean isErrorLevel) {
    ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();

    Set<Long> deadlocked = getDeadlockedThreadIds(threadBean);

    String deadlockSummary = deadlocked.isEmpty()
            ? "No deadlocked threads detected."
            : "Deadlocked threads detected (see thread dump below) [deadlockedThreadsCnt=" + deadlocked.size() + ']';

    logMessage(log, deadlockSummary, isErrorLevel);

    // Monitor and synchronizer details are requested only when the JVM reports support for them.
    ThreadInfo[] dumpedThreads = threadBean.dumpAllThreads(
            threadBean.isObjectMonitorUsageSupported(),
            threadBean.isSynchronizerUsageSupported()
    );

    StringBuilder dump = new StringBuilder(THREAD_DUMP_MSG);

    dump.append(THREAD_DUMP_FMT.format(Instant.ofEpochMilli(System.currentTimeMillis())));
    dump.append(NL);

    for (ThreadInfo threadInfo : dumpedThreads) {
        printThreadInfo(threadInfo, dump, deadlocked);

        dump.append(NL);

        LockInfo[] synchronizers = threadInfo.getLockedSynchronizers();

        if (synchronizers == null || synchronizers.length == 0) {
            continue;
        }

        printSynchronizersInfo(synchronizers, dump);

        dump.append(NL);
    }

    dump.append(NL);

    logMessage(log, dump.toString(), isErrorLevel);
}

/**
 * Writes the message to the given log, choosing the logging level by the {@code isErrorLevel} flag.
 *
 * @param log Logger.
 * @param msg Message.
 * @param isErrorLevel {@code true} to use ERROR logging level, {@code false} to use WARN logging level.
 */
private static void logMessage(IgniteLogger log, String msg, boolean isErrorLevel) {
    if (!isErrorLevel) {
        log.warn(msg);

        return;
    }

    log.error(msg);
}

/**
 * Gets IDs of deadlocked threads from the thread bean.
 *
 * <p>When the JVM supports monitoring of ownable synchronizers, deadlocks on both object monitors and
 * ownable synchronizers are looked up. Otherwise the lookup falls back to object monitor deadlocks only,
 * instead of silently reporting that there are no deadlocks at all.
 *
 * @param mxBean The management interface for the thread system.
 * @return The set of deadlocked thread IDs (may be an empty unmodifiable set, but never {@code null}).
 */
private static Set<Long> getDeadlockedThreadIds(ThreadMXBean mxBean) {
    // findDeadlockedThreads() throws UnsupportedOperationException without synchronizer usage support,
    // while findMonitorDeadlockedThreads() has no such requirement.
    long[] deadlockedIds = mxBean.isSynchronizerUsageSupported()
            ? mxBean.findDeadlockedThreads()
            : mxBean.findMonitorDeadlockedThreads();

    // Both lookups return null when no deadlock is found.
    if (deadlockedIds == null || deadlockedIds.length == 0) {
        return Collections.emptySet();
    }

    Set<Long> ids = new HashSet<>();

    for (long id : deadlockedIds) {
        ids.add(id);
    }

    return Collections.unmodifiableSet(ids);
}

/**
 * Appends the description of a single thread to a buffer: a header line (prefixed with a deadlock marker
 * when the thread ID is in {@code deadlockedIdSet}), the lock the thread is associated with (if any)
 * and the stack trace with the monitors locked at each frame.
 *
 * @param threadInfo Thread info.
 * @param sb Buffer.
 * @param deadlockedIdSet IDs of the threads that were detected as deadlocked.
 */
private static void printThreadInfo(ThreadInfo threadInfo, StringBuilder sb, Set<Long> deadlockedIdSet) {
    long threadId = threadInfo.getThreadId();

    if (deadlockedIdSet.contains(threadId)) {
        sb.append("##### DEADLOCKED ");
    }

    sb.append("Thread [name=\"");
    sb.append(threadInfo.getThreadName());
    sb.append("\", id=");
    sb.append(threadId);
    sb.append(", state=");
    sb.append(threadInfo.getThreadState());
    sb.append(", blockCnt=");
    sb.append(threadInfo.getBlockedCount());
    sb.append(", waitCnt=");
    sb.append(threadInfo.getWaitedCount());
    sb.append(']');
    sb.append(NL);

    LockInfo lock = threadInfo.getLockInfo();

    if (lock != null) {
        sb.append(" Lock [object=");
        sb.append(lock);
        sb.append(", ownerName=");
        sb.append(threadInfo.getLockOwnerName());
        sb.append(", ownerId=");
        sb.append(threadInfo.getLockOwnerId());
        sb.append(']');
        sb.append(NL);
    }

    MonitorInfo[] lockedMonitors = threadInfo.getLockedMonitors();
    StackTraceElement[] frames = threadInfo.getStackTrace();

    int depth = 0;

    for (StackTraceElement frame : frames) {
        sb.append(" at ").append(frame.toString());

        // Monitors are reported right under the stack frame in which they were locked.
        for (MonitorInfo lockedMonitor : lockedMonitors) {
            if (lockedMonitor.getLockedStackDepth() == depth) {
                sb.append(NL);
                sb.append(" - locked ");
                sb.append(lockedMonitor);
            }
        }

        sb.append(NL);

        depth++;
    }
}

/**
 * Appends the list of locked synchronizers to a buffer: a title followed by one synchronizer per line.
 * No line separator is appended after the last entry.
 *
 * @param syncs Synchronizers info.
 * @param sb Buffer.
 */
private static void printSynchronizersInfo(LockInfo[] syncs, StringBuilder sb) {
    sb.append(" Locked synchronizers:");

    for (int i = 0; i < syncs.length; i++) {
        sb.append(NL);
        sb.append(" ");
        sb.append(syncs[i]);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ private static String createStartConfig(@Language("HOCON") String nodeAttributes
+ " nodeAttributes.nodeAttributes: " + nodeAttributes + ",\n"
+ " storage.profiles: " + storageProfiles + ",\n"
+ " clientConnector.port: {},\n"
+ " rest.port: {}\n"
+ " rest.port: {},\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down
Loading