Skip to content

Commit

Permalink
IGNITE-20449 Add support generating thread dump on failure (#4923)
Browse files Browse the repository at this point in the history
  • Loading branch information
sk0x50 authored Dec 20, 2024
1 parent 9da33bf commit c4a7355
Show file tree
Hide file tree
Showing 28 changed files with 795 additions and 110 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ protected String getNodeBootstrapConfigTemplate() {
+ " },\n"
+ " clientConnector.port: {},\n"
+ " rest.port: {},\n"
+ " compute.threadPoolSize: 1\n"
+ " compute.threadPoolSize: 1,\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ public static String restSslBootstrapConfig(@Nullable String ciphers) {
+ " },\n"
+ (nullOrBlank(ciphers) ? "" : " ciphers: \"" + ciphers + "\"")
+ " }\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down Expand Up @@ -108,7 +109,8 @@ public static String clientConnectorSslBootstrapConfig(@Nullable String ciphers)
+ " rest: {\n"
+ " port: {},\n"
+ " ssl.port: {}\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ class ItLogicalTopologyTest extends ClusterPerTestIntegrationTest {
+ " nodeAttributes.nodeAttributes: {region.attribute = US, storage.attribute = SSD},\n"
+ " storage.profiles: {lru_rocks.engine = rocksdb, segmented_aipersist.engine = aipersist},\n"
+ " clientConnector.port: {},\n"
+ " rest.port: {}\n"
+ " rest.port: {},\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";

private final LogicalTopologyEventListener listener = new LogicalTopologyEventListener() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ protected String getNodeBootstrapConfigTemplate() {
+ " compute: {"
+ " threadPoolSize: 1,\n"
+ " statesLifetimeMillis: 1000\n"
+ " }\n"
+ " },\n"
+ " failureHandler.dumpThreadsOnFailure: false\n"
+ "}";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.ignite.internal.failure;

import org.apache.ignite.internal.failure.handlers.FailureHandler;

/**
* General failure processing API.
*/
Expand All @@ -25,7 +27,7 @@ public interface FailureProcessor {
* Processes failure accordingly to configured {@link FailureHandler}.
*
* @param failureCtx Failure context.
* @return {@code True} If this very call led to Ignite node invalidation.
* @return {@code true} If this call leads to Ignite node invalidation and {@code false} otherwise.
*/
boolean process(FailureContext failureCtx);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.ignite.internal.thread;

import java.lang.management.LockInfo;
import java.lang.management.ManagementFactory;
import java.lang.management.MonitorInfo;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.ignite.internal.lang.IgniteStringFormatter;
import org.apache.ignite.internal.logger.IgniteLogger;

/**
 * This class contains utility methods for working with threads: producing a full thread dump (including deadlock detection)
 * and printing the stack of the current thread to a logger.
 */
public class ThreadUtils {
    /** Thread dump message. */
    public static final String THREAD_DUMP_MSG = "Thread dump at ";

    /** Date format for thread dumps. */
    private static final DateTimeFormatter THREAD_DUMP_FMT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss z").withZone(ZoneId.systemDefault());

    /**
     * Short date format pattern for log messages in "quiet" mode. Only time is included since we don't expect "quiet" mode to be used for
     * longer runs.
     */
    private static final DateTimeFormatter SHORT_DATE_FMT = DateTimeFormatter.ofPattern("HH:mm:ss");

    /** System line separator. */
    private static final String NL = System.lineSeparator();

    /**
     * Performs thread dump and prints all available info to the given log with WARN logging level.
     *
     * @param log Logger.
     */
    public static void dumpThreads(IgniteLogger log) {
        dumpThreads(log, false);
    }

    /**
     * Performs thread dump and prints all available info to the given log
     * with WARN or ERROR logging level depending on {@code isErrorLevel} parameter.
     *
     * <p>Two messages are logged: a short deadlock summary, followed by a single message containing the dump of all threads.
     *
     * @param log Logger.
     * @param isErrorLevel {@code true} if thread dump must be printed with ERROR logging level,
     *      {@code false} if thread dump must be printed with WARN logging level.
     */
    public static void dumpThreads(IgniteLogger log, boolean isErrorLevel) {
        ThreadMXBean mxBean = ManagementFactory.getThreadMXBean();

        Set<Long> deadlockedThreadsIds = getDeadlockedThreadIds(mxBean);

        if (deadlockedThreadsIds.isEmpty()) {
            logMessage(log, "No deadlocked threads detected.", isErrorLevel);
        } else {
            logMessage(log, "Deadlocked threads detected (see thread dump below) "
                    + "[deadlockedThreadsCount=" + deadlockedThreadsIds.size() + ']', isErrorLevel);
        }

        // Monitor and synchronizer details are requested only when the JVM reports support for them.
        ThreadInfo[] threadInfos =
                mxBean.dumpAllThreads(mxBean.isObjectMonitorUsageSupported(), mxBean.isSynchronizerUsageSupported());

        StringBuilder sb = new StringBuilder(THREAD_DUMP_MSG)
                .append(THREAD_DUMP_FMT.format(Instant.now()))
                .append(NL);

        for (ThreadInfo info : threadInfos) {
            printThreadInfo(info, sb, deadlockedThreadsIds);

            sb.append(NL);

            LockInfo[] lockedSynchronizers = info.getLockedSynchronizers();

            if (lockedSynchronizers != null && lockedSynchronizers.length > 0) {
                printSynchronizersInfo(lockedSynchronizers, sb);

                sb.append(NL);
            }
        }

        sb.append(NL);

        logMessage(log, sb.toString(), isErrorLevel);
    }

    /**
     * Prints message to the given log with WARN or ERROR logging level depending on {@code isErrorLevel} parameter.
     *
     * @param log Logger.
     * @param message Message.
     * @param isErrorLevel {@code true} if message must be printed with ERROR logging level,
     *      {@code false} if message must be printed with WARN logging level.
     */
    private static void logMessage(IgniteLogger log, String message, boolean isErrorLevel) {
        if (isErrorLevel) {
            log.error(message);
        } else {
            log.warn(message);
        }
    }

    /**
     * Get deadlocks from the thread bean.
     *
     * <p>When the JVM supports monitoring of ownable synchronizers, both monitor and synchronizer deadlocks are looked up.
     * Otherwise the lookup falls back to object-monitor deadlocks only, instead of skipping deadlock detection altogether.
     *
     * @param mxBean The management interface for the thread system.
     * @return the set of deadlocked threads (may be empty Set, but never {@code null}).
     */
    private static Set<Long> getDeadlockedThreadIds(ThreadMXBean mxBean) {
        // findDeadlockedThreads() is only usable when synchronizer usage monitoring is supported,
        // so use the monitor-only variant when it is not.
        long[] deadlockedIds = mxBean.isSynchronizerUsageSupported()
                ? mxBean.findDeadlockedThreads()
                : mxBean.findMonitorDeadlockedThreads();

        if (deadlockedIds == null || deadlockedIds.length == 0) {
            return Collections.emptySet();
        }

        Set<Long> ids = new HashSet<>();

        for (long id : deadlockedIds) {
            ids.add(id);
        }

        return Collections.unmodifiableSet(ids);
    }

    /**
     * Prints single thread info to a buffer: a header line, the lock the thread is waiting on (if any)
     * and the stack trace annotated with the monitors locked at each frame.
     *
     * @param threadInfo Thread info.
     * @param stringBuilder Buffer.
     * @param deadlockedIdSet Ids of deadlocked threads; a thread whose id is in this set is marked as deadlocked in the output.
     */
    private static void printThreadInfo(ThreadInfo threadInfo, StringBuilder stringBuilder, Set<Long> deadlockedIdSet) {
        long id = threadInfo.getThreadId();

        if (deadlockedIdSet.contains(id)) {
            stringBuilder.append("##### DEADLOCKED ");
        }

        stringBuilder.append("Thread [name=\"").append(threadInfo.getThreadName())
                .append("\", id=").append(id)
                .append(", state=").append(threadInfo.getThreadState())
                .append(", blockCnt=").append(threadInfo.getBlockedCount())
                .append(", waitCnt=").append(threadInfo.getWaitedCount())
                .append(']').append(NL);

        LockInfo lockInfo = threadInfo.getLockInfo();

        if (lockInfo != null) {
            stringBuilder.append(" Lock [object=")
                    .append(lockInfo)
                    .append(", ownerName=")
                    .append(threadInfo.getLockOwnerName())
                    .append(", ownerId=")
                    .append(threadInfo.getLockOwnerId())
                    .append(']')
                    .append(NL);
        }

        MonitorInfo[] monitors = threadInfo.getLockedMonitors();
        StackTraceElement[] elements = threadInfo.getStackTrace();

        for (int i = 0; i < elements.length; i++) {
            stringBuilder.append(" at ").append(elements[i].toString());

            // Attach every monitor that was locked at this stack depth right below the frame.
            for (MonitorInfo monitor : monitors) {
                if (monitor.getLockedStackDepth() == i) {
                    stringBuilder.append(NL).append(" - locked ").append(monitor);
                }
            }

            stringBuilder.append(NL);
        }
    }

    /**
     * Prints Synchronizers info to a buffer.
     *
     * @param syncs Synchronizers info.
     * @param stringBuilder Buffer.
     */
    private static void printSynchronizersInfo(LockInfo[] syncs, StringBuilder stringBuilder) {
        stringBuilder.append(" Locked synchronizers:");

        for (LockInfo info : syncs) {
            stringBuilder.append(NL).append(" ").append(info);
        }
    }

    /**
     * Prints stack trace of the current thread to provided logger.
     * If the logger is {@code null}, the stack trace is printed to {@link System#err} instead.
     *
     * @param log Logger.
     * @param message Message to print with the stack.
     * @param params Parameters substituted into {@code message} by {@link IgniteStringFormatter#format}.
     * @deprecated Calls to this method should never be committed to master.
     */
    @Deprecated
    public static void dumpStack(IgniteLogger log, String message, Object... params) {
        String reason = "Dumping stack";

        // The exception is never thrown; it only captures the current stack together with the formatted message.
        var err = new Exception(IgniteStringFormatter.format(message, params));

        if (log != null) {
            log.warn(reason, err);
        } else {
            System.err.println("[" + LocalDateTime.now().format(SHORT_DATE_FMT) + "] (err) " + reason);

            err.printStackTrace(System.err);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,55 @@ public static String getFullStackTrace(Throwable throwable) {
return sw.getBuffer().toString();
}

/**
 * Checks if the passed in {@code throwable} has one of the given classes in its {@code cause} hierarchy,
 * <b>including</b> that throwable itself.
 * Note that this method also includes {@link Throwable#getSuppressed()} exceptions (and their own causes
 * and suppressed exceptions) into the check.
 *
 * <p>Every throwable is examined at most once, so cyclic cause/suppressed references do not lead to
 * an endless traversal.
 *
 * @param throwable Throwable to check (if {@code null}, {@code false} is returned).
 * @param message Text that the message of a matching throwable must contain (if {@code null}, the message is not checked).
 * @param clazz Cause classes to check (if {@code null} or empty, {@code false} is returned).
 * @return {@code true} if the throwable, one of its causes or one of the suppressed exceptions is an instance
 *      of one of the passed in classes (and contains {@code message}, when it is given), {@code false} otherwise.
 */
public static boolean hasCauseOrSuppressed(
        @Nullable Throwable throwable,
        @Nullable String message,
        Class<?> @Nullable... clazz) {
    if (throwable == null || clazz == null || clazz.length == 0) {
        return false;
    }

    // Fully qualified names are used on purpose: the import block of this file is not touched by this change.
    // Identity semantics are required because throwables may override equals/hashCode.
    java.util.Set<Throwable> visited = java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<>());
    java.util.Deque<Throwable> pending = new java.util.ArrayDeque<>();

    pending.push(throwable);

    while (!pending.isEmpty()) {
        Throwable th = pending.pop();

        // Already examined: either reachable through several paths or part of a reference cycle.
        if (!visited.add(th)) {
            continue;
        }

        for (Class<?> c : clazz) {
            if (c.isInstance(th)) {
                if (message == null) {
                    return true;
                }

                String thMessage = th.getMessage();

                if (thMessage != null && thMessage.contains(message)) {
                    return true;
                }
            }
        }

        for (Throwable suppressed : th.getSuppressed()) {
            pending.push(suppressed);
        }

        Throwable cause = th.getCause();

        if (cause != null) {
            pending.push(cause);
        }
    }

    return false;
}

/**
* Unwraps exception cause from wrappers like CompletionException and ExecutionException.
*
Expand Down
Loading

0 comments on commit c4a7355

Please sign in to comment.