-
Notifications
You must be signed in to change notification settings - Fork 374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[CELEBORN-1792] MemoryManager resume should use pinnedDirectMemory instead of usedDirectMemory #3018
[CELEBORN-1792] MemoryManager resume should use pinnedDirectMemory instead of usedDirectMemory #3018
Changes from 3 commits
5fdc844
e149e86
4fa3612
ea78183
354515d
eb0d635
3567fb4
dcd3596
2d5c9b9
91ea4d7
27f88d3
c1439cb
135d5a8
abacec6
e8412f0
366ff59
e7e7479
e2de154
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,12 +30,14 @@ | |
import com.google.common.base.Preconditions; | ||
import io.netty.buffer.ByteBuf; | ||
import io.netty.buffer.ByteBufAllocator; | ||
import io.netty.buffer.PooledByteBufAllocator; | ||
import io.netty.util.internal.PlatformDependent; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import org.apache.celeborn.common.CelebornConf; | ||
import org.apache.celeborn.common.metrics.source.AbstractSource; | ||
import org.apache.celeborn.common.network.util.NettyUtils; | ||
import org.apache.celeborn.common.protocol.TransportModuleConstants; | ||
import org.apache.celeborn.common.util.ThreadUtils; | ||
import org.apache.celeborn.common.util.Utils; | ||
|
@@ -93,6 +95,7 @@ public class MemoryManager { | |
private long memoryFileStorageThreshold; | ||
private final LongAdder memoryFileStorageCounter = new LongAdder(); | ||
private final StorageManager storageManager; | ||
private boolean networkMemoryAllocatorPooled; | ||
|
||
@VisibleForTesting | ||
public static MemoryManager initialize(CelebornConf conf) { | ||
|
@@ -159,6 +162,7 @@ private MemoryManager(CelebornConf conf, StorageManager storageManager, Abstract | |
readBufferThreshold = (long) (maxDirectMemory * readBufferRatio); | ||
readBufferTarget = (long) (readBufferThreshold * readBufferTargetRatio); | ||
memoryFileStorageThreshold = (long) (maxDirectMemory * memoryFileStorageRatio); | ||
networkMemoryAllocatorPooled = conf.networkMemoryAllocatorPooled(); | ||
|
||
checkService.scheduleWithFixedDelay( | ||
() -> { | ||
|
@@ -293,6 +297,18 @@ public boolean shouldEvict(boolean aggressiveMemoryFileEvictEnabled, double evic | |
|
||
public ServingState currentServingState() { | ||
long memoryUsage = getMemoryUsage(); | ||
long allocatedMemory; | ||
if (networkMemoryAllocatorPooled) { | ||
allocatedMemory = getAllocatedMemory(); | ||
} else { | ||
allocatedMemory = memoryUsage; | ||
} | ||
// trigger resume | ||
// CELEBORN-1792: resume should use pinnedDirectMemory instead of usedDirectMemory | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Although we don't need to switch to the paused state here, it would be better to call trim when the Netty used direct memory is above pausePushDataThreshold/pauseReplicateThreshold. WDYT? |
||
if (allocatedMemory / (double) (maxDirectMemory) < resumeRatio) { | ||
isPaused = false; | ||
return ServingState.NONE_PAUSED; | ||
} | ||
// pause replicate threshold always greater than pause push data threshold | ||
// so when trigger pause replicate, pause both push and replicate | ||
if (memoryUsage > pauseReplicateThreshold) { | ||
|
@@ -304,11 +320,6 @@ public ServingState currentServingState() { | |
isPaused = true; | ||
return ServingState.PUSH_PAUSED; | ||
} | ||
// trigger resume | ||
if (memoryUsage / (double) (maxDirectMemory) < resumeRatio) { | ||
isPaused = false; | ||
return ServingState.NONE_PAUSED; | ||
} | ||
// if isPaused and not trigger resume, then return pause push | ||
// wait for trigger resumeThreshold to resume state | ||
return isPaused ? ServingState.PUSH_PAUSED : ServingState.NONE_PAUSED; | ||
|
@@ -436,6 +447,16 @@ public long getMemoryUsage() { | |
return getNettyUsedDirectMemory() + sortMemoryCounter.get(); | ||
} | ||
|
||
public long getAllocatedMemory() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This method should be renamed to getPinnedMemory. The allocated memory is the Netty memory counter. |
||
return getNettyPinnedDirectMemory() + sortMemoryCounter.get(); | ||
} | ||
|
||
public long getNettyPinnedDirectMemory() { | ||
return NettyUtils.getAllPooledByteBufAllocators().stream() | ||
.mapToLong(PooledByteBufAllocator::pinnedDirectMemory) | ||
.sum(); | ||
} | ||
|
||
public AtomicLong getSortMemoryCounter() { | ||
return sortMemoryCounter; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we can add a new conf for pinnedMemoryToResume and keep the existing conf for directMemoryRatioToResume.