From c8953e0e74004a89362e590d55ecdd743069ad27 Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Wed, 31 Jul 2024 14:36:13 -0500 Subject: [PATCH 1/3] Reserve allocation should be displayed when erroring due to lack of memory on startup Signed-off-by: Kuhu Shukla --- .../spark/rapids/GpuDeviceManager.scala | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala index 2cfce60c4a5..595cdcbff46 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala @@ -240,19 +240,27 @@ object GpuDeviceManager extends Logging { (conf.rmmAllocFraction * (info.free - reserveAmount)).toLong) if (poolAllocation < minAllocation) { throw new IllegalArgumentException(s"The pool allocation of " + - s"${toMB(poolAllocation)} MB (calculated from ${RapidsConf.RMM_ALLOC_FRACTION} " + - s"(=${conf.rmmAllocFraction}) and ${toMB(info.free)} MB free memory) was less than " + - s"the minimum allocation of ${toMB(minAllocation)} (calculated from " + - s"${RapidsConf.RMM_ALLOC_MIN_FRACTION} (=${conf.rmmAllocMinFraction}) " + - s"and ${toMB(info.total)} MB total memory)") + s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," + + s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," + + s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " + + s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" + + s"was less than allocation of ${toMB(minAllocation)} MiB (gpu.total: " + + s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " + + s"${conf.rmmAllocMinFraction} => gpu.total *" + + s"minAllocFraction = ${toMB(minAllocation)} MiB). 
Please ensure that the GPU has" + + s"enough free memory, or adjust configuration accordingly.") } if (maxAllocation < poolAllocation) { throw new IllegalArgumentException(s"The pool allocation of " + - s"${toMB(poolAllocation)} MB (calculated from ${RapidsConf.RMM_ALLOC_FRACTION} " + - s"(=${conf.rmmAllocFraction}) and ${toMB(info.free)} MB free memory) was more than " + - s"the maximum allocation of ${toMB(maxAllocation)} (calculated from " + - s"${RapidsConf.RMM_ALLOC_MAX_FRACTION} (=${conf.rmmAllocMaxFraction}) " + - s"and ${toMB(info.total)} MB total memory)") + s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," + + s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," + + s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " + + s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" + + s"was more than allocation of ${toMB(maxAllocation)} MiB (gpu.total: " + + s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " + + s"${conf.rmmAllocMaxFraction} => gpu.total *" + + s"maxAllocFraction = ${toMB(maxAllocation)} MiB). 
Please ensure that pool allocation" + + s"does not exceed maximum allocation and adjust configuration accordingly.") } if (reserveAmount >= maxAllocation) { throw new IllegalArgumentException(s"RMM reserve memory (${toMB(reserveAmount)} MB) " + From 92ed1fd19d3bc40503af62beb1ba1eafef05b606 Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Thu, 1 Aug 2024 10:55:31 -0500 Subject: [PATCH 2/3] Address review comments Signed-off-by: Kuhu Shukla --- .../spark/rapids/GpuDeviceManager.scala | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala index 595cdcbff46..16f7e94f4c7 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala @@ -216,7 +216,7 @@ object GpuDeviceManager extends Logging { } } - private def toMB(x: Long): Double = x / 1024 / 1024.0 + private def toMiB(x: Long): Double = x / 1024 / 1024.0 private def computeRmmPoolSize(conf: RapidsConf, info: CudaMemInfo): Long = { def truncateToAlignment(x: Long): Long = x & ~511L @@ -238,41 +238,39 @@ object GpuDeviceManager extends Logging { } var poolAllocation = truncateToAlignment( (conf.rmmAllocFraction * (info.free - reserveAmount)).toLong) + val errorPhrase = "The pool allocation of " + + s"${toMiB(poolAllocation)} MiB (gpu.free: ${toMiB(info.free)}," + + s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," + + s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " + + s"(gpu.free - reserve) * allocFraction = ${toMiB(poolAllocation)}) was " if (poolAllocation < minAllocation) { - throw new IllegalArgumentException(s"The pool allocation of " + - s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," + - s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," + - s"${RapidsConf.RMM_ALLOC_RESERVE}: 
${reserveAmount} => " + - s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" + - s"was less than allocation of ${toMB(minAllocation)} MiB (gpu.total: " + - s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " + + throw new IllegalArgumentException(errorPhrase + + s"less than allocation of ${toMiB(minAllocation)} MiB (gpu.total: " + + s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " + s"${conf.rmmAllocMinFraction} => gpu.total *" + - s"minAllocFraction = ${toMB(minAllocation)} MiB). Please ensure that the GPU has" + + s"minAllocFraction = ${toMiB(minAllocation)} MiB). Please ensure that the GPU has" + s"enough free memory, or adjust configuration accordingly.") } if (maxAllocation < poolAllocation) { - throw new IllegalArgumentException(s"The pool allocation of " + - s"${toMB(poolAllocation)} MiB (gpu.free: ${toMB(info.free)}," + - s"${RapidsConf.RMM_ALLOC_FRACTION}: (=${conf.rmmAllocFraction}," + - s"${RapidsConf.RMM_ALLOC_RESERVE}: ${reserveAmount} => " + - s"(gpu.free - reserve) * allocFraction = ${toMB(poolAllocation)})" + - s"was more than allocation of ${toMB(maxAllocation)} MiB (gpu.total: " + - s"${toMB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " + + throw new IllegalArgumentException(errorPhrase + + s"more than allocation of ${toMiB(maxAllocation)} MiB (gpu.total: " + + s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " + s"${conf.rmmAllocMaxFraction} => gpu.total *" + - s"maxAllocFraction = ${toMB(maxAllocation)} MiB). Please ensure that pool allocation" + + s"maxAllocFraction = ${toMiB(maxAllocation)} MiB). Please ensure that pool allocation" + s"does not exceed maximum allocation and adjust configuration accordingly.") } if (reserveAmount >= maxAllocation) { - throw new IllegalArgumentException(s"RMM reserve memory (${toMB(reserveAmount)} MB) " + - s"larger than maximum pool size (${toMB(maxAllocation)} MB). 
Check the settings for " + + throw new IllegalArgumentException(s"RMM reserve memory (${toMiB(reserveAmount)} MB) " + + s"larger than maximum pool size (${toMiB(maxAllocation)} MB). Check the settings for " + s"${RapidsConf.RMM_ALLOC_MAX_FRACTION} (=${conf.rmmAllocFraction}) and " + s"${RapidsConf.RMM_ALLOC_RESERVE} (=$reserveAmount)") } val adjustedMaxAllocation = truncateToAlignment(maxAllocation - reserveAmount) if (poolAllocation > adjustedMaxAllocation) { - logWarning(s"RMM pool allocation (${toMB(poolAllocation)} MB) does not leave enough free " + - s"memory for reserve memory (${toMB(reserveAmount)} MB), lowering the pool size to " + - s"${toMB(adjustedMaxAllocation)} MB to accommodate the requested reserve amount.") + logWarning(s"RMM pool allocation (${toMiB(poolAllocation)} MB) does not leave enough" + + s"free memory for reserve memory (${toMiB(reserveAmount)} MB), lowering the pool " + + s"size to ${toMiB(adjustedMaxAllocation)} MB to " + + s"accommodate the requested reserve amount.") poolAllocation = adjustedMaxAllocation } @@ -356,7 +354,7 @@ object GpuDeviceManager extends Logging { deviceId = Some(gpuId) logInfo(s"Initializing RMM${features.mkString(" ", " ", "")} " + - s"pool size = ${toMB(poolAllocation)} MB on gpuId $gpuId") + s"pool size = ${toMiB(poolAllocation)} MB on gpuId $gpuId") if (Cuda.isPtdsEnabled()) { logInfo("Using per-thread default stream") From 9e84b60149460f7ebb095eb8951059a00d986fe4 Mon Sep 17 00:00:00 2001 From: Kuhu Shukla Date: Thu, 31 Oct 2024 13:48:43 -0500 Subject: [PATCH 3/3] Address minor review comments and switch to 24.12 Signed-off-by: Kuhu Shukla --- .../scala/com/nvidia/spark/rapids/GpuDeviceManager.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala index 16f7e94f4c7..b0c86773166 100644 --- 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala @@ -248,7 +248,7 @@ object GpuDeviceManager extends Logging { s"less than allocation of ${toMiB(minAllocation)} MiB (gpu.total: " + s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MIN_FRACTION}: " + s"${conf.rmmAllocMinFraction} => gpu.total *" + - s"minAllocFraction = ${toMiB(minAllocation)} MiB). Please ensure that the GPU has" + + s"minAllocFraction = ${toMiB(minAllocation)} MiB). Please ensure that the GPU has " + s"enough free memory, or adjust configuration accordingly.") } if (maxAllocation < poolAllocation) { @@ -256,8 +256,8 @@ object GpuDeviceManager extends Logging { s"more than allocation of ${toMiB(maxAllocation)} MiB (gpu.total: " + s"${toMiB(info.total)} MiB, ${RapidsConf.RMM_ALLOC_MAX_FRACTION}: " + s"${conf.rmmAllocMaxFraction} => gpu.total *" + - s"maxAllocFraction = ${toMiB(maxAllocation)} MiB). Please ensure that pool allocation" + - s"does not exceed maximum allocation and adjust configuration accordingly.") + s"maxAllocFraction = ${toMiB(maxAllocation)} MiB). Please ensure that pool " + + s"allocation does not exceed maximum allocation and adjust configuration accordingly.") } if (reserveAmount >= maxAllocation) { throw new IllegalArgumentException(s"RMM reserve memory (${toMiB(reserveAmount)} MB) " + @@ -267,7 +267,7 @@ object GpuDeviceManager extends Logging { } val adjustedMaxAllocation = truncateToAlignment(maxAllocation - reserveAmount) if (poolAllocation > adjustedMaxAllocation) { - logWarning(s"RMM pool allocation (${toMiB(poolAllocation)} MB) does not leave enough" + + logWarning(s"RMM pool allocation (${toMiB(poolAllocation)} MB) does not leave enough " + s"free memory for reserve memory (${toMiB(reserveAmount)} MB), lowering the pool " + s"size to ${toMiB(adjustedMaxAllocation)} MB to " + s"accommodate the requested reserve amount.")