Skip to content

Commit

Permalink
Improve reporting of lending limit
Browse files Browse the repository at this point in the history
  • Loading branch information
tardieu committed Oct 10, 2024
1 parent bfdd2aa commit b271c4e
Showing 1 changed file with 30 additions and 9 deletions.
39 changes: 30 additions & 9 deletions tools/cluster-checker/checker.js
Original file line number Diff line number Diff line change
Expand Up @@ -266,17 +266,20 @@ async function main () {
const client = new Client()

let clusterGPUs = 0 // cluster capacity
const noScheduleGPUs = 0 // no-schedule GPUs
const noExecuteGPUs = 0 // no-execute GPUs
let noScheduleGPUs = 0 // no-schedule GPUs
let noExecuteGPUs = 0 // no-execute GPUs
let usedGPUs = 0 // GPU usage by admitted workloads
let borrowedGPUs = 0 // GPU borrowed from the cohort
let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
let slackGPUs = 0 // lending limit on slack queue
let limitGPUs = 0 // lending limit on slack queue
let slackGPUs = 0 // nominal GPU quota on slack queue

const config = await client.readOperatorConfig()
const taints = config.autopilot?.resourceTaints?.['nvidia.com/gpu']
const slackQueueName = config.slackQueueName

let newline = false

// compute GPU counts
const nodes = await client.nodes()
for (const node of nodes) {
Expand All @@ -288,27 +291,39 @@ async function main () {
for (const taint of taints ?? []) {
if (node.metadata.labels?.[taint.key] === taint.value) {
if (taint.effect === 'NoExecute') {
console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
newline = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`)
newline = true
node.noSchedule = true
}
}
}
for (const taint of node.spec.taints ?? []) {
if (taint.effect === 'NoExecute') {
console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
newline = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`)
newline = true
node.noSchedule = true
}
}
if (node.noExecute) {
node.noExecuteGPUs += gpus
noExecuteGPUs += gpus
} else if (node.noSchedule) { // no double counting
node.noScheduleGPUs += gpus
noScheduleGPUs += gpus
}
}
}

if (newline) {
console.log()
}

// collect cluster queue metrics
const clusterQueues = await client.clusterQueues()
const queues = {}
Expand Down Expand Up @@ -348,7 +363,8 @@ async function main () {
usedGPUs += queue.usage
borrowedGPUs += queue.borrowed
if (clusterQueue.metadata.name === slackQueueName) {
slackGPUs = queue.lendingLimit
slackGPUs = queue.quota
limitGPUs = queue.lendingLimit
// do not include slack queue in table
} else {
quotaGPUs += queue.quota
Expand All @@ -368,8 +384,9 @@ async function main () {
console.log(`Schedulable GPU count: = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`)
console.log()
console.log(`Nominal GPU quota: ${pad(quotaGPUs, width)}`)
console.log(`Slack GPU quota: + ${pad(slackGPUs, width)}`)
console.log(`Total GPU quota: = ${pad(quotaGPUs + slackGPUs, width)}`)
console.log(`Maximum slack GPU quota: + ${pad(slackGPUs, width)}`)
console.log(`Slack GPU quota adjustment: - ${pad(slackGPUs - limitGPUs, width)}`)
console.log(`Current GPU quota: = ${pad(quotaGPUs + limitGPUs, width)}`)
console.log()
console.log(`GPU usage by admitted workloads: ${pad(usedGPUs, width)}`)
console.log(`Borrowed GPU count: ${pad(borrowedGPUs, width)}`)
Expand All @@ -379,8 +396,12 @@ async function main () {
console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
}

if (quotaGPUs + slackGPUs < clusterGPUs) {
console.log('WARNING: maximum GPU quota is lower than total GPU count')
}

if (quotaGPUs + slackGPUs > clusterGPUs) {
console.log('WARNING: total GPU quota is greater than total GPU count')
console.log('WARNING: maximum GPU quota is greater than total GPU count')
}

// check all accessible namespaces
Expand Down

0 comments on commit b271c4e

Please sign in to comment.