Improve cluster-checker tool (#47)
tardieu authored Sep 4, 2024
1 parent b68f6cf commit 236dd87
Showing 2 changed files with 174 additions and 88 deletions.
28 changes: 24 additions & 4 deletions tools/cluster-checker/README.md
@@ -1,17 +1,37 @@
# Cluster Checker

The tool in this directory diagnoses the state of a cluster looking for common
issues and producing a gpu allocation summary per namespace and for the entire
cluster.
The tool in this directory produces a summary view of GPU quotas and utilization
on the cluster. It also diagnoses the state of the cluster, looking for common
issues.

The tool is implemented in JavaScript and intended to run with Node.js.

Install [Node.js](https://nodejs.org/), which includes the npm package manager.

Install dependencies with:
```sh
npm install
```

Run with:
Run the tool against the current Kubernetes context with:
```sh
node checker.js
```
```
CLUSTER QUEUE       GPU QUOTA GPU USAGE ADMITTED WORKLOADS PENDING WORKLOADS
team1-cluster-queue         8        16                  1                 0
team2-cluster-queue         8         4                  4                 0

Total GPU count in cluster:        24
Unschedulable GPU count:         -  0
Schedulable GPU count:           = 24

Nominal GPU quota:                 16
Slack GPU quota:                 +  8
Total GPU quota:                 = 24

GPU usage by admitted workloads:   20
Borrowed GPU count:                 8

WARNING: workload "default/pytorchjob-job-e6381" refers to a non-existent local queue "test-queue"
```
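
The arithmetic behind the summary lines can be reproduced in a few lines of Node.js. This is only an illustrative sketch using the numbers from the sample output above; the variable names are not the checker's internals:
```js
// Numbers taken from the sample output above (illustrative only).
const clusterGPUs = 24      // GPUs reported by all nodes
const unschedulableGPUs = 0 // GPUs on nodes tainted NoExecute or NoSchedule
const nominalQuota = 8 + 8  // team1 + team2 nominal quotas (slack queue excluded)
const slackQuota = 8        // lending limit on the slack cluster queue
const usage = 16 + 4        // GPUs requested by admitted workloads

const schedulableGPUs = clusterGPUs - unschedulableGPUs // 24
const totalQuota = nominalQuota + slackQuota            // 24
const borrowed = Math.max(0, 16 - 8)                    // team1 usage above its 8-GPU nominal quota

console.log({ schedulableGPUs, totalQuota, usage, borrowed })

// The checker warns when the nominal quota exceeds what is actually schedulable.
if (nominalQuota > schedulableGPUs) {
  console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
}
```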
234 changes: 150 additions & 84 deletions tools/cluster-checker/checker.js
@@ -58,6 +58,41 @@ class Client {
}
}

// pad value with spaces to the left
function pad (v, n) {
return String(v ?? '').padStart(n)
}

// format and print table
function printTable (table, kind, ...columns) {
const widths = { name: kind.length } // column widths
const names = Object.keys(table).sort() // object names

// compute column widths
for (const name of names) {
widths.name = Math.max(widths.name, name.length)
for (const column of columns) {
widths[column[1]] = Math.max(widths[column[1]] ?? column[0].length, String(table[name][column[1]] ?? '').length)
}
}

// print table header
let header = kind.toUpperCase().padEnd(widths.name, ' ')
for (const column of columns) {
header += ' ' + pad(column[0].toUpperCase(), widths[column[1]])
}
console.log(header)

// print table rows
for (const name of names) {
let row = name.padEnd(widths.name, ' ')
for (const column of columns) {
row += ' ' + pad(table[name][column[1]], widths[column[1]])
}
console.log(row)
}
}

// return the number of GPUs reserved by the pod
function reservation (pod) {
if (pod.status?.phase === 'Succeeded' || pod.status?.phase === 'Failed') {
@@ -76,74 +111,64 @@ function reservation (pod) {
return 0 // pod has not been scheduled yet
}
let gpus = 0
// compute sum of regular containers
// compute sum of container gpu limits
for (const container of pod.spec.containers) {
gpus += parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? "0")
gpus += parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0')
}
// compute max with init containers
// compute max with init container gpu limits
for (const container of pod.spec.initContainers ?? []) {
gpus = Math.max(gpus, parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? "0"))
gpus = Math.max(gpus, parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0'))
}
return gpus
}

// check workloads in user namespace and report total GPU usage for namespace
async function checkUserNamespace (client, namespace, quotas, localQueues) {
// extract local queue names and compute GPU quota for namespace
const clusterQueueNames = new Set() // accessible cluster queues without repetition
const queueNames = [] // local queue names
for (const localQueue of localQueues) {
clusterQueueNames.add(localQueue.spec.clusterQueue)
queueNames.push(localQueue.metadata.name)
}
let quota = 0 // gpu quota
for (const clusterQueueName of clusterQueueNames) {
quota += quotas[clusterQueueName]
}

const pods = await client.pods(namespace.metadata.name)
// check user namespace
async function checkUserNamespace (client, namespace, queues) {
const workloads = await client.workloads(namespace.metadata.name)

let gpus = 0 // GPUs in use by scheduled pods
for (const pod of pods) {
gpus += reservation(pod)
}

// check every workload
for (const workload of workloads) {
// check queue name
// report invalid queue names
let queueName = workload.spec.queueName
if (queueName) {
if (!queueNames.includes(queueName)) {
console.log(`- Workload "${namespace.metadata.name}/${workload.metadata.name}" refers to a non-existent queue "${queueName}"`)
if (!queues.find(queue => queue.metadata.name === queueName)) {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" refers to a non-existent local queue "${queueName}"`)
}
} else {
console.log(`- Workload "${namespace.metadata.name}/${workload.metadata.name}" is missing a queue name`)
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" is missing a local queue name`)
}
}

console.log(`Namespace "${namespace.metadata.name}" uses ${gpus} GPU(s) and has a quota of ${quota} GPU(s)`)
console.log()
return gpus
// report high-priority workloads
if (workload.spec.priorityClassName !== 'default-priority' && workload.spec.priorityClassName !== 'low-priority') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has priority "${workload.spec.priorityClassName}"`)
}

// report unusual conditions
const conditions = {}
for (const condition of workload.status?.conditions ?? []) {
conditions[condition.type] = condition.status
}
if (conditions['Admitted'] === 'True' && conditions['PodsReady'] === 'False') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has conditions Admitted=True and PodsReady=False`)
}
if (conditions['Evicted'] === 'True') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
}
}
}

// identify pods using GPUs in system namespace and report total GPU usage for namespace
async function checkSystemNamespace (client, namespace) {
// check system namespace
async function checkSystemNamespace (client, namespace, nodes) {
const pods = await client.pods(namespace.metadata.name)

let gpus = 0 // GPUs in use by scheduled pods
for (const pod of pods) {
const n = reservation(pod)
if (n) {
console.log(`- System pod "${namespace.metadata.name}/${pod.metadata.name}" uses ${n} GPU(s)`)
gpus += n
// report GPU occupancy
const gpus = reservation(pod)
if (gpus) {
const node = nodes.find(node => node.metadata.name === pod.spec.nodeName)
console.log(`WARNING: pod "${namespace.metadata.name}/${pod.metadata.name}" occupies ${gpus} GPU(s) ` +
`on node "${pod.spec.nodeName}" with GPU taints noExecute=${node?.noExecute} and noSchedule=${node?.noSchedule}`)
}
}
if (gpus > 0) {
console.log(`System namespace "${namespace.metadata.name}" uses ${gpus} GPU(s) but has no quota`)
console.log()
}
return gpus
}

async function main () {
@@ -152,99 +177,140 @@ async function main () {
const client = new Client()

let clusterGPUs = 0 // cluster capacity
let noScheduleGPUs = 0 // unschedulable GPUs
let noScheduleGPUs = 0 // no-schedule GPUs
let noExecuteGPUs = 0 // no-execute GPUs
let userGPUs = 0 // GPU usage by user namespaces
let systemGPUs = 0 // GPU usage by system namespaces
let usedGPUs = 0 // GPU usage by admitted workloads
let borrowedGPUs = 0 // GPU borrowed from the cohort
let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
let slackGPUs = 0 // lending limit on slack queue

// load taint configuration
// load codeflare operator configuration
const configMap = await client.readConfigMap('codeflare-operator-config', 'redhat-ods-applications')
const config = k8s.loadYaml(configMap.data['config.yaml'])
const taints = config.appwrapper?.Config?.autopilot?.resourceTaints?.['nvidia.com/gpu']
const slackQueueName = config.appwrapper?.Config?.slackQueueName

// compute GPU counts
const nodes = await client.nodes()
for (const node of nodes) {
const gpus = parseInt(node.status.capacity['nvidia.com/gpu'])
const gpus = parseInt(node.status.capacity['nvidia.com/gpu'] ?? '0')
if (gpus > 0) {
clusterGPUs += gpus
let noSchedule = false
let noExecute = false
node.noSchedule = false
node.noExecute = false
for (const taint of taints ?? []) {
if (node.metadata.labels?.[taint.key] === taint.value) {
if (taint.effect === 'NoExecute') {
noExecute = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
noSchedule = true
node.noSchedule = true
}
}
}
for (const taint of node.spec.taints ?? []) {
if (taint.effect === 'NoExecute') {
noExecute = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
noSchedule = true
node.noSchedule = true
}
}
if (noExecute) {
noExecuteGPUs += gpus
} else if (noSchedule) {
noScheduleGPUs += gpus
if (node.noExecute) {
noExecuteGPUs += gpus
} else if (node.noSchedule) { // no double counting
noScheduleGPUs += gpus
}
}
}

// compute GPU quotas for each cluster queue
// collect cluster queue metrics
const clusterQueues = await client.clusterQueues()
const quotas = {}
const queues = {}
for (const clusterQueue of clusterQueues) {
quotas[clusterQueue.metadata.name] = 0
const queue = {
quota: 0, usage: 0, borrowed: 0, lendingLimit: 0,
admitted: clusterQueue.status?.admittedWorkloads ?? 0,
pending: clusterQueue.status?.pendingWorkloads ?? 0
}
for (const resourceGroup of clusterQueue.spec.resourceGroups) {
if (resourceGroup.coveredResources.includes('nvidia.com/gpu')) {
for (const flavor of resourceGroup.flavors) {
for (const resource of flavor.resources) {
if (resource.name === 'nvidia.com/gpu') {
quotas[clusterQueue.metadata.name] += resource.nominalQuota
queue.quota += parseInt(resource.nominalQuota ?? '0')
// lending limit is nominal quota if not set
queue.lendingLimit += parseInt(resource.lendingLimit ?? resource.nominalQuota ?? '0')
break // resource may only occur once in flavor
}
}
}
break // resource may only belong to one resource group
}
}
for (const flavor of clusterQueue.status?.flavorsUsage ?? []) {
for (const resource of flavor.resources) {
if (resource.name === 'nvidia.com/gpu') {
queue.usage += parseInt(resource.total ?? '0')
queue.borrowed += parseInt(resource.borrowed ?? '0')
break // resource may only occur once in flavor
}
}
}
usedGPUs += queue.usage
borrowedGPUs += queue.borrowed
if (clusterQueue.metadata.name === slackQueueName) {
slackGPUs = queue.lendingLimit
// do not include slack queue in table
} else {
quotaGPUs += queue.quota
queues[clusterQueue.metadata.name] = queue
}
}

// print cluster queue table
printTable(queues, 'cluster queue', ['gpu quota', 'quota'], ['gpu usage', 'usage'],
['admitted workloads', 'admitted'], ['pending workloads', 'pending'])
console.log()

// print summary results
const width = Math.max(String(clusterGPUs).length, String(quotaGPUs).length)
console.log(`Total GPU count in cluster: ${pad(clusterGPUs, width)}`)
console.log(`Unschedulable GPU count: - ${pad(noExecuteGPUs + noScheduleGPUs, width)}`)
console.log(`Schedulable GPU count: = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`)
console.log()
console.log(`Nominal GPU quota: ${pad(quotaGPUs, width)}`)
console.log(`Slack GPU quota: + ${pad(slackGPUs, width)}`)
console.log(`Total GPU quota: = ${pad(quotaGPUs + slackGPUs, width)}`)
console.log()
console.log(`GPU usage by admitted workloads: ${pad(usedGPUs, width)}`)
console.log(`Borrowed GPU count: ${pad(borrowedGPUs, width)}`)
console.log()

if (quotaGPUs > clusterGPUs - noExecuteGPUs - noScheduleGPUs) {
console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
}

// check all namespaces
// check all accessible namespaces
const namespaces = await client.namespaces()
for (const namespace of namespaces) {
if (namespace.metadata.name.startsWith('openshift-')) {
continue // skip openshift namespaces
}

const localQueues = await client.localQueues(namespace.metadata.name)
let localQueues
try {
localQueues = await client.localQueues(namespace.metadata.name)
} catch (err) {
continue // skip inaccessible namespaces
}

if (localQueues.length === 0) {
systemGPUs += await checkSystemNamespace(client, namespace)
await checkSystemNamespace(client, namespace, nodes)
} else {
userGPUs += await checkUserNamespace(client, namespace, quotas, localQueues)
await checkUserNamespace(client, namespace, localQueues)
}
}

// print summary results
console.log(`${clusterGPUs} GPU(s) in cluster`)
if (noExecuteGPUs) {
console.log(`${noExecuteGPUs} GPU(s) tainted NoExecute`)
}
if (noScheduleGPUs) {
console.log(`${noScheduleGPUs} GPU(s) tainted NoSchedule`)
}
console.log(`${userGPUs} GPU(s) used by scheduled workloads`)
if (systemGPUs > 0) {
console.log(`${systemGPUs} GPU(s) used by system pods`)
}

} catch (e) {
console.error(e)
} catch (err) {
console.error(err)
}
}
