Improve cluster-checker tool (#47)
tardieu authored Sep 4, 2024
1 parent b68f6cf commit 236dd87
Showing 2 changed files with 174 additions and 88 deletions.
28 changes: 24 additions & 4 deletions tools/cluster-checker/README.md
@@ -1,17 +1,37 @@
# Cluster Checker

The tool in this directory diagnoses the state of a cluster looking for common
issues and producing a gpu allocation summary per namespace and for the entire
cluster.
The tool in this directory produces a summary view of GPU quotas and utilization
on the cluster. It also diagnoses the state of the cluster, looking for common
issues.

The tool is implemented in JavaScript and intended to run with Node.js.

Install [Node.js](https://nodejs.org/), which includes the npm package manager.

Install dependencies with:
```sh
npm install
```

Run with:
Run the tool against the current Kubernetes context with:
```sh
node checker.js
```
```
CLUSTER QUEUE       GPU QUOTA GPU USAGE ADMITTED WORKLOADS PENDING WORKLOADS
team1-cluster-queue         8        16                  1                 0
team2-cluster-queue         8         4                  4                 0

Total GPU count in cluster:        24
Unschedulable GPU count:         -  0
Schedulable GPU count:           = 24

Nominal GPU quota:                 16
Slack GPU quota:                 +  8
Total GPU quota:                 = 24

GPU usage by admitted workloads:   20
Borrowed GPU count:                 8

WARNING: workload "default/pytorchjob-job-e6381" refers to a non-existent local queue "test-queue"
```
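
The arithmetic behind the summary lines can be reproduced in a few lines of Node.js. This is only an illustrative sketch using the numbers from the sample output above; the variable names are not the checker's internals:
```js
// Numbers taken from the sample output above (illustrative only).
const clusterGPUs = 24      // GPUs reported by all nodes
const unschedulableGPUs = 0 // GPUs on nodes tainted NoExecute or NoSchedule
const nominalQuota = 8 + 8  // team1 + team2 nominal quotas (slack queue excluded)
const slackQuota = 8        // lending limit on the slack cluster queue
const usage = 16 + 4        // GPUs requested by admitted workloads

const schedulableGPUs = clusterGPUs - unschedulableGPUs // 24
const totalQuota = nominalQuota + slackQuota            // 24
const borrowed = Math.max(0, 16 - 8)                    // team1 usage above its 8-GPU nominal quota

console.log({ schedulableGPUs, totalQuota, usage, borrowed })

// The checker warns when the nominal quota exceeds what is actually schedulable.
if (nominalQuota > schedulableGPUs) {
  console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
}
```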
234 changes: 150 additions & 84 deletions tools/cluster-checker/checker.js
@@ -58,6 +58,41 @@ class Client {
}
}

// pad value with spaces to the left
function pad (v, n) {
return String(v ?? '').padStart(n)
}

// format and print table
function printTable (table, kind, ...columns) {
const widths = { name: kind.length } // column widths
const names = Object.keys(table).sort() // object names

// compute column widths
for (const name of names) {
widths.name = Math.max(widths.name, name.length)
for (const column of columns) {
widths[column[1]] = Math.max(widths[column[1]] ?? column[0].length, String(table[name][column[1]] ?? '').length)
}
}

// print table header
let header = kind.toUpperCase().padEnd(widths.name, ' ')
for (const column of columns) {
header += ' ' + pad(column[0].toUpperCase(), widths[column[1]])
}
console.log(header)

// print table rows
for (const name of names) {
let row = name.padEnd(widths.name, ' ')
for (const column of columns) {
row += ' ' + pad(table[name][column[1]], widths[column[1]])
}
console.log(row)
}
}

// return the number of GPUs reserved by the pod
function reservation (pod) {
if (pod.status?.phase === 'Succeeded' || pod.status?.phase === 'Failed') {
@@ -76,74 +111,64 @@ function reservation (pod) {
return 0 // pod has not been scheduled yet
}
let gpus = 0
// compute sum of regular containers
// compute sum of container gpu limits
for (const container of pod.spec.containers) {
gpus += parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? "0")
gpus += parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0')
}
// compute max with init containers
// compute max with init container gpu limits
for (const container of pod.spec.initContainers ?? []) {
gpus = Math.max(gpus, parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? "0"))
gpus = Math.max(gpus, parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0'))
}
return gpus
}

// check workloads in user namespace and report total GPU usage for namespace
async function checkUserNamespace (client, namespace, quotas, localQueues) {
// extract local queue names and compute GPU quota for namespace
const clusterQueueNames = new Set() // accessible cluster queues without repetition
const queueNames = [] // local queue names
for (const localQueue of localQueues) {
clusterQueueNames.add(localQueue.spec.clusterQueue)
queueNames.push(localQueue.metadata.name)
}
let quota = 0 // gpu quota
for (const clusterQueueName of clusterQueueNames) {
quota += quotas[clusterQueueName]
}

const pods = await client.pods(namespace.metadata.name)
// check user namespace
async function checkUserNamespace (client, namespace, queues) {
const workloads = await client.workloads(namespace.metadata.name)

let gpus = 0 // GPUs in use by scheduled pods
for (const pod of pods) {
gpus += reservation(pod)
}

// check every workload
for (const workload of workloads) {
// check queue name
// report invalid queue names
let queueName = workload.spec.queueName
if (queueName) {
if (!queueNames.includes(queueName)) {
console.log(`- Workload "${namespace.metadata.name}/${workload.metadata.name}" refers to a non-existent queue "${queueName}"`)
if (!queues.find(queue => queue.metadata.name === queueName)) {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" refers to a non-existent local queue "${queueName}"`)
}
} else {
console.log(`- Workload "${namespace.metadata.name}/${workload.metadata.name}" is missing a queue name`)
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" is missing a local queue name`)
}
}

console.log(`Namespace "${namespace.metadata.name}" uses ${gpus} GPU(s) and has a quota of ${quota} GPU(s)`)
console.log()
return gpus
// report high-priority workloads
if (workload.spec.priorityClassName !== 'default-priority' && workload.spec.priorityClassName !== 'low-priority') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has priority "${workload.spec.priorityClassName}"`)
}

// report unusual conditions
const conditions = {}
for (const condition of workload.status?.conditions ?? []) {
conditions[condition.type] = condition.status
}
if (conditions['Admitted'] === 'True' && conditions['PodsReady'] === 'False') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has conditions Admitted=True and PodsReady=False`)
}
if (conditions['Evicted'] === 'True') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
}
}
}

// identify pods using GPUs in system namespace and report total GPU usage for namespace
async function checkSystemNamespace (client, namespace) {
// check system namespace
async function checkSystemNamespace (client, namespace, nodes) {
const pods = await client.pods(namespace.metadata.name)

let gpus = 0 // GPUs in use by scheduled pods
for (const pod of pods) {
const n = reservation(pod)
if (n) {
console.log(`- System pod "${namespace.metadata.name}/${pod.metadata.name}" uses ${n} GPU(s)`)
gpus += n
// report GPU occupancy
const gpus = reservation(pod)
if (gpus) {
const node = nodes.find(node => node.metadata.name === pod.spec.nodeName)
console.log(`WARNING: pod "${namespace.metadata.name}/${pod.metadata.name}" occupies ${gpus} GPU(s) ` +
`on node "${pod.spec.nodeName}" with GPU taints noExecute=${node?.noExecute} and noSchedule=${node?.noSchedule}`)
}
}
if (gpus > 0) {
console.log(`System namespace "${namespace.metadata.name}" uses ${gpus} GPU(s) but has no quota`)
console.log()
}
return gpus
}

async function main () {
@@ -152,99 +177,140 @@ async function main () {
const client = new Client()

let clusterGPUs = 0 // cluster capacity
let noScheduleGPUs = 0 // unschedulable GPUs
let noScheduleGPUs = 0 // no-schedule GPUs
let noExecuteGPUs = 0 // no-execute GPUs
let userGPUs = 0 // GPU usage by user namespaces
let systemGPUs = 0 // GPU usage by system namespaces
let usedGPUs = 0 // GPU usage by admitted workloads
let borrowedGPUs = 0 // GPU borrowed from the cohort
let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
let slackGPUs = 0 // lending limit on slack queue

// load taint configuration
// load codeflare operator configuration
const configMap = await client.readConfigMap('codeflare-operator-config', 'redhat-ods-applications')
const config = k8s.loadYaml(configMap.data['config.yaml'])
const taints = config.appwrapper?.Config?.autopilot?.resourceTaints?.['nvidia.com/gpu']
const slackQueueName = config.appwrapper?.Config?.slackQueueName

// compute GPU counts
const nodes = await client.nodes()
for (const node of nodes) {
const gpus = parseInt(node.status.capacity['nvidia.com/gpu'])
const gpus = parseInt(node.status.capacity['nvidia.com/gpu'] ?? '0')
if (gpus > 0) {
clusterGPUs += gpus
let noSchedule = false
let noExecute = false
node.noSchedule = false
node.noExecute = false
for (const taint of taints ?? []) {
if (node.metadata.labels?.[taint.key] === taint.value) {
if (taint.effect === 'NoExecute') {
noExecute = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
noSchedule = true
node.noSchedule = true
}
}
}
for (const taint of node.spec.taints ?? []) {
if (taint.effect === 'NoExecute') {
noExecute = true
node.noExecute = true
} else if (taint.effect === 'NoSchedule') {
noSchedule = true
node.noSchedule = true
}
}
if (noExecute) {
noExecuteGPUs += gpus
} else if (noSchedule) {
noScheduleGPUs += gpus
if (node.noExecute) {
noExecuteGPUs += gpus
} else if (node.noSchedule) { // no double counting
noScheduleGPUs += gpus
}
}
}

// compute GPU quotas for each cluster queue
// collect cluster queue metrics
const clusterQueues = await client.clusterQueues()
const quotas = {}
const queues = {}
for (const clusterQueue of clusterQueues) {
quotas[clusterQueue.metadata.name] = 0
const queue = {
quota: 0, usage: 0, borrowed: 0, lendingLimit: 0,
admitted: clusterQueue.status?.admittedWorkloads ?? 0,
pending: clusterQueue.status?.pendingWorkloads ?? 0
}
for (const resourceGroup of clusterQueue.spec.resourceGroups) {
if (resourceGroup.coveredResources.includes('nvidia.com/gpu')) {
for (const flavor of resourceGroup.flavors) {
for (const resource of flavor.resources) {
if (resource.name === 'nvidia.com/gpu') {
quotas[clusterQueue.metadata.name] += resource.nominalQuota
queue.quota += parseInt(resource.nominalQuota ?? '0')
// lending limit is nominal quota if not set
queue.lendingLimit += parseInt(resource.lendingLimit ?? resource.nominalQuota ?? '0')
break // resource may only occur once in flavor
}
}
}
break // resource may only belong to one resource group
}
}
for (const flavor of clusterQueue.status?.flavorsUsage ?? []) {
for (const resource of flavor.resources) {
if (resource.name === 'nvidia.com/gpu') {
queue.usage += parseInt(resource.total ?? '0')
queue.borrowed += parseInt(resource.borrowed ?? '0')
break // resource may only occur once in flavor
}
}
}
usedGPUs += queue.usage
borrowedGPUs += queue.borrowed
if (clusterQueue.metadata.name === slackQueueName) {
slackGPUs = queue.lendingLimit
// do not include slack queue in table
} else {
quotaGPUs += queue.quota
queues[clusterQueue.metadata.name] = queue
}
}

// print cluster queue table
printTable(queues, 'cluster queue', ['gpu quota', 'quota'], ['gpu usage', 'usage'],
['admitted workloads', 'admitted'], ['pending workloads', 'pending'])
console.log()

// print summary results
const width = Math.max(String(clusterGPUs).length, String(quotaGPUs).length)
console.log(`Total GPU count in cluster: ${pad(clusterGPUs, width)}`)
console.log(`Unschedulable GPU count: - ${pad(noExecuteGPUs + noScheduleGPUs, width)}`)
console.log(`Schedulable GPU count: = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`)
console.log()
console.log(`Nominal GPU quota: ${pad(quotaGPUs, width)}`)
console.log(`Slack GPU quota: + ${pad(slackGPUs, width)}`)
console.log(`Total GPU quota: = ${pad(quotaGPUs + slackGPUs, width)}`)
console.log()
console.log(`GPU usage by admitted workloads: ${pad(usedGPUs, width)}`)
console.log(`Borrowed GPU count: ${pad(borrowedGPUs, width)}`)
console.log()

if (quotaGPUs > clusterGPUs - noExecuteGPUs - noScheduleGPUs) {
console.log('WARNING: nominal GPU quota is greater than schedulable GPU count')
}

// check all namespaces
// check all accessible namespaces
const namespaces = await client.namespaces()
for (const namespace of namespaces) {
if (namespace.metadata.name.startsWith('openshift-')) {
continue // skip openshift namespaces
}

const localQueues = await client.localQueues(namespace.metadata.name)
let localQueues
try {
localQueues = await client.localQueues(namespace.metadata.name)
} catch (err) {
continue // skip inaccessible namespaces
}

if (localQueues.length === 0) {
systemGPUs += await checkSystemNamespace(client, namespace)
await checkSystemNamespace(client, namespace, nodes)
} else {
userGPUs += await checkUserNamespace(client, namespace, quotas, localQueues)
await checkUserNamespace(client, namespace, localQueues)
}
}

// print summary results
console.log(`${clusterGPUs} GPU(s) in cluster`)
if (noExecuteGPUs) {
console.log(`${noExecuteGPUs} GPU(s) tainted NoExecute`)
}
if (noScheduleGPUs) {
console.log(`${noScheduleGPUs} GPU(s) tainted NoSchedule`)
}
console.log(`${userGPUs} GPU(s) used by scheduled workloads`)
if (systemGPUs > 0) {
console.log(`${systemGPUs} GPU(s) used by system pods`)
}

} catch (e) {
console.error(e)
} catch (err) {
console.error(err)
}
}
