Skip to content

Commit

Permalink
basic checking of container resources (#59)
Browse files Browse the repository at this point in the history
  • Loading branch information
dgrove-oss authored Sep 16, 2024
1 parent acd30fe commit 951c1a7
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 2 deletions.
62 changes: 62 additions & 0 deletions tools/cluster-checker/checker.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
'use strict'

const k8s = require('@kubernetes/client-node')
const k8srp = require('kubernetes-resource-parser')

// Per-Node resource capacities that checkContainerResources compares container
// requests against. Keys are Kubernetes resource names; 'memory' is a quantity
// string that is run through k8srp.memoryParser before it is compared.
// NOTE(review): these values are hard-coded — confirm they match the nodes of
// the cluster being checked.
const nodeResources = {
'nvidia.com/gpu' : 8,
'nvidia.com/roce_gdr' : 2,
'cpu' : 80,
'memory' : '800G'
}

class Client {
constructor () {
Expand Down Expand Up @@ -140,6 +148,50 @@ function reservation (pod) {
return gpus
}

// check container resource requests against nodeResources
/**
 * Logs a WARNING line on the console for every resource request of a container
 * that either exceeds what a single Node offers (per nodeResources) or is out
 * of proportion with the number of GPUs the container requests.
 *
 * @param {object} namespace - object whose metadata.name is used in the messages
 * @param {object} workload - object whose metadata.name is used in the messages
 * @param {object} container - container spec; resources.requests and
 *   resources.limits are read when present
 * @returns {void}
 */
function checkContainerResources(namespace, workload, container) {
  // selectively merge limits into requests: a limit is only used for a
  // resource that has no explicit request (later spread wins)
  const resources = {
    ...(container.resources?.limits ?? {}),
    ...(container.resources?.requests ?? {})
  }

  // explicit radix so that a value such as '08' or '0x8' is never read as non-decimal
  const gpus = Number.parseInt(resources['nvidia.com/gpu'] ?? '0', 10)
  const gdr = Number.parseInt(resources['nvidia.com/roce_gdr'] ?? '0', 10)
  const cpus = k8srp.cpuParser(resources['cpu'] ?? '0')
  const mem = k8srp.memoryParser(resources['memory'] ?? '0')

  const nodeGpus = nodeResources['nvidia.com/gpu']
  const nodeGdr = nodeResources['nvidia.com/roce_gdr']
  const nodeCpus = nodeResources['cpu']
  const nodeMem = k8srp.memoryParser(nodeResources['memory'])

  // the names are only dereferenced when a warning is actually emitted
  const warn = (detail) => {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${detail}`)
  }

  // warn if the resource requests cannot be satisfied by a Node
  if (gpus > nodeGpus) {
    warn(`"${gpus} GPUs"`)
  }
  // compare against the 'nvidia.com/roce_gdr' capacity; looking up a key that
  // does not exist in nodeResources would make this comparison always false
  if (gdr > nodeGdr) {
    warn(`"${gdr} roce_gdr interfaces"`)
  }
  if (cpus > nodeCpus) {
    warn(`"${cpus} CPUs"`)
  }
  if (mem > nodeMem) {
    warn(`${resources['memory']} memory`)
  }

  // warn if the resource:GPU ratio is not proportional to Node resources
  if (gdr > 0 && (gpus === 0 || gpus / gdr < nodeGpus / nodeGdr)) {
    warn(`${gdr} roce_gdr but only ${gpus} GPUs`)
  }
  if (gpus > 0 && cpus > 0 && cpus / gpus > nodeCpus / nodeGpus) {
    warn(`${cpus} cpus but only ${gpus} GPUs`)
  }
  if (gpus > 0 && mem > 0 && mem / gpus > nodeMem / nodeGpus) {
    warn(`${resources['memory']} memory but only ${gpus} GPUs`)
  }
}

// check user namespace
async function checkUserNamespace (client, namespace, queues) {
const workloads = await client.workloads(namespace.metadata.name)
Expand Down Expand Up @@ -171,6 +223,16 @@ async function checkUserNamespace (client, namespace, queues) {
if (conditions['Evicted'] === 'True') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
}

// report misconfigured resource requests
for (const podSet of workload.spec?.podSets) {
for (const ic of podSet.template?.spec?.initContainers ?? []) {
checkContainerResources(namespace, workload, ic)
}
for (const c of podSet.template?.spec?.containers ?? []) {
checkContainerResources(namespace, workload, c)
}
}
}
}

Expand Down
9 changes: 8 additions & 1 deletion tools/cluster-checker/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion tools/cluster-checker/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"dependencies": {
"@kubernetes/client-node": "^0.21.0"
"@kubernetes/client-node": "^0.21.0",
"kubernetes-resource-parser": "0.1.0"
}
}

0 comments on commit 951c1a7

Please sign in to comment.