diff --git a/base/node-problem-detector/repository.yaml b/base/node-problem-detector/repository.yaml
index 967b38f4..8777a12a 100644
--- a/base/node-problem-detector/repository.yaml
+++ b/base/node-problem-detector/repository.yaml
@@ -5,4 +5,6 @@ metadata:
   namespace: node-problem-detector
 spec:
   interval: 5m
-  url: https://charts.deliveryhero.io/
+  # An oci:// URL is only honoured when the repository type is "oci".
+  type: oci
+  url: oci://ghcr.io/deliveryhero/helm-charts
diff --git a/base/node-problem-detector/values.yaml b/base/node-problem-detector/values.yaml
index b1bc5ded..25058095 100644
--- a/base/node-problem-detector/values.yaml
+++ b/base/node-problem-detector/values.yaml
@@ -20,7 +20,49 @@ resources:
     memory: 18Mi
 
 settings:
   log_monitors:
     - /config/kernel-monitor.json
     - /config/readonly-monitor.json
-    - /config/health-checker-containerd.json
+
+  # health-checker-containerd.json declares "plugin": "custom", so it is
+  # activated through custom_plugin_monitors instead of log_monitors.
+  custom_plugin_monitors:
+    - /custom-config/health-checker-containerd.json
+
+  # Each key below becomes a file under /custom-config/. pluginConfig's
+  # max_output_length and concurrency must be JSON numbers, not strings.
+  custom_monitor_definitions:
+    health-checker-containerd.json: |
+      {
+        "plugin": "custom",
+        "pluginConfig": {
+          "invoke_interval": "10s",
+          "timeout": "3m",
+          "max_output_length": 80,
+          "concurrency": 1
+        },
+        "source": "health-checker",
+        "metricsReporting": true,
+        "conditions": [
+          {
+            "type": "ContainerRuntimeUnhealthy",
+            "reason": "ContainerRuntimeIsHealthy",
+            "message": "Container runtime on the node is functioning properly"
+          }
+        ],
+        "rules": [
+          {
+            "type": "permanent",
+            "condition": "ContainerRuntimeUnhealthy",
+            "reason": "ContainerdUnhealthy",
+            "path": "/home/kubernetes/bin/health-checker",
+            "args": [
+              "--component=cri",
+              "--enable-repair=true",
+              "--cooldown-time=2m",
+              "--health-check-timeout=60s"
+            ],
+            "timeout": "3m"
+          }
+        ]
+      }