neonvm: add readiness probe for sysfs scaling
Call the runner's /ready endpoint which, in sysfs scaling mode, proxies to the
daemon's /cpu endpoint to check whether the runner pod and the VM are ready. In
QMP scaling mode, the runner's /ready endpoint does nothing and always reports
ready.
Move the neonvm-daemon line in the inittab so that it starts right before vmstart.
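
In sysfs scaling mode the readiness chain is, schematically:

    kubelet --GET /ready--> neonvm-runner --GET http://<vm-ip>:25183/cpu--> neonvm-daemon

In QMP scaling mode the chain stops at the runner, which unconditionally reports ready.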

Signed-off-by: Mikhail Sakhnov <[email protected]>
mikhail-sakhnov committed Dec 30, 2024
1 parent 17a07a9 commit e5c9385
Showing 3 changed files with 82 additions and 4 deletions.
55 changes: 53 additions & 2 deletions neonvm-runner/cmd/main.go
@@ -1209,6 +1209,19 @@ func runQEMU(
             lastValue.Store(uint32(cpu))
             return nil
         },
+        ready: func(logger *zap.Logger) bool {
+            // if we are in sysfs mode, we need to check if the NeonVM Daemon is ready
+            if cfg.cpuScalingMode == vmv1.CpuScalingModeSysfs {
+                err := checkNeonvmDaemonCPU()
+                if err != nil {
+                    logger.Warn("neonvm-daemon ready probe failed", zap.Error(err))
+                    return false
+                }
+                return true
+            }
+            // do nothing for QMP mode
+            return true
+        },
     }

     wg.Add(1)
@@ -1336,8 +1349,9 @@ func handleCPUCurrent(
 }

 type cpuServerCallbacks struct {
-    get func(*zap.Logger) (*vmv1.MilliCPU, error)
-    set func(*zap.Logger, vmv1.MilliCPU) error
+    get   func(*zap.Logger) (*vmv1.MilliCPU, error)
+    set   func(*zap.Logger, vmv1.MilliCPU) error
+    ready func(*zap.Logger) bool
 }

 func listenForHTTPRequests(
@@ -1359,6 +1373,13 @@
     mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
         handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
     })
+    mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
+        if callbacks.ready(logger) {
+            w.WriteHeader(200)
+        } else {
+            w.WriteHeader(500)
+        }
+    })
     if networkMonitoring {
         reg := prometheus.NewRegistry()
         metrics := NewMonitoringMetrics(reg)
@@ -2004,3 +2025,33 @@ func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {

     return nil
 }
+
+// checkNeonvmDaemonCPU sends a GET request to the NeonVM Daemon to get the current CPU limit, for the sake of the readiness probe.
+func checkNeonvmDaemonCPU() error {
+    _, vmIP, _, err := calcIPs(defaultNetworkCIDR)
+    if err != nil {
+        return fmt.Errorf("could not calculate VM IP address: %w", err)
+    }
+
+    ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
+    defer cancel()
+
+    url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
+
+    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+    if err != nil {
+        return fmt.Errorf("could not build request: %w", err)
+    }
+
+    resp, err := http.DefaultClient.Do(req)
+    if err != nil {
+        return fmt.Errorf("could not send request: %w", err)
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != 200 {
+        return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
+    }
+
+    return nil
+}
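
The resulting probe chain can be exercised by hand with a small Go client that does what kubelet's HTTPGet probe does: GET the runner's /ready and treat only HTTP 200 as ready. This is a minimal sketch, not part of the commit; the helper name isRunnerReady and the address 127.0.0.1:8080 stand in for the runner pod's real address and RunnerPort.

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// isRunnerReady mirrors kubelet's HTTPGet readiness probe: GET /ready on the
// runner and treat exactly HTTP 200 as ready. In sysfs scaling mode the runner
// answers by proxying a GET to the daemon's /cpu; in QMP mode it always returns 200.
func isRunnerReady(ctx context.Context, runnerAddr string) (bool, error) {
    ctx, cancel := context.WithTimeout(ctx, time.Second)
    defer cancel()

    req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://%s/ready", runnerAddr), nil)
    if err != nil {
        return false, err
    }
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return false, err
    }
    defer resp.Body.Close()
    return resp.StatusCode == http.StatusOK, nil
}

func main() {
    // 127.0.0.1:8080 is a placeholder for the runner pod's address and port.
    ready, err := isRunnerReady(context.Background(), "127.0.0.1:8080")
    fmt.Println("ready:", ready, "err:", err)
}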
29 changes: 28 additions & 1 deletion pkg/neonvm/controllers/vm_controller.go
@@ -46,6 +46,7 @@ import (
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/apimachinery/pkg/util/intstr"
     "k8s.io/apiserver/pkg/storage/names"
     "k8s.io/client-go/tools/record"
@@ -932,12 +933,26 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
     case corev1.PodFailed:
         return runnerFailed
     case corev1.PodRunning:
-        return runnerRunning
+        return isRunnerPodReady(pod)
     default:
         panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
     }
 }

+// isRunnerPodReady returns whether the runner pod is ready, respecting the readiness probes of its containers.
+func isRunnerPodReady(pod *corev1.Pod) runnerStatusKind {
+    if pod.Status.ContainerStatuses == nil {
+        return runnerPending
+    }
+    for _, c := range pod.Status.ContainerStatuses {
+        // we only care about the neonvm-runner container
+        if c.Name == "neonvm-runner" && !c.Ready {
+            return runnerPending
+        }
+    }
+    return runnerRunning
+}
+
 // deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
 // then emits an event and log line about what it did, whether it actually deleted the runner pod.
 func (r *VMReconciler) deleteRunnerPodIfEnabled(
@@ -1414,6 +1429,18 @@ func podSpec(
             }
         }(),
         Resources: vm.Spec.PodResources,
+        ReadinessProbe: &corev1.Probe{
+            ProbeHandler: corev1.ProbeHandler{
+                HTTPGet: &corev1.HTTPGetAction{
+                    Path:   "/ready",
+                    Port:   intstr.FromInt32(vm.Spec.RunnerPort),
+                    Scheme: corev1.URISchemeHTTP,
+                },
+            },
+            InitialDelaySeconds: 5,
+            PeriodSeconds:       5,
+            FailureThreshold:    3,
+        },
     }

     return []corev1.Container{runner}
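
The readiness gating introduced in runnerStatus/isRunnerPodReady is easy to pin down with a table-driven test. A minimal sketch, assuming it sits in the same controllers package as vm_controller.go (the test itself is not part of the commit):

package controllers

import (
    "testing"

    corev1 "k8s.io/api/core/v1"
)

func TestIsRunnerPodReady(t *testing.T) {
    mkPod := func(statuses ...corev1.ContainerStatus) *corev1.Pod {
        return &corev1.Pod{Status: corev1.PodStatus{ContainerStatuses: statuses}}
    }
    cases := []struct {
        name string
        pod  *corev1.Pod
        want runnerStatusKind
    }{
        // no container statuses reported yet: not ready
        {"no statuses", mkPod(), runnerPending},
        // neonvm-runner exists but its readiness probe has not passed yet
        {"runner not ready", mkPod(corev1.ContainerStatus{Name: "neonvm-runner", Ready: false}), runnerPending},
        // neonvm-runner is ready
        {"runner ready", mkPod(corev1.ContainerStatus{Name: "neonvm-runner", Ready: true}), runnerRunning},
        // readiness of other containers is deliberately ignored
        {"sidecar ignored", mkPod(
            corev1.ContainerStatus{Name: "neonvm-runner", Ready: true},
            corev1.ContainerStatus{Name: "sidecar", Ready: false},
        ), runnerRunning},
    }
    for _, c := range cases {
        if got := isRunnerPodReady(c.pod); got != c.want {
            t.Errorf("%s: isRunnerPodReady() = %v, want %v", c.name, got, c.want)
        }
    }
}

With the probe settings above, a runner whose daemon stops answering fails three consecutive probes, so it is marked unready after roughly 15 seconds at the 5-second period, and the controller then reports the pod as runnerPending rather than runnerRunning.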
2 changes: 1 addition & 1 deletion vm-builder/files/inittab
@@ -1,13 +1,13 @@
 ::sysinit:/neonvm/bin/vminit
 ::once:/neonvm/bin/touch /neonvm/vmstart.allowed
-::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 -O 119 -s /neonvm/bin/udhcpc.script
 ::respawn:/neonvm/bin/udevd
 ::wait:/neonvm/bin/udev-init.sh
 ::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi
 ::respawn:/neonvm/bin/vector -c /neonvm/config/vector.yaml --config-dir /etc/vector --color never
 ::respawn:/neonvm/bin/chronyd -n -f /neonvm/config/chrony.conf -l /var/log/chrony/chrony.log
 ::respawn:/neonvm/bin/sshd -E /var/log/ssh.log -f /neonvm/config/sshd_config
+::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/vmstart
 {{ range .InittabCommands }}
 ::{{.SysvInitAction}}:su -p {{.CommandUser}} -c {{.ShellEscapedCommand}}
