neonvm: add readiness probe for sysfs scaling #1190

Draft · wants to merge 1 commit into base: main
12 changes: 10 additions & 2 deletions neonvm-runner/cmd/httpserver.go
@@ -19,8 +19,9 @@ import (
)

type cpuServerCallbacks struct {
-	get func(*zap.Logger) (*vmv1.MilliCPU, error)
-	set func(*zap.Logger, vmv1.MilliCPU) error
+	get   func(*zap.Logger) (*vmv1.MilliCPU, error)
+	set   func(*zap.Logger, vmv1.MilliCPU) error
+	ready func(*zap.Logger) bool
}

func listenForHTTPRequests(
@@ -42,6 +43,13 @@ func listenForHTTPRequests(
mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
})
mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
if callbacks.ready(logger) {
w.WriteHeader(200)
} else {
w.WriteHeader(500)
}
})
if networkMonitoring {
reg := prometheus.NewRegistry()
metrics := NewMonitoringMetrics(reg)
38 changes: 38 additions & 0 deletions neonvm-runner/cmd/main.go
@@ -497,6 +497,20 @@ func runQEMU(
lastValue.Store(uint32(cpu))
return nil
},
ready: func(logger *zap.Logger) bool {
// if we are in sysfs mode, we need to check if the NeonVM Daemon is ready
if cfg.cpuScalingMode == vmv1.CpuScalingModeSysfs {
err := checkNeonvmDaemonCPU()
if err != nil {
logger.Warn("neonvm-daemon ready probe failed", zap.Error(err))
return false
}
return true
}
// do nothing for QMP mode

return false
},
}

wg.Add(1)
@@ -753,3 +767,27 @@ func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {

return nil
}

// checkNeonvmDaemonCPU sends a GET request to the NeonVM Daemon to fetch the current CPU limit, for the sake of the readiness probe.
func checkNeonvmDaemonCPU() error {
_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
if err != nil {
return fmt.Errorf("could not calculate VM IP address: %w", err)
}
ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
defer cancel()
url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return fmt.Errorf("could not build request: %w", err)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("could not send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
}
return nil
}
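
For context, a minimal standalone sketch (not part of this diff) of what the httpGet readiness probe added in pkg/neonvm/controllers/vm_controller.go below boils down to: a plain HTTP GET against the runner's /ready endpoint, where any 2xx/3xx status counts as ready. The host and port used here are placeholders, not values from this PR.

```go
// Minimal sketch, assuming kubelet httpGet probe semantics (any status in
// [200, 400) counts as success); host and port below are placeholders.
package main

import (
	"fmt"
	"net/http"
	"time"
)

func isRunnerReady(host string, port int) bool {
	client := &http.Client{Timeout: time.Second}
	resp, err := client.Get(fmt.Sprintf("http://%s:%d/ready", host, port))
	if err != nil {
		// transport error or timeout: not ready
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 400
}

func main() {
	fmt.Println(isRunnerReady("127.0.0.1", 8080)) // placeholder values
}
```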
1 change: 1 addition & 0 deletions neonvm/samples/vm-example.yaml
@@ -29,6 +29,7 @@ kind: VirtualMachine
metadata:
name: example
spec:
cpuScalingMode: SysfsScaling
guest:
cpus:
min: 1
53 changes: 52 additions & 1 deletion pkg/neonvm/controllers/vm_controller.go
@@ -47,6 +47,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apiserver/pkg/storage/names"
"k8s.io/client-go/tools/record"

@@ -864,12 +865,45 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
case corev1.PodFailed:
return runnerFailed
case corev1.PodRunning:
-	return runnerRunning
+	return isRunnerPodReady(pod)
default:
panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
}
}

// isRunnerPodReady reports whether the runner pod is ready, respecting the readiness probes of its containers.
func isRunnerPodReady(pod *corev1.Pod) runnerStatusKind {
// if the pod has no container statuses, we consider it pending
if pod.Status.ContainerStatuses == nil {
Member:
question: Is there a difference between == nil vs len(...) == 0? I was a little surprised to see this check on a slice.

Contributor Author:
You're right, I don't think there is any difference for a slice whether we use == nil or len(...) == 0.

Contributor:
nil slices and non-nil empty slices are two different things: https://go.dev/play/p/UtIWK670wxW
I always use len(...) == 0 to cover both cases.

Contributor Author:
I usually tend to use != nil or len(slice) depending on the semantics of the particular code (for example, whether I care about the length or just emptiness), but I am OK with using any option here.
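
A minimal standalone illustration of the distinction discussed above (not part of this PR's diff):

```go
package main

import "fmt"

func main() {
	var nilSlice []int    // nil slice: no backing array allocated
	emptySlice := []int{} // non-nil slice with zero length

	fmt.Println(nilSlice == nil, emptySlice == nil) // true false
	fmt.Println(len(nilSlice), len(emptySlice))     // 0 0

	// len(...) == 0 is true for both, so it covers both cases,
	// while == nil distinguishes them.
}
```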

return runnerPending
}

// First we check whether the pod is a migration target pod. In that case we
// don't wait for the readiness signal from neonvm-daemon, because QEMU is
// started in incoming-migration mode.
for _, containerSpec := range pod.Spec.Containers {
if containerSpec.Name == "neonvm-runner" {
for _, env := range containerSpec.Env {
if env.Name == "RECEIVE_MIGRATION" && env.Value == "true" {
return runnerRunning
}
}
}
}

// if the pod is not a migration target pod, we check the readiness
// of the neonvm-runner container
for _, c := range pod.Status.ContainerStatuses {
// we only care about the neonvm-runner container
if c.Name == "neonvm-runner" && !c.Ready {
return runnerPending
}
}
return runnerRunning
}

// deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
// then emits an event and log line about what it did, whether it actually deleted the runner pod.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
@@ -1178,6 +1212,10 @@ func imageForVmRunner() (string, error) {
return image, nil
}

// podSpec returns a VirtualMachine Pod object.
// withReadinessProbe - if true, adds a readiness probe to the container.
// We don't need a readiness probe for the VM runner pod if it is a migration
// target pod, because the VM is not started until the migration is complete.
func podSpec(
vm *vmv1.VirtualMachine,
sshSecret *corev1.Secret,
@@ -1371,6 +1409,19 @@ func podSpec(
},
}

pod.Spec.Containers[0].ReadinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt32(vm.Spec.RunnerPort),
Scheme: corev1.URISchemeHTTP,
},
},
InitialDelaySeconds: 5,
PeriodSeconds: 5,
FailureThreshold: 3,
}

if sshSecret != nil {
pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts,
corev1.VolumeMount{
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/vmmigration_controller.go
@@ -713,6 +713,7 @@ func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
pod.Name = migration.Status.TargetPodName

// add env variable to turn on migration receiver
// TODO: set it to false or empty after the migration is done, so that the readiness probe behaves correctly
pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"})

// add podAntiAffinity to schedule target pod to another k8s node
2 changes: 1 addition & 1 deletion vm-builder/files/inittab
@@ -1,13 +1,13 @@
::sysinit:/neonvm/bin/vminit
::once:/neonvm/bin/touch /neonvm/vmstart.allowed
-::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 -O 119 -s /neonvm/bin/udhcpc.script
::respawn:/neonvm/bin/udevd
::wait:/neonvm/bin/udev-init.sh
::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi
::respawn:/neonvm/bin/vector -c /neonvm/config/vector.yaml --config-dir /etc/vector --color never
::respawn:/neonvm/bin/chronyd -n -f /neonvm/config/chrony.conf -l /var/log/chrony/chrony.log
::respawn:/neonvm/bin/sshd -E /var/log/ssh.log -f /neonvm/config/sshd_config
+::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
Member:
question: Why this change?

Contributor Author:
Oh, I thought I covered it in the PR description.
We rely on neonvmd as a source of info for the readiness check.
For example, if we start neonvmd first in the VM, it might be that neonvmd is ready and serving requests while acpid or udevd have not even started yet.

Member:
IIUC then, the idea behind moving neonvmd down is to give the other daemons a chance to start first, right?

Is that actually guaranteed?

::respawn:/neonvm/bin/vmstart
{{ range .InittabCommands }}
::{{.SysvInitAction}}:su -p {{.CommandUser}} -c {{.ShellEscapedCommand}}