From 430a012dd8bc33b3d63256dd976ed1531fa80de4 Mon Sep 17 00:00:00 2001 From: Muvaffak Onus Date: Mon, 6 Jan 2025 15:18:50 +0300 Subject: [PATCH] deviceplugin: if the dev path of a device changes, mark the changed device slots as unhealthy to trigger a kubelet reconciliation to update the paths in the pod Signed-off-by: Muvaffak Onus --- .../device_plugin_instance_controller.rs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs index 9be972707..36f5da1d1 100644 --- a/agent/src/plugin_manager/device_plugin_instance_controller.rs +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -170,6 +170,38 @@ impl InstanceDevicePlugin { async fn update_slots(&self, slots: &HashMap) -> Result<(), DevicePluginError> { let my_slots = self.slots_status.lock().await; let new_slots = construct_slots_map(slots)?; + + // Track which specific slots have changed + let mut changed_slots = HashSet::new(); + for (k, v) in new_slots.iter() { + if let Some(current_slot) = my_slots.borrow().get(*k) { + if current_slot != v { + changed_slots.insert(*k); + } + } + } + + // If any slots changed, mark only those specific ones as unhealthy + if !changed_slots.is_empty() { + my_slots.send_if_modified(|current| { + for slot_id in &changed_slots { + if let Some(slot) = current.get_mut(*slot_id) { + if let DeviceUsage::Node(node) = slot { + if node == &self.node_name { + // Temporarily mark only this specific device as unhealthy + *node = "temporary-unhealthy".to_string(); + } + } + } + } + true + }); + + // Give kubelet time to notice the unhealthy state + tokio::time::sleep(Duration::from_millis(100)).await; + } + + // Now update to the new state my_slots.send_if_modified(|current| { let mut modified = false; for (k, v) in new_slots.iter() {