From b6fe5b67fd728b591f546fb5969e48b369fcbc22 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:48:29 +0300 Subject: [PATCH 1/4] implement RemovePfAppliedStatus function in store Signed-off-by: Sebastian Sch --- pkg/helper/mock/mock_helper.go | 14 ++++++++++++++ pkg/host/store/mock/mock_store.go | 14 ++++++++++++++ pkg/host/store/store.go | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index f19902825..5ef289ac4 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -942,6 +942,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) RemovePersistPFNameUdevRule(pfPc return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePersistPFNameUdevRule", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePersistPFNameUdevRule), pfPciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockHostHelpersInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockHostHelpersInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockHostHelpersInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // RemoveVfRepresentorUdevRule mocks base method. func (m *MockHostHelpersInterface) RemoveVfRepresentorUdevRule(pfPciAddress string) error { m.ctrl.T.Helper() diff --git a/pkg/host/store/mock/mock_store.go b/pkg/host/store/mock/mock_store.go index 2e0071dfd..d405543a2 100644 --- a/pkg/host/store/mock/mock_store.go +++ b/pkg/host/store/mock/mock_store.go @@ -79,6 +79,20 @@ func (mr *MockManagerInterfaceMockRecorder) LoadPfsStatus(pciAddress interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LoadPfsStatus", reflect.TypeOf((*MockManagerInterface)(nil).LoadPfsStatus), pciAddress) } +// RemovePfAppliedStatus mocks base method. +func (m *MockManagerInterface) RemovePfAppliedStatus(pciAddress string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemovePfAppliedStatus", pciAddress) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemovePfAppliedStatus indicates an expected call of RemovePfAppliedStatus. +func (mr *MockManagerInterfaceMockRecorder) RemovePfAppliedStatus(pciAddress interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemovePfAppliedStatus", reflect.TypeOf((*MockManagerInterface)(nil).RemovePfAppliedStatus), pciAddress) +} + // SaveLastPfAppliedStatus mocks base method. func (m *MockManagerInterface) SaveLastPfAppliedStatus(PfInfo *v1.Interface) error { m.ctrl.T.Helper() diff --git a/pkg/host/store/store.go b/pkg/host/store/store.go index 77e4bbd32..67c0b17e3 100644 --- a/pkg/host/store/store.go +++ b/pkg/host/store/store.go @@ -20,6 +20,7 @@ import ( type ManagerInterface interface { ClearPCIAddressFolder() error SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) error + RemovePfAppliedStatus(pciAddress string) error LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) GetCheckPointNodeState() (*sriovnetworkv1.SriovNetworkNodeState, error) @@ -111,6 +112,17 @@ func (s *manager) SaveLastPfAppliedStatus(PfInfo *sriovnetworkv1.Interface) erro return err } +func (s *manager) RemovePfAppliedStatus(pciAddress string) error { + hostExtension := utils.GetHostExtension() + pathFile := filepath.Join(hostExtension, consts.PfAppliedConfig, pciAddress) + err := os.RemoveAll(pathFile) + if err != nil { + log.Log.Error(err, "failed to remove PF status", "pathFile", pathFile) + return err + } + return nil +} + // LoadPfsStatus convert the /etc/sriov-operator/pci/ json to pfstatus // returns false if the file doesn't exist. func (s *manager) LoadPfsStatus(pciAddress string) (*sriovnetworkv1.Interface, bool, error) { From 62e33cdbabf04b589d921805699766cd03d856bc Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:48:55 +0300 Subject: [PATCH 2/4] Remove store file on reset Signed-off-by: Sebastian Sch --- pkg/host/internal/sriov/sriov.go | 13 +++++++++++++ pkg/host/internal/sriov/sriov_test.go | 2 ++ 2 files changed, 15 insertions(+) diff --git a/pkg/host/internal/sriov/sriov.go b/pkg/host/internal/sriov/sriov.go index ee21b1539..5a110e2ff 100644 --- a/pkg/host/internal/sriov/sriov.go +++ b/pkg/host/internal/sriov/sriov.go @@ -817,6 +817,13 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, log.Log.V(2).Info("checkForConfigAndReset(): PF name with pci address was externally created skipping the device reset", "pf-name", ifaceStatus.Name, "address", ifaceStatus.PciAddress) + + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } err = s.removeUdevRules(ifaceStatus.PciAddress) @@ -828,6 +835,12 @@ func (s *sriov) checkForConfigAndReset(ifaceStatus sriovnetworkv1.InterfaceExt, return err } + // remove pf status from host + err = storeManager.RemovePfAppliedStatus(ifaceStatus.PciAddress) + if err != nil { + return err + } + return nil } diff --git a/pkg/host/internal/sriov/sriov_test.go b/pkg/host/internal/sriov/sriov_test.go index cb647e64d..4a772bb52 100644 --- a/pkg/host/internal/sriov/sriov_test.go +++ b/pkg/host/internal/sriov/sriov_test.go @@ -452,6 +452,7 @@ var _ = Describe("SRIOV", func() { PciAddress: "0000:d8:00.0", NumVfs: 2, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) netlinkLibMock.EXPECT().DevLinkGetDeviceByName("pci", "0000:d8:00.0").Return( &netlink.DevlinkDevice{Attrs: netlink.DevlinkDevAttrs{Eswitch: netlink.DevlinkDevEswitchAttr{Mode: "legacy"}}}, nil) @@ -479,6 +480,7 @@ var _ = Describe("SRIOV", func() { NumVfs: 2, ExternallyManaged: true, }, true, nil) + storeManagerMode.EXPECT().RemovePfAppliedStatus("0000:d8:00.0").Return(nil) Expect(s.ConfigSriovInterfaces(storeManagerMode, []sriovnetworkv1.Interface{}, []sriovnetworkv1.InterfaceExt{ From 556da0086ee2ee5e6bca51d701c4e773dc7814b4 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:49:33 +0300 Subject: [PATCH 3/4] Skip firmware reset for devices we don't control Signed-off-by: Sebastian Sch --- pkg/plugins/mellanox/mellanox_plugin.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 87363464b..353080e42 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -143,6 +143,11 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS processedNics[pciPrefix] = true pciAddress := pciPrefix + "0" + // Skip devices not configured by the operator + if p.nicNotConfiguredByOperator(portsMap) { + continue + } + // Skip externally managed NICs if p.nicHasExternallyManagedPFs(portsMap) { continue @@ -206,3 +211,19 @@ func (p *MellanoxPlugin) nicHasExternallyManagedPFs(nicPortsMap map[string]sriov } return false } + +// nicNotConfiguredByOperator returns true if one of the ports(interface) of the NIC is not configured by operator +func (p *MellanoxPlugin) nicNotConfiguredByOperator(nicPortsMap map[string]sriovnetworkv1.InterfaceExt) bool { + for _, iface := range nicPortsMap { + _, exist, err := p.helpers.LoadPfsStatus(iface.PciAddress) + if err != nil { + log.Log.Error(err, "failed to load PF status from disk", "address", iface.PciAddress) + continue + } + if exist { + log.Log.V(2).Info("PF configured by the operator", "interface", iface) + return true + } + } + return false +} From 411a215bede85e0f93bddba11fe5a7541afa37e7 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 11 Jul 2024 21:50:04 +0300 Subject: [PATCH 4/4] implement functional test for both regular and externally manage Signed-off-by: Sebastian Sch --- test/conformance/tests/test_sriov_operator.go | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index d72f6a57a..e1c246af5 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -1222,7 +1222,7 @@ var _ = Describe("[sriov] operator", func() { It("Should be possible to create a vfio-pci resource and allocate to a pod", func() { By("creating a vfio-pci node policy") resourceName := "testvfio" - _, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") + vfioPolicy, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, vfioNic.Name, vfioNode, 5, resourceName, "vfio-pci") Expect(err).ToNot(HaveOccurred()) By("waiting for the node state to be updated") @@ -1249,6 +1249,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 10*time.Minute, time.Second).Should(Equal(int64(5))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("Creating sriov network to use the vfio device") sriovNetwork := &sriovv1.SriovNetwork{ ObjectMeta: metav1.ObjectMeta{ @@ -1278,6 +1283,25 @@ var _ = Describe("[sriov] operator", func() { networkStatusJSON, exist := firstPod.Annotations["k8s.v1.cni.cncf.io/network-status"] Expect(exist).To(BeTrue()) Expect(networkStatusJSON).To(ContainSubstring(fmt.Sprintf("\"mtu\": %d", vfioNic.Mtu))) + + By("deleting the policy") + err = clients.Delete(context.Background(), vfioPolicy, &runtimeclient.DeleteOptions{}) + Expect(err).ToNot(HaveOccurred()) + WaitForSRIOVStable() + + Eventually(func() int64 { + testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), vfioNode, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)] + allocatable, _ := resNum.AsInt64() + return allocatable + }, 2*time.Minute, time.Second).Should(Equal(int64(0))) + + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(vfioNode, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + }) }) @@ -2196,15 +2220,19 @@ var _ = Describe("[sriov] operator", func() { Context("ExternallyManaged Validation", func() { numVfs := 5 var node string - var nic *sriovv1.InterfaceExt + var nic sriovv1.InterfaceExt externallyManage := func(policy *sriovv1.SriovNetworkNodePolicy) { policy.Spec.ExternallyManaged = true } execute.BeforeAll(func() { - var err error - node, nic, err = sriovInfos.FindOneSriovNodeAndDevice() - Expect(err).ToNot(HaveOccurred()) + node, nic = sriovInfos.FindOneVfioSriovDevice() + }) + + BeforeEach(func() { + if node == "" { + Skip("not suitable device found for the test") + } By("Using device " + nic.Name + " on node " + node) }) @@ -2265,6 +2293,11 @@ var _ = Describe("[sriov] operator", func() { return allocatable }, 3*time.Minute, time.Second).Should(Equal(int64(numVfs))) + By("validate the pf info exist on host") + output, _, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("1")) + By("deleting the policy") err = clients.Delete(context.Background(), sriovPolicy, &runtimeclient.DeleteOptions{}) Expect(err).ToNot(HaveOccurred()) @@ -2279,11 +2312,16 @@ var _ = Describe("[sriov] operator", func() { }, 2*time.Minute, time.Second).Should(Equal(int64(0))) By("checking the virtual functions are still on the host") - output, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) + output, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("cat /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred()) Expect(errOutput).To(Equal("")) Expect(output).To(ContainSubstring("5")) + By("validate the pf info doesn't exist on the host anymore") + output, _, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", "ls /host/etc/sriov-operator/pci/ | wc -l") + Expect(err).ToNot(HaveOccurred()) + Expect(output).ToNot(Equal("0")) + By("cleaning the manual sriov created") _, errOutput, err = runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo 0 > /host/sys/class/net/%s/device/sriov_numvfs", nic.Name)) Expect(err).ToNot(HaveOccurred())