From c296bfd6692895ac3f5a357a0336c57f3c77ef75 Mon Sep 17 00:00:00 2001 From: caohe Date: Thu, 16 Nov 2023 01:13:46 +0800 Subject: [PATCH] refactor(reporter): refactor the logics to report rdma topology Signed-off-by: caohe --- .../app/options/reporter/kubelet_plugin.go | 9 +- go.mod | 7 - go.sum | 15 -- .../fetcher/kubelet/kubeletplugin.go | 4 +- .../kubelet/topology/topology_adapter.go | 106 +++----- .../kubelet/topology/topology_adapter_test.go | 242 +++--------------- pkg/config/agent/reporter/kubelet_plugin.go | 2 +- pkg/util/cnr.go | 17 +- 8 files changed, 87 insertions(+), 315 deletions(-) diff --git a/cmd/katalyst-agent/app/options/reporter/kubelet_plugin.go b/cmd/katalyst-agent/app/options/reporter/kubelet_plugin.go index fe6dec0e8..12155d535 100644 --- a/cmd/katalyst-agent/app/options/reporter/kubelet_plugin.go +++ b/cmd/katalyst-agent/app/options/reporter/kubelet_plugin.go @@ -27,7 +27,7 @@ type KubeletPluginOptions struct { PodResourcesServerEndpoints []string KubeletResourcePluginPaths []string EnableReportTopologyPolicy bool - EnableReportRDMATopology bool + ResourceNameToZoneNameMap map[string]string } func NewKubeletPluginOptions() *KubeletPluginOptions { @@ -39,7 +39,7 @@ func NewKubeletPluginOptions() *KubeletPluginOptions { pluginapi.ResourcePluginPath, }, EnableReportTopologyPolicy: false, - EnableReportRDMATopology: false, + ResourceNameToZoneNameMap: make(map[string]string), } } @@ -52,15 +52,14 @@ func (o *KubeletPluginOptions) AddFlags(fss *cliflag.NamedFlagSets) { "the path of kubelet resource plugin") fs.BoolVar(&o.EnableReportTopologyPolicy, "enable-report-topology-policy", o.EnableReportTopologyPolicy, "whether to report topology policy") - fs.BoolVar(&o.EnableReportRDMATopology, "enable-report-rdma-topology", false, "enable report rdma topology, default false") - + fs.Var(cliflag.NewMapStringString(&o.ResourceNameToZoneNameMap), "resource-name-to-zone-name-map", "a map that stores the mapping relationship between resource names to zone names in KCNR (e.g. nvidia.com/gpu=GPU,...)") } func (o *KubeletPluginOptions) ApplyTo(c *reporter.KubeletPluginConfiguration) error { c.PodResourcesServerEndpoints = o.PodResourcesServerEndpoints c.KubeletResourcePluginPaths = o.KubeletResourcePluginPaths c.EnableReportTopologyPolicy = o.EnableReportTopologyPolicy - c.EnableReportRDMATopology = o.EnableReportRDMATopology + c.ResourceNameToZoneNameMap = o.ResourceNameToZoneNameMap return nil } diff --git a/go.mod b/go.mod index 0c1628274..72379d448 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,6 @@ require ( github.com/gogo/protobuf v1.3.2 github.com/golang/protobuf v1.5.2 github.com/google/cadvisor v0.44.1 - github.com/jaypipes/ghw v0.12.0 github.com/kubewharf/katalyst-api v0.1.17-0.20231103034307-03a36866a851 github.com/opencontainers/runc v1.1.1 github.com/pkg/errors v0.9.1 @@ -56,7 +55,6 @@ require ( github.com/OneOfOne/xxhash v1.2.5 // indirect github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect - github.com/StackExchange/wmi v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect @@ -70,9 +68,7 @@ require ( github.com/emicklei/go-restful v2.16.0+incompatible // indirect github.com/emicklei/go-restful-swagger12 v0.0.0-20201014110547-68ccff494617 // indirect github.com/felixge/httpsnoop v1.0.1 // indirect - github.com/ghodss/yaml v1.0.0 // indirect github.com/go-logr/logr v1.2.3 // indirect - github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.19.5 // indirect github.com/go-openapi/jsonreference v0.19.6 // indirect github.com/go-openapi/swag v0.19.15 // indirect @@ -86,14 +82,12 @@ require ( github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect - github.com/jaypipes/pcidb v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.6 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect - github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/moby/spdystream v0.2.0 // indirect github.com/moby/sys/mountinfo v0.6.0 // indirect github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect @@ -135,7 +129,6 @@ require ( gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - howett.net/plist v1.0.0 // indirect k8s.io/cloud-provider v0.24.6 // indirect k8s.io/csi-translation-lib v0.24.6 // indirect k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect diff --git a/go.sum b/go.sum index 99bc43f45..66ef37a27 100644 --- a/go.sum +++ b/go.sum @@ -81,8 +81,6 @@ github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdko github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/StackExchange/wmi v0.0.0-20180116203802-5d049714c4a6/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= -github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= -github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8= github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= @@ -269,7 +267,6 @@ github.com/fsnotify/fsnotify v1.5.1/go.mod h1:T3375wBYaZdLLcVNkcVbzGHY7f1l/uK5T5 github.com/fvbommel/sortorder v1.0.1/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0= github.com/getkin/kin-openapi v0.76.0/go.mod h1:660oXbgy5JFMKreazJaQTw7o+X00qeSyhcnluiMv+Xg= github.com/getsentry/raven-go v0.2.0/go.mod h1:KungGk8q33+aIAZUIVWZDr2OfAEBsO49PX4NzFV5kcQ= -github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-critic/go-critic v0.5.2/go.mod h1:cc0+HvdE3lFpqLecgqMaJcvWWH77sLdBp+wLGPM1Yyo= github.com/go-errors/errors v1.0.1/go.mod h1:f4zRHt4oKfwPJE5k8C9vpYG+aDHdBFUsgrm6/TyX73Q= @@ -292,9 +289,6 @@ github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV github.com/go-logr/zapr v1.2.0 h1:n4JnPI1T3Qq1SFEi/F8rwLrZERp2bso19PJZDB9dayk= github.com/go-logr/zapr v1.2.0/go.mod h1:Qa4Bsj2Vb+FAVeAKsLD8RLQ+YRJB8YDmOAKxaBQf7Ro= github.com/go-ole/go-ole v1.2.1/go.mod h1:7FAglXiTm7HKlQRDeOQ6ZNUHidzCWXuZWq/1dTyBNF8= -github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= -github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= @@ -502,11 +496,6 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/influxdata/influxdb1-client v0.0.0-20191209144304-8bf82d3c094d/go.mod h1:qj24IKcXYK6Iy9ceXlo3Tc+vtHo9lIhSX5JddghvEPo= github.com/ishidawataru/sctp v0.0.0-20190723014705-7c296d48a2b5/go.mod h1:DM4VvS+hD/kDi1U1QsX2fnZowwBhqD0Dk3bRPKF/Oc8= -github.com/jaypipes/ghw v0.12.0 h1:xU2/MDJfWmBhJnujHY9qwXQLs3DBsf0/Xa9vECY0Tho= -github.com/jaypipes/ghw v0.12.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g= -github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8= -github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk= -github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jingyugao/rowserrcheck v0.0.0-20191204022205-72ab7603b68a/go.mod h1:xRskid8CManxVta/ALEhJha/pweKBaVG6fWgc0yH25s= github.com/jirfag/go-printf-func-name v0.0.0-20191110105641-45db9963cdd3/go.mod h1:HEWGJkRDzjJY2sqdDwxccsGicWEf9BQOZsq2tV+xzM0= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= @@ -599,7 +588,6 @@ github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible h1 github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible/go.mod h1:8AuVvqP/mXw1px98n46wfvcGfQ4ci2FwoAjKYxuo3Z4= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg= github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= @@ -1492,7 +1480,6 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkep gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/warnings.v0 v0.1.1/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= -gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -1520,8 +1507,6 @@ honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.5/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= -howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= k8s.io/api v0.24.6 h1:9EZhqZv4Ct++e6XMh3f5bmP4XMxsuxf4c7WOCCjYvF8= k8s.io/api v0.24.6/go.mod h1:eoAZTnLglpwBajOxAusPL3xfVW5tUn3Y3gQv8e1XHBA= k8s.io/apiextensions-apiserver v0.24.6 h1:XgOhyeTIKUiw1qupOtx3x3xyfeaGsCBQYwPR11wYFwk= diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go index fae95fbb3..c29d2faa2 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/kubeletplugin.go @@ -89,8 +89,8 @@ func NewKubeletReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaser } topologyStatusAdapter, err := topology.NewPodResourcesServerTopologyAdapter(metaServer, - conf.PodResourcesServerEndpoints, conf.KubeletResourcePluginPaths, nil, - p.getNumaInfo, nil, podresources.GetV1Client, conf.EnableReportRDMATopology) + conf.PodResourcesServerEndpoints, conf.KubeletResourcePluginPaths, conf.ResourceNameToZoneNameMap, + nil, p.getNumaInfo, nil, podresources.GetV1Client) if err != nil { return nil, err } diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go index cf83ba9e8..a397cdd90 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter.go @@ -20,7 +20,6 @@ import ( "context" "encoding/json" "fmt" - "github.com/jaypipes/ghw/pkg/pci" "sync" "time" @@ -35,7 +34,6 @@ import ( "k8s.io/klog/v2" podresv1 "k8s.io/kubelet/pkg/apis/podresources/v1" - "github.com/jaypipes/ghw" nodev1alpha1 "github.com/kubewharf/katalyst-api/pkg/apis/node/v1alpha1" apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-api/pkg/utils" @@ -51,7 +49,6 @@ import ( const ( podResourcesClientTimeout = 10 * time.Second podResourcesClientMaxMsgSize = 1024 * 1024 * 16 - NetDeviceType = 0x02 ) // NumaInfoGetter is to get numa info @@ -87,16 +84,14 @@ type topologyAdapterImpl struct { // kubeletResourcePluginPaths is the path of kubelet resource plugin kubeletResourcePluginPaths []string - // pciInfo pci device information - pciInfo *pci.Info - - enableReportRDMATopology bool + // resourceNameToZoneNameMap is a map that stores the mapping relationship between resource names to zone names + resourceNameToZoneNameMap map[string]string } // NewPodResourcesServerTopologyAdapter creates a topology adapter which uses pod resources server func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, endpoints []string, - kubeletResourcePluginPaths []string, skipDeviceNames sets.String, numaInfoGetter NumaInfoGetter, - podResourcesFilter PodResourcesFilter, getClientFunc podresources.GetClientFunc, enableReportRDMATopology bool) (Adapter, error) { + kubeletResourcePluginPaths []string, resourceNameToZoneNameMap map[string]string, skipDeviceNames sets.String, + numaInfoGetter NumaInfoGetter, podResourcesFilter PodResourcesFilter, getClientFunc podresources.GetClientFunc) (Adapter, error) { numaInfo, err := numaInfoGetter() if err != nil { return nil, fmt.Errorf("failed to get numa info: %s", err) @@ -110,10 +105,6 @@ func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, end return nil, errors.Wrapf(err, "ensure resource plugin path %s exists failed", path) } } - pciInfo, err := ghw.PCI() - if err != nil { - return nil, errors.Wrapf(err, "get pci Info failed") - } numaSocketZoneNodeMap := util.GenerateNumaSocketZone(numaInfo) return &topologyAdapterImpl{ @@ -124,8 +115,7 @@ func NewPodResourcesServerTopologyAdapter(metaServer *metaserver.MetaServer, end skipDeviceNames: skipDeviceNames, getClientFunc: getClientFunc, podResourcesFilter: podResourcesFilter, - pciInfo: pciInfo, - enableReportRDMATopology: enableReportRDMATopology, + resourceNameToZoneNameMap: resourceNameToZoneNameMap, }, nil } @@ -193,21 +183,19 @@ func (p *topologyAdapterImpl) GetTopologyZones(parentCtx context.Context) ([]*no // add other children zone node of numa or socket into topology zone generator by allocatable resources err = p.addNumaSocketChildrenZoneNodes(topologyZoneGenerator, allocatableResources) if err != nil { - return nil, errors.Wrap(err, "get zone topology failed") + return nil, errors.Wrap(err, "get socket and numa zone topology failed") } - if p.enableReportRDMATopology { - err = p.addNICNumaChildrenZoneNodes(topologyZoneGenerator, allocatableResources) - if err != nil { - return nil, errors.Wrap(err, "get zone topology failed") - } + err = p.addDeviceZoneNodes(topologyZoneGenerator, allocatableResources) + if err != nil { + return nil, errors.Wrap(err, "get device zone topology failed") + } - nicZoneAllocations, err := p.getNicZoneAllocations(podList, podResourcesList) - if err != nil { - return nil, errors.Wrap(err, "get zone allocations failed") - } - mergeZoneAllocations(zoneAllocations, nicZoneAllocations) + deviceZoneAllocations, err := p.getDeviceZoneAllocations(podList, podResourcesList) + if err != nil { + return nil, errors.Wrap(err, "get device zone allocations failed") } + mergeZoneAllocations(zoneAllocations, deviceZoneAllocations) return topologyZoneGenerator.GenerateTopologyZoneStatus(zoneAllocations, zoneResources, zoneAttributes), nil } @@ -337,34 +325,32 @@ func (p *topologyAdapterImpl) addNumaSocketChildrenZoneNodes(generator *util.Top return nil } -// addNumaSocketChildrenZoneNodes add the child nodes of socket or numa zone nodes to the generator, the child nodes are +// addDeviceZoneNodes add the device nodes which are children of numa zone nodes to the generator, the device nodes are // generated by generateZoneNode according to TopologyLevel, Type and Name in TopologyAwareAllocatableQuantityList -func (p *topologyAdapterImpl) addNICNumaChildrenZoneNodes(generator *util.TopologyZoneGenerator, +func (p *topologyAdapterImpl) addDeviceZoneNodes(generator *util.TopologyZoneGenerator, allocatableResources *podresv1.AllocatableResourcesResponse) error { if allocatableResources == nil { return fmt.Errorf("allocatable Resources is nil") } var errList []error - nicNumaZoneNodeMap := make(map[util.ZoneNode]util.ZoneNode) - for _, device := range allocatableResources.Devices { - if util.IsRDMA(device.ResourceName) { - for _, deviceId := range device.DeviceIds { - nicNode := util.GenerateNICZoneNode(deviceId) - if _, ok := nicNumaZoneNodeMap[nicNode]; !ok { - numaId := p.getNumaIdByPCIAddress(deviceId) - numaZoneNode := util.GenerateNumaZoneNode(numaId) - nicNumaZoneNodeMap[nicNode] = numaZoneNode + for targetResourceName, targetZoneName := range p.resourceNameToZoneNameMap { + for _, device := range allocatableResources.Devices { + if targetResourceName == device.ResourceName { + for _, deviceId := range device.DeviceIds { + deviceNode := util.GenerateDeviceZoneNode(deviceId, targetZoneName) + if len(device.Topology.Nodes) == 0 { + continue + } + numaZoneNode := util.GenerateNumaZoneNode(int(device.Topology.Nodes[0].ID)) + err := generator.AddNode(&numaZoneNode, deviceNode) + if err != nil { + errList = append(errList, err) + continue + } } } } } - for nicNode, numaNode := range nicNumaZoneNodeMap { - err := generator.AddNode(&numaNode, nicNode) - if err != nil { - errList = append(errList, err) - continue - } - } if len(errList) > 0 { return utilerrors.NewAggregate(errList) @@ -503,7 +489,7 @@ func (p *topologyAdapterImpl) getZoneAllocations(podList []*v1.Pod, podResources return zoneAllocationsMap, nil } -func (p *topologyAdapterImpl) getNicZoneAllocations(podList []*v1.Pod, podResourcesList []*podresv1.PodResources) (map[util.ZoneNode]util.ZoneAllocations, error) { +func (p *topologyAdapterImpl) getDeviceZoneAllocations(podList []*v1.Pod, podResourcesList []*podresv1.PodResources) (map[util.ZoneNode]util.ZoneAllocations, error) { var ( err error errList []error @@ -543,16 +529,18 @@ func (p *topologyAdapterImpl) getNicZoneAllocations(podList []*v1.Pod, podResour for _, c := range podResources.Containers { for _, device := range c.Devices { - if util.IsRDMA(device.ResourceName) { - for _, deviceId := range device.DeviceIds { - nicNode := util.GenerateNICZoneNode(deviceId) - if _, ok := zoneAllocationsMap[nicNode]; !ok { - zoneAllocationsMap[nicNode] = []*nodev1alpha1.Allocation{ - {Consumer: native.GenerateUniqObjectUIDKey(pod), - Requests: &v1.ResourceList{ - util.ResourceRDMA: resource.MustParse("1"), + for targetResourceName, targetZoneName := range p.resourceNameToZoneNameMap { + if device.ResourceName == targetResourceName { + for _, deviceId := range device.DeviceIds { + deviceNode := util.GenerateDeviceZoneNode(deviceId, targetZoneName) + if _, ok := zoneAllocationsMap[deviceNode]; !ok { + zoneAllocationsMap[deviceNode] = []*nodev1alpha1.Allocation{ + {Consumer: native.GenerateUniqObjectUIDKey(pod), + Requests: &v1.ResourceList{ + v1.ResourceName(device.ResourceName): resource.MustParse("1"), + }, }, - }, + } } } } @@ -884,13 +872,3 @@ func mergeZoneAllocations(zone1, zone2 map[util.ZoneNode]util.ZoneAllocations) { zone1[zoneNode] = allocations } } - -func (p *topologyAdapterImpl) getNumaIdByPCIAddress(address string) int { - device := p.pciInfo.GetDevice(address) - if device != nil && device.Node != nil { - return device.Node.ID - } else { - klog.Errorf("get numa Id error, pci address %s", address) - } - return -1 -} diff --git a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter_test.go b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter_test.go index 50852f29f..83e135e3b 100644 --- a/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter_test.go +++ b/pkg/agent/resourcemanager/fetcher/kubelet/topology/topology_adapter_test.go @@ -18,8 +18,6 @@ package topology import ( "context" - "github.com/jaypipes/ghw/pkg/pci" - "github.com/jaypipes/ghw/pkg/topology" "net" "os" "path" @@ -772,120 +770,21 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol PodResources: []*podresv1.PodResources{ { Namespace: "default", - Name: "pod-1", + Name: "pod-2", Containers: []*podresv1.ContainerResources{ { - Name: "container-1", + Name: "container-2", Devices: []*podresv1.ContainerDevices{ { - ResourceName: "vke.volcengine.com/rdma", - DeviceIds: []string{"rdma-0", "rdma-1"}, + ResourceName: "resource.katalyst.kubewharf.io/rdma", + DeviceIds: []string{ + "eth0", + }, Topology: &podresv1.TopologyInfo{ Nodes: []*podresv1.NUMANode{ - {ID: 0}, - {ID: 1}, - }, - }, - }, - }, - Resources: []*podresv1.TopologyAwareResource{ - { - ResourceName: "cpu", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: 12, - Node: 0, - }, - { - ResourceValue: 15, - Node: 1, - }, - }, - }, - { - ResourceName: "memory", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: generateFloat64ResourceValue("12G"), - Node: 0, - }, - { - ResourceValue: generateFloat64ResourceValue("15G"), - Node: 1, - }, - }, - }, - }, - }, - }, - }, - { - Namespace: "default", - Name: "pod-2", - Containers: []*podresv1.ContainerResources{ - { - Name: "container-1", - Resources: []*podresv1.TopologyAwareResource{ - { - ResourceName: "cpu", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: 24, - Node: 0, - }, - { - ResourceValue: 24, - Node: 1, - }, - }, - }, - { - ResourceName: "memory", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: generateFloat64ResourceValue("32G"), - Node: 0, - }, - { - ResourceValue: generateFloat64ResourceValue("32G"), - Node: 1, - }, - }, - }, - }, - }, - }, - }, - { - Namespace: "default", - Name: "pod-3", - Containers: []*podresv1.ContainerResources{ - { - Name: "container-1", - Resources: []*podresv1.TopologyAwareResource{ - { - ResourceName: "cpu", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: 24, - Node: 0, - }, - { - ResourceValue: 24, - Node: 1, - }, - }, - }, - { - ResourceName: "memory", - OriginalTopologyAwareQuantityList: []*podresv1.TopologyAwareQuantity{ - { - ResourceValue: generateFloat64ResourceValue("32G"), - Node: 0, - }, - { - ResourceValue: generateFloat64ResourceValue("32G"), - Node: 1, + { + ID: 0, + }, }, }, }, @@ -898,9 +797,9 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol allocatableResources: &podresv1.AllocatableResourcesResponse{ Devices: []*podresv1.ContainerDevices{ { - ResourceName: "gpu", + ResourceName: "resource.katalyst.kubewharf.io/rdma", DeviceIds: []string{ - "0", + "eth0", }, Topology: &podresv1.TopologyInfo{ Nodes: []*podresv1.NUMANode{ @@ -909,13 +808,13 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol }, }, { - ResourceName: "gpu", + ResourceName: "resource.katalyst.kubewharf.io/rdma", DeviceIds: []string{ - "1", + "eth1", }, Topology: &podresv1.TopologyInfo{ Nodes: []*podresv1.NUMANode{ - {ID: 0}, + {ID: 1}, }, }, }, @@ -984,39 +883,21 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol Name: "0", Resources: nodev1alpha1.Resources{ Capacity: &v1.ResourceList{ - "gpu": resource.MustParse("2"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), + "cpu": resource.MustParse("24"), + "memory": resource.MustParse("32G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, Allocatable: &v1.ResourceList{ - "gpu": resource.MustParse("2"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), + "cpu": resource.MustParse("24"), + "memory": resource.MustParse("32G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, }, Allocations: []*nodev1alpha1.Allocation{ - { - Consumer: "default/pod-1/pod-1-uid", - Requests: &v1.ResourceList{ - "gpu": resource.MustParse("1"), - "cpu": resource.MustParse("12"), - "memory": resource.MustParse("12G"), - }, - }, { Consumer: "default/pod-2/pod-2-uid", Requests: &v1.ResourceList{ - "gpu": resource.MustParse("1"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), - }, - }, - { - Consumer: "default/pod-3/pod-3-uid", - Requests: &v1.ResourceList{ - "gpu": resource.MustParse("1"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, }, }, @@ -1024,19 +905,11 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol { Type: nodev1alpha1.TopologyTypeNIC, Name: "eth0", - Resources: nodev1alpha1.Resources{ - Capacity: &v1.ResourceList{ - "nic": resource.MustParse("10G"), - }, - Allocatable: &v1.ResourceList{ - "nic": resource.MustParse("10G"), - }, - }, Allocations: []*nodev1alpha1.Allocation{ { Consumer: "default/pod-2/pod-2-uid", Requests: &v1.ResourceList{ - "nic": resource.MustParse("10G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, }, }, @@ -1054,51 +927,20 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol Name: "1", Resources: nodev1alpha1.Resources{ Capacity: &v1.ResourceList{ - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), + "cpu": resource.MustParse("24"), + "memory": resource.MustParse("32G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, Allocatable: &v1.ResourceList{ - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), - }, - }, - Allocations: []*nodev1alpha1.Allocation{ - { - Consumer: "default/pod-1/pod-1-uid", - Requests: &v1.ResourceList{ - "cpu": resource.MustParse("15"), - "memory": resource.MustParse("15G"), - }, - }, - { - Consumer: "default/pod-2/pod-2-uid", - Requests: &v1.ResourceList{ - "gpu": resource.MustParse("1"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), - }, - }, - { - Consumer: "default/pod-3/pod-3-uid", - Requests: &v1.ResourceList{ - "gpu": resource.MustParse("1"), - "cpu": resource.MustParse("24"), - "memory": resource.MustParse("32G"), - }, + "cpu": resource.MustParse("24"), + "memory": resource.MustParse("32G"), + "resource.katalyst.kubewharf.io/rdma": resource.MustParse("1"), }, }, Children: []*nodev1alpha1.TopologyZone{ { Type: nodev1alpha1.TopologyTypeNIC, Name: "eth1", - Resources: nodev1alpha1.Resources{ - Capacity: &v1.ResourceList{ - "nic": resource.MustParse("10G"), - }, - Allocatable: &v1.ResourceList{ - "nic": resource.MustParse("10G"), - }, - }, }, }, }, @@ -1109,22 +951,7 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - pciInfo := &pci.Info{} - pciInfo.Devices = []*pci.Device{ - { - Address: "rdma-0", - Node: &topology.Node{ - ID: 0, - }, - }, - { - Address: "rdma-1", - Node: &topology.Node{ - ID: 1, - }, - }, - } - p := &topologyAdapterImpl{ + p := topologyAdapterImpl{ client: &fakePodResourcesListerClient{ ListPodResourcesResponse: tt.fields.listPodResources, AllocatableResourcesResponse: tt.fields.allocatableResources, @@ -1134,9 +961,10 @@ func Test_podResourcesServerTopologyAdapterImpl_GetTopologyZones_ReportRDMATopol PodFetcher: &pod.PodFetcherStub{PodList: tt.fields.podList}, }, }, - numaSocketZoneNodeMap: tt.fields.numaSocketZoneNodeMap, - pciInfo: pciInfo, - enableReportRDMATopology: true, + numaSocketZoneNodeMap: tt.fields.numaSocketZoneNodeMap, + resourceNameToZoneNameMap: map[string]string{ + "resource.katalyst.kubewharf.io/rdma": "NIC", + }, } got, err := p.GetTopologyZones(context.TODO()) if (err != nil) != tt.wantErr { @@ -1890,8 +1718,8 @@ func Test_podResourcesServerTopologyAdapterImpl_Run(t *testing.T) { ctx, cancel := context.WithCancel(context.TODO()) notifier := make(chan struct{}, 1) p, _ := NewPodResourcesServerTopologyAdapter(testMetaServer, - endpoints, kubeletResourcePluginPath, - nil, getNumaInfo, nil, podresources.GetV1Client, false) + endpoints, kubeletResourcePluginPath, nil, + nil, getNumaInfo, nil, podresources.GetV1Client) err = p.Run(ctx, func() {}) assert.NoError(t, err) diff --git a/pkg/config/agent/reporter/kubelet_plugin.go b/pkg/config/agent/reporter/kubelet_plugin.go index d732b4ca2..f01ad2d57 100644 --- a/pkg/config/agent/reporter/kubelet_plugin.go +++ b/pkg/config/agent/reporter/kubelet_plugin.go @@ -20,7 +20,7 @@ type KubeletPluginConfiguration struct { PodResourcesServerEndpoints []string KubeletResourcePluginPaths []string EnableReportTopologyPolicy bool - EnableReportRDMATopology bool + ResourceNameToZoneNameMap map[string]string } func NewKubeletPluginConfiguration() *KubeletPluginConfiguration { diff --git a/pkg/util/cnr.go b/pkg/util/cnr.go index 5bb927426..ba745e881 100644 --- a/pkg/util/cnr.go +++ b/pkg/util/cnr.go @@ -42,8 +42,6 @@ const ( CNRFieldNameTopologyZone = "TopologyZone" CNRFieldNameResources = "Resources" CNRFieldNameTopologyPolicy = "TopologyPolicy" - - ResourceRDMA = "vke.volcengine.com/rdma" ) var ( @@ -376,21 +374,12 @@ func GenerateSocketZoneNode(socketID int) ZoneNode { } } -// GenerateNICZoneNode generates nic zone node by socket id, which must be unique -func GenerateNICZoneNode(deviceId string) ZoneNode { +// GenerateDeviceZoneNode generates device zone node through device id, which must be unique +func GenerateDeviceZoneNode(deviceId, zoneName string) ZoneNode { return ZoneNode{ Meta: ZoneMeta{ - Type: nodev1alpha1.TopologyTypeNIC, + Type: nodev1alpha1.TopologyType(zoneName), Name: deviceId, }, } } - -func IsRDMA(resourceName string) bool { - return ResourceRDMA == resourceName -} - -// ParseDeviceID returns device ID parsed from the string as 64bit integer -func ParseDeviceID(deviceID string) (int64, error) { - return strconv.ParseInt(deviceID, 16, 64) -}