From 795bf248a0989dcc64e894f0fdecf27741676092 Mon Sep 17 00:00:00 2001
From: Chris Doherty
Date: Wed, 17 Jan 2024 16:00:35 -0600
Subject: [PATCH] Add Cilium kube-proxy replacement support

Cilium can take ownership of all kube-proxy responsibilities. Cilium
v1.13 offers partial and strict kube-proxy replacement modes, but the
API changes to a binary setting in v1.14, so this support targets
v1.13 strict mode only.
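
Users opt in via the Cilium CNI configuration on the cluster spec. A
minimal sketch of the relevant fields, mirroring the testdata added in
this patch (all unrelated cluster configuration omitted):

    apiVersion: anywhere.eks.amazonaws.com/v1alpha1
    kind: Cluster
    spec:
      clusterNetwork:
        cniConfig:
          cilium:
            enableKubeProxyReplacement: true

When enabled, the generated Cilium Helm values set kubeProxyReplacement
to strict and the CloudStack control plane template skips the kubeadm
addon/kube-proxy phase, so kube-proxy is never installed. The new
template builder test can be run with a standard Go toolchain:

    go test ./pkg/providers/cloudstack -run TestTemplateBuilder_KubeProxyReplacement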
---
 pkg/api/v1alpha1/cluster_types.go             |  10 +
 pkg/networking/cilium/templater.go            |   7 +
 .../cloudstack/config/template-cp.yaml        |   6 +
 pkg/providers/cloudstack/template.go          |   8 +
 pkg/providers/cloudstack/template_test.go     |  17 +
 ...cluster_cilium_kube_proxy_replacement.yaml |  69 +++
 ...cluster_cilium_kube_proxy_replacement.yaml | 406 ++++++++++++++++++
 7 files changed, 523 insertions(+)
 create mode 100644 pkg/providers/cloudstack/testdata/cluster_cilium_kube_proxy_replacement.yaml
 create mode 100644 pkg/providers/cloudstack/testdata/expected_cluster_cilium_kube_proxy_replacement.yaml

diff --git a/pkg/api/v1alpha1/cluster_types.go b/pkg/api/v1alpha1/cluster_types.go
index fa0b107b42d8..2f8345fc85f2 100644
--- a/pkg/api/v1alpha1/cluster_types.go
+++ b/pkg/api/v1alpha1/cluster_types.go
@@ -834,6 +834,10 @@ type CiliumConfig struct {
 	// +optional
 	SkipUpgrade *bool `json:"skipUpgrade,omitempty"`
 
+	// EnableKubeProxyReplacement replaces the default kube-proxy with Cilium's built-in functionality.
+	// +optional
+	EnableKubeProxyReplacement *bool `json:"enableKubeProxyReplacement,omitempty"`
+
 	// RoutingMode indicates the routing tunnel mode to use for Cilium. Accepted values are overlay (geneve tunnel with overlay)
 	// or direct (tunneling disabled with direct routing)
 	// Defaults to overlay.
@@ -863,6 +867,12 @@ func (n *CiliumConfig) IsManaged() bool {
 	return n.SkipUpgrade == nil || !*n.SkipUpgrade
 }
 
+// IsKubeProxyReplacementEnabled checks if the EnableKubeProxyReplacement flag is true. A nil
+// value is treated as false.
+func (n *CiliumConfig) IsKubeProxyReplacementEnabled() bool {
+	return n.EnableKubeProxyReplacement != nil && *n.EnableKubeProxyReplacement
+}
+
 // KindnetdConfig contains configuration specific to the Kindnetd CNI.
 type KindnetdConfig struct{}
 
diff --git a/pkg/networking/cilium/templater.go b/pkg/networking/cilium/templater.go
index e64473a5a56b..06d525263cd6 100644
--- a/pkg/networking/cilium/templater.go
+++ b/pkg/networking/cilium/templater.go
@@ -245,6 +245,13 @@ func templateValues(spec *cluster.Spec, versionsBundle *cluster.VersionsBundle)
 		}
 	}
 
+	if spec.Cluster.Spec.ClusterNetwork.CNIConfig.Cilium.IsKubeProxyReplacementEnabled() {
+		// See https://docs.cilium.io/en/v1.13/network/kubernetes/kubeproxy-free/#kube-proxy-hybrid-modes.
+		// When upgrading beyond Cilium v1.13 this needs to change to 'true', as Cilium has
+		// simplified the API.
+		val["kubeProxyReplacement"] = "strict"
+	}
+
 	return val
 }
 
diff --git a/pkg/providers/cloudstack/config/template-cp.yaml b/pkg/providers/cloudstack/config/template-cp.yaml
index 9442052dcef0..fe607020a4bf 100644
--- a/pkg/providers/cloudstack/config/template-cp.yaml
+++ b/pkg/providers/cloudstack/config/template-cp.yaml
@@ -306,6 +306,12 @@ spec:
           timeAdded: {{ .TimeAdded }}
 {{- end }}
 {{- end }}
+{{- end }}
+{{- with .kubeadmSkipPhases }}
+      skipPhases:
+      {{- range . }}
+      - {{ . }}
+      {{- end }}
 {{- end }}
     joinConfiguration:
       nodeRegistration:
diff --git a/pkg/providers/cloudstack/template.go b/pkg/providers/cloudstack/template.go
index 154387336fe7..067be35f88aa 100644
--- a/pkg/providers/cloudstack/template.go
+++ b/pkg/providers/cloudstack/template.go
@@ -191,6 +191,7 @@ func buildTemplateMapCP(clusterSpec *cluster.Spec) (map[string]interface{}, erro
 		"externalEtcdVersion": versionsBundle.KubeDistro.EtcdVersion,
 		"etcdImage":           versionsBundle.KubeDistro.EtcdImage.VersionedImage(),
 		"eksaSystemNamespace": constants.EksaSystemNamespace,
+		"kubeadmSkipPhases":   []string{},
 	}
 
 	auditPolicy, err := common.GetAuditPolicy(clusterSpec.Cluster.Spec.KubernetesVersion)
@@ -246,6 +247,13 @@ func buildTemplateMapCP(clusterSpec *cluster.Spec) (map[string]interface{}, erro
 		values["encryptionProviderConfig"] = conf
 	}
 
+	cni := clusterSpec.Cluster.Spec.ClusterNetwork
+	if cni.CNIConfig != nil && cni.CNIConfig.Cilium != nil {
+		if cni.CNIConfig.Cilium.IsKubeProxyReplacementEnabled() {
+			values["kubeadmSkipPhases"] = append(values["kubeadmSkipPhases"].([]string), "addon/kube-proxy")
+		}
+	}
+
 	return values, nil
 }
 
diff --git a/pkg/providers/cloudstack/template_test.go b/pkg/providers/cloudstack/template_test.go
index 76c4043d3715..d9580c5524e4 100644
--- a/pkg/providers/cloudstack/template_test.go
+++ b/pkg/providers/cloudstack/template_test.go
@@ -169,3 +169,20 @@ func TestTemplateBuilder_CertSANs(t *testing.T) {
 		test.AssertContentToFile(t, string(data), tc.Output)
 	}
 }
+
+func TestTemplateBuilder_KubeProxyReplacement(t *testing.T) {
+	input := "testdata/cluster_cilium_kube_proxy_replacement.yaml"
+	output := "testdata/expected_cluster_cilium_kube_proxy_replacement.yaml"
+
+	g := NewWithT(t)
+	clusterSpec := test.NewFullClusterSpec(t, input)
+
+	bldr := cloudstack.NewTemplateBuilder(time.Now)
+
+	data, err := bldr.GenerateCAPISpecControlPlane(clusterSpec, func(values map[string]interface{}) {
+		values["controlPlaneTemplateName"] = clusterapi.ControlPlaneMachineTemplateName(clusterSpec.Cluster)
+	})
+	g.Expect(err).ToNot(HaveOccurred())
+
+	test.AssertContentToFile(t, string(data), output)
+}
diff --git a/pkg/providers/cloudstack/testdata/cluster_cilium_kube_proxy_replacement.yaml b/pkg/providers/cloudstack/testdata/cluster_cilium_kube_proxy_replacement.yaml
new file mode 100644
index 000000000000..d7e1eee7f5ba
--- /dev/null
+++ b/pkg/providers/cloudstack/testdata/cluster_cilium_kube_proxy_replacement.yaml
@@ -0,0 +1,69 @@
+apiVersion: anywhere.eks.amazonaws.com/v1alpha1
+kind: Cluster
+metadata:
+  name: test
+  namespace: test
+spec:
+  clusterNetwork:
+    cniConfig:
+      cilium:
+        enableKubeProxyReplacement: true
+    pods:
+      cidrBlocks:
+      - 192.168.0.0/16
+    services:
+      cidrBlocks:
+      - 10.96.0.0/12
+  controlPlaneConfiguration:
+    count: 1
+    endpoint:
+      host: 0.0.0.0
+    machineGroupRef:
+      kind: CloudStackMachineConfig
+      name: test
+  datacenterRef:
+    kind: CloudStackDatacenterConfig
+    name: test
+  kubernetesVersion: "1.21"
+---
+apiVersion: anywhere.eks.amazonaws.com/v1alpha1
+kind: CloudStackDatacenterConfig
+metadata:
+  name: test
+  namespace: test
+spec:
+  availabilityZones:
+  - account: "admin"
+    domain: "domain1"
+    name: "default-az-0"
+    credentialsRef: "global"
+    zone:
+      name: "zone1"
+      network:
+        name: "net1"
+    managementApiEndpoint: "http://127.16.0.1:8080/client/api"
+---
+apiVersion: anywhere.eks.amazonaws.com/v1alpha1
+kind: CloudStackMachineConfig
+metadata:
+  name: test
+  namespace: test
+spec:
+  computeOffering:
+    name: "m4-large"
+  users:
+  - name: "mySshUsername"
+    sshAuthorizedKeys: # The key below was manually generated and not used in any production systems
+    - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC1BK73XhIzjX+meUr7pIYh6RHbvI3tmHeQIXY5lv7aztN1UoX+bhPo3dwo2sfSQn5kuxgQdnxIZ/CTzy0p0GkEYVv3gwspCeurjmu0XmrdmaSGcGxCEWT/65NtvYrQtUE5ELxJ+N/aeZNlK2B7IWANnw/82913asXH4VksV1NYNduP0o1/G4XcwLLSyVFB078q/oEnmvdNIoS61j4/o36HVtENJgYr0idcBvwJdvcGxGnPaqOhx477t+kfJAa5n5dSA5wilIaoXH5i1Tf/HsTCM52L+iNCARvQzJYZhzbWI1MDQwzILtIBEQCJsl2XSqIupleY8CxqQ6jCXt2mhae+wPc3YmbO5rFvr2/EvC57kh3yDs1Nsuj8KOvD78KeeujbR8n8pScm3WDp62HFQ8lEKNdeRNj6kB8WnuaJvPnyZfvzOhwG65/9w13IBl7B1sWxbFnq2rMpm5uHVK7mAmjL0Tt8zoDhcE1YJEnp9xte3/pvmKPkST5Q/9ZtR9P5sI+02jY0fvPkPyC03j2gsPixG7rpOCwpOdbny4dcj0TDeeXJX8er+oVfJuLYz0pNWJcT2raDdFfcqvYA0B0IyNYlj5nWX4RuEcyT3qocLReWPnZojetvAG/H8XwOh7fEVGqHAKOVSnPXCSQJPl6s0H12jPJBDJMTydtYPEszl4/CeQ== testemail@test.com"
+  template:
+    name: "kubernetes_1_21"
+  diskOffering:
+    name: "Small"
+    mountPath: "/data-small"
+    device: "/dev/vdb"
+    filesystem: "ext4"
+    label: "data_disk"
+  symlinks:
+    /var/log/kubernetes: /data-small/var/log/kubernetes
+  affinityGroupIds:
+  - control-plane-anti-affinity
\ No newline at end of file
diff --git a/pkg/providers/cloudstack/testdata/expected_cluster_cilium_kube_proxy_replacement.yaml b/pkg/providers/cloudstack/testdata/expected_cluster_cilium_kube_proxy_replacement.yaml
new file mode 100644
index 000000000000..ebfbfe3f87f3
--- /dev/null
+++ b/pkg/providers/cloudstack/testdata/expected_cluster_cilium_kube_proxy_replacement.yaml
@@ -0,0 +1,406 @@
+apiVersion: cluster.x-k8s.io/v1beta1
+kind: Cluster
+metadata:
+  labels:
+    cluster.x-k8s.io/cluster-name: test
+  name: test
+  namespace: eksa-system
+spec:
+  clusterNetwork:
+    pods:
+      cidrBlocks: [192.168.0.0/16]
+    services:
+      cidrBlocks: [10.96.0.0/12]
+  controlPlaneEndpoint:
+    host: 0.0.0.0
+    port: 6443
+  controlPlaneRef:
+    apiVersion: controlplane.cluster.x-k8s.io/v1beta1
+    kind: KubeadmControlPlane
+    name: test
+  infrastructureRef:
+    apiVersion: infrastructure.cluster.x-k8s.io/v1beta3
+    kind: CloudStackCluster
+    name: test
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta3
+kind: CloudStackCluster
+metadata:
+  name: test
+  namespace: eksa-system
+spec:
+  controlPlaneEndpoint:
+    host: 0.0.0.0
+    port: 6443
+  failureDomains:
+  - name: default-az-0
+    zone:
+      id:
+      name: zone1
+      network:
+        id:
+        name: net1
+    domain: domain1
+    account: admin
+    acsEndpoint:
+      name: global
+      namespace: eksa-system
+---
+apiVersion: controlplane.cluster.x-k8s.io/v1beta1
+kind: KubeadmControlPlane
+metadata:
+  name: test
+  namespace: eksa-system
+spec:
+  machineTemplate:
+    infrastructureRef:
+      apiVersion: infrastructure.cluster.x-k8s.io/v1beta3
+      kind: CloudStackMachineTemplate
+      name: test-control-plane-1
+  kubeadmConfigSpec:
+    clusterConfiguration:
+      imageRepository: public.ecr.aws/eks-distro/kubernetes
+      etcd:
+        local:
+          imageRepository: public.ecr.aws/eks-distro/etcd-io
+          imageTag: v3.4.16-eks-1-21-4
+          extraArgs:
+            cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+      dns:
+        imageRepository: public.ecr.aws/eks-distro/coredns
+        imageTag: v1.8.3-eks-1-21-4
+      apiServer:
+        extraArgs:
+          cloud-provider: external
+          audit-policy-file: /etc/kubernetes/audit-policy.yaml
+          audit-log-path: /var/log/kubernetes/api-audit.log
+          audit-log-maxage: "30"
+          audit-log-maxbackup: "10"
+          audit-log-maxsize: "512"
+          profiling: "false"
+          tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+        extraVolumes:
+        - hostPath: /etc/kubernetes/audit-policy.yaml
+          mountPath: /etc/kubernetes/audit-policy.yaml
+          name: audit-policy
+          pathType: File
+          readOnly: true
+        - hostPath: /var/log/kubernetes
+          mountPath: /var/log/kubernetes
+          name: audit-log-dir
+          pathType: DirectoryOrCreate
+          readOnly: false
+      controllerManager:
+        extraArgs:
+          cloud-provider: external
+          profiling: "false"
+          tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+      scheduler:
+        extraArgs:
+          profiling: "false"
+          tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+    files:
+    - content: |
+        apiVersion: v1
+        kind: Pod
+        metadata:
+          creationTimestamp: null
+          name: kube-vip
+          namespace: kube-system
+        spec:
+          containers:
+          - args:
+            - manager
+            env:
+            - name: vip_arp
+              value: "true"
+            - name: port
+              value: "6443"
+            - name: vip_cidr
+              value: "32"
+            - name: cp_enable
+              value: "true"
+            - name: cp_namespace
+              value: kube-system
+            - name: vip_ddns
+              value: "false"
+            - name: vip_leaderelection
+              value: "true"
+            - name: vip_leaseduration
+              value: "15"
+            - name: vip_renewdeadline
+              value: "10"
+            - name: vip_retryperiod
+              value: "2"
+            - name: address
+              value: 0.0.0.0
+            image: public.ecr.aws/l0g8r8j6/kube-vip/kube-vip:v0.3.7-eks-a-v0.0.0-dev-build.158
+            imagePullPolicy: IfNotPresent
+            name: kube-vip
+            resources: {}
+            securityContext:
+              capabilities:
+                add:
+                - NET_ADMIN
+                - NET_RAW
+            volumeMounts:
+            - mountPath: /etc/kubernetes/admin.conf
+              name: kubeconfig
+          hostNetwork: true
+          volumes:
+          - hostPath:
+              path: /etc/kubernetes/admin.conf
+            name: kubeconfig
+        status: {}
+      owner: root:root
+      path: /etc/kubernetes/manifests/kube-vip.yaml
+    - content: |
+        apiVersion: audit.k8s.io/v1beta1
+        kind: Policy
+        rules:
+        # Log aws-auth configmap changes
+        - level: RequestResponse
+          namespaces: ["kube-system"]
+          verbs: ["update", "patch", "delete"]
+          resources:
+          - group: "" # core
+            resources: ["configmaps"]
+            resourceNames: ["aws-auth"]
+          omitStages:
+          - "RequestReceived"
+        # The following requests were manually identified as high-volume and low-risk,
+        # so drop them.
+        - level: None
+          users: ["system:kube-proxy"]
+          verbs: ["watch"]
+          resources:
+          - group: "" # core
+            resources: ["endpoints", "services", "services/status"]
+        - level: None
+          users: ["kubelet"] # legacy kubelet identity
+          verbs: ["get"]
+          resources:
+          - group: "" # core
+            resources: ["nodes", "nodes/status"]
+        - level: None
+          userGroups: ["system:nodes"]
+          verbs: ["get"]
+          resources:
+          - group: "" # core
+            resources: ["nodes", "nodes/status"]
+        - level: None
+          users:
+          - system:kube-controller-manager
+          - system:kube-scheduler
+          - system:serviceaccount:kube-system:endpoint-controller
+          verbs: ["get", "update"]
+          namespaces: ["kube-system"]
+          resources:
+          - group: "" # core
+            resources: ["endpoints"]
+        - level: None
+          users: ["system:apiserver"]
+          verbs: ["get"]
+          resources:
+          - group: "" # core
+            resources: ["namespaces", "namespaces/status", "namespaces/finalize"]
+        # Don't log HPA fetching metrics.
+        - level: None
+          users:
+          - system:kube-controller-manager
+          verbs: ["get", "list"]
+          resources:
+          - group: "metrics.k8s.io"
+        # Don't log these read-only URLs.
+        - level: None
+          nonResourceURLs:
+          - /healthz*
+          - /version
+          - /swagger*
+        # Don't log events requests.
+        - level: None
+          resources:
+          - group: "" # core
+            resources: ["events"]
+        # node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes
+        - level: Request
+          users: ["kubelet", "system:node-problem-detector", "system:serviceaccount:kube-system:node-problem-detector"]
+          verbs: ["update","patch"]
+          resources:
+          - group: "" # core
+            resources: ["nodes/status", "pods/status"]
+          omitStages:
+          - "RequestReceived"
+        - level: Request
+          userGroups: ["system:nodes"]
+          verbs: ["update","patch"]
+          resources:
+          - group: "" # core
+            resources: ["nodes/status", "pods/status"]
+          omitStages:
+          - "RequestReceived"
+        # deletecollection calls can be large, don't log responses for expected namespace deletions
+        - level: Request
+          users: ["system:serviceaccount:kube-system:namespace-controller"]
+          verbs: ["deletecollection"]
+          omitStages:
+          - "RequestReceived"
+        # Secrets, ConfigMaps, and TokenReviews can contain sensitive & binary data,
+        # so only log at the Metadata level.
+        - level: Metadata
+          resources:
+          - group: "" # core
+            resources: ["secrets", "configmaps"]
+          - group: authentication.k8s.io
+            resources: ["tokenreviews"]
+          omitStages:
+          - "RequestReceived"
+        - level: Request
+          resources:
+          - group: ""
+            resources: ["serviceaccounts/token"]
+        # Get repsonses can be large; skip them.
+        - level: Request
+          verbs: ["get", "list", "watch"]
+          resources:
+          - group: "" # core
+          - group: "admissionregistration.k8s.io"
+          - group: "apiextensions.k8s.io"
+          - group: "apiregistration.k8s.io"
+          - group: "apps"
+          - group: "authentication.k8s.io"
+          - group: "authorization.k8s.io"
+          - group: "autoscaling"
+          - group: "batch"
+          - group: "certificates.k8s.io"
+          - group: "extensions"
+          - group: "metrics.k8s.io"
+          - group: "networking.k8s.io"
+          - group: "policy"
+          - group: "rbac.authorization.k8s.io"
+          - group: "scheduling.k8s.io"
+          - group: "settings.k8s.io"
+          - group: "storage.k8s.io"
+          omitStages:
+          - "RequestReceived"
+        # Default level for known APIs
+        - level: RequestResponse
+          resources:
+          - group: "" # core
+          - group: "admissionregistration.k8s.io"
+          - group: "apiextensions.k8s.io"
+          - group: "apiregistration.k8s.io"
+          - group: "apps"
+          - group: "authentication.k8s.io"
+          - group: "authorization.k8s.io"
+          - group: "autoscaling"
+          - group: "batch"
+          - group: "certificates.k8s.io"
+          - group: "extensions"
+          - group: "metrics.k8s.io"
+          - group: "networking.k8s.io"
+          - group: "policy"
+          - group: "rbac.authorization.k8s.io"
+          - group: "scheduling.k8s.io"
+          - group: "settings.k8s.io"
+          - group: "storage.k8s.io"
+          omitStages:
+          - "RequestReceived"
+        # Default level for all other requests.
+        - level: Metadata
+          omitStages:
+          - "RequestReceived"
+      owner: root:root
+      path: /etc/kubernetes/audit-policy.yaml
+    initConfiguration:
+      nodeRegistration:
+        criSocket: /var/run/containerd/containerd.sock
+        kubeletExtraArgs:
+          provider-id: cloudstack:///'{{ ds.meta_data.instance_id }}'
+          read-only-port: "0"
+          anonymous-auth: "false"
+          tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+        name: "{{ ds.meta_data.hostname }}"
+      skipPhases:
+      - addon/kube-proxy
+    joinConfiguration:
+      nodeRegistration:
+        criSocket: /var/run/containerd/containerd.sock
+        kubeletExtraArgs:
+          provider-id: cloudstack:///'{{ ds.meta_data.instance_id }}'
+          read-only-port: "0"
+          anonymous-auth: "false"
+          tls-cipher-suites: TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
+        name: "{{ ds.meta_data.hostname }}"
+    preKubeadmCommands:
+    - swapoff -a
+    - hostname "{{ ds.meta_data.hostname }}"
+    - echo "::1 ipv6-localhost ipv6-loopback" >/etc/hosts
+    - echo "127.0.0.1 localhost" >>/etc/hosts
+    - echo "127.0.0.1 {{ ds.meta_data.hostname }}" >>/etc/hosts
+    - echo "{{ ds.meta_data.hostname }}" >/etc/hostname
+    - >-
+      if [ ! -L /var/log/kubernetes ] ;
+      then
+        mv /var/log/kubernetes /var/log/kubernetes-$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10) ;
+        mkdir -p /data-small/var/log/kubernetes && ln -s /data-small/var/log/kubernetes /var/log/kubernetes ;
+      else echo "/var/log/kubernetes already symlnk";
+      fi
+    diskSetup:
+      filesystems:
+      - device: /dev/vdb1
+        overwrite: false
+        extraOpts:
+        - -E
+        - lazy_itable_init=1,lazy_journal_init=1
+        filesystem: ext4
+        label: data_disk
+      partitions:
+      - device: /dev/vdb
+        layout: true
+        overwrite: false
+        tableType: gpt
+    mounts:
+    - - LABEL=data_disk
+      - /data-small
+    useExperimentalRetryJoin: true
+    users:
+    - name: mySshUsername
+      sshAuthorizedKeys:
+      - 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC1BK73XhIzjX+meUr7pIYh6RHbvI3tmHeQIXY5lv7aztN1UoX+bhPo3dwo2sfSQn5kuxgQdnxIZ/CTzy0p0GkEYVv3gwspCeurjmu0XmrdmaSGcGxCEWT/65NtvYrQtUE5ELxJ+N/aeZNlK2B7IWANnw/82913asXH4VksV1NYNduP0o1/G4XcwLLSyVFB078q/oEnmvdNIoS61j4/o36HVtENJgYr0idcBvwJdvcGxGnPaqOhx477t+kfJAa5n5dSA5wilIaoXH5i1Tf/HsTCM52L+iNCARvQzJYZhzbWI1MDQwzILtIBEQCJsl2XSqIupleY8CxqQ6jCXt2mhae+wPc3YmbO5rFvr2/EvC57kh3yDs1Nsuj8KOvD78KeeujbR8n8pScm3WDp62HFQ8lEKNdeRNj6kB8WnuaJvPnyZfvzOhwG65/9w13IBl7B1sWxbFnq2rMpm5uHVK7mAmjL0Tt8zoDhcE1YJEnp9xte3/pvmKPkST5Q/9ZtR9P5sI+02jY0fvPkPyC03j2gsPixG7rpOCwpOdbny4dcj0TDeeXJX8er+oVfJuLYz0pNWJcT2raDdFfcqvYA0B0IyNYlj5nWX4RuEcyT3qocLReWPnZojetvAG/H8XwOh7fEVGqHAKOVSnPXCSQJPl6s0H12jPJBDJMTydtYPEszl4/CeQ=='
+      sudo: ALL=(ALL) NOPASSWD:ALL
+    format: cloud-config
+  replicas: 1
+  version: v1.21.2-eks-1-21-4
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta3
+kind: CloudStackMachineTemplate
+metadata:
+  annotations:
+    device.diskoffering.cloudstack.anywhere.eks.amazonaws.com/v1alpha1: /dev/vdb
+    filesystem.diskoffering.cloudstack.anywhere.eks.amazonaws.com/v1alpha1: ext4
+    label.diskoffering.cloudstack.anywhere.eks.amazonaws.com/v1alpha1: data_disk
+    mountpath.diskoffering.cloudstack.anywhere.eks.amazonaws.com/v1alpha1: /data-small
+    symlinks.cloudstack.anywhere.eks.amazonaws.com/v1alpha1: /var/log/kubernetes:/data-small/var/log/kubernetes
+  creationTimestamp: null
+  name: test-control-plane-1
+  namespace: eksa-system
+spec:
+  template:
+    spec:
+      affinityGroupIDs:
+      - control-plane-anti-affinity
+      diskOffering:
+        customSizeInGB: 0
+        device: /dev/vdb
+        filesystem: ext4
+        label: data_disk
+        mountPath: /data-small
+        name: Small
+      offering:
+        name: m4-large
+      sshKey: ""
+      template:
+        name: kubernetes_1_21
+
+---