Skip to content

Commit

Permalink
Make maxUnhealthy count configurable for control plane and worker mac…
Browse files Browse the repository at this point in the history
…hines
  • Loading branch information
abhay-krishna committed Jan 11, 2024
1 parent ed36457 commit 484581d
Show file tree
Hide file tree
Showing 21 changed files with 396 additions and 44 deletions.
2 changes: 2 additions & 0 deletions cmd/eksctl-anywhere/cmd/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ const (
unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout"
nodeStartupTimeoutFlag = "node-startup-timeout"
noTimeoutsFlag = "no-timeouts"
cpMaxUnhealthyFlag = "control-plane-max-unhealthy"
// workerMaxUnhealthyFlag is the CLI flag name used to override the worker machine maxUnhealthy count.
workerMaxUnhealthyFlag = "worker-max-unhealthy"
)

type Operation int
Expand Down
2 changes: 2 additions & 0 deletions cmd/eksctl-anywhere/cmd/createcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
type createClusterOptions struct {
clusterOptions
timeoutOptions
maxUnhealthyOptions
forceClean bool
skipIpCheck bool
hardwareCSVPath string
Expand Down Expand Up @@ -61,6 +62,7 @@ func init() {
createCmd.AddCommand(createClusterCmd)
applyClusterOptionFlags(createClusterCmd.Flags(), &cc.clusterOptions)
applyTimeoutFlags(createClusterCmd.Flags(), &cc.timeoutOptions)
applyMaxUnhealthyFlags(createClusterCmd.Flags(), &cc.maxUnhealthyOptions)
applyTinkerbellHardwareFlag(createClusterCmd.Flags(), &cc.hardwareCSVPath)
aflag.String(aflag.TinkerbellBootstrapIP, &cc.tinkerbellBootstrapIP, createClusterCmd.Flags())
createClusterCmd.Flags().BoolVar(&cc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
Expand Down
17 changes: 17 additions & 0 deletions cmd/eksctl-anywhere/cmd/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/spf13/pflag"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/aws/eks-anywhere/pkg/api/v1alpha1"
"github.com/aws/eks-anywhere/pkg/cluster"
Expand Down Expand Up @@ -38,6 +39,11 @@ type timeoutOptions struct {
noTimeouts bool
}

// maxUnhealthyOptions holds the raw string values of the maxUnhealthy CLI
// flags for control plane and worker machines. The values are kept as strings
// here and converted with intstr.Parse when the CLI config is built.
type maxUnhealthyOptions struct {
	cpMaxUnhealthy, workerMaxUnhealthy string
}

func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {
flagSet.StringVar(&t.cpWaitTimeout, cpWaitTimeoutFlag, clustermanager.DefaultControlPlaneWait.String(), "Override the default control plane wait timeout")
flagSet.StringVar(&t.externalEtcdWaitTimeout, externalEtcdWaitTimeoutFlag, clustermanager.DefaultEtcdWait.String(), "Override the default external etcd wait timeout")
Expand All @@ -47,6 +53,11 @@ func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {
flagSet.BoolVar(&t.noTimeouts, noTimeoutsFlag, false, "Disable timeout for all wait operations")
}

// applyMaxUnhealthyFlags registers the control plane and worker maxUnhealthy
// string flags on flagSet, binding each one to the matching field of m.
func applyMaxUnhealthyFlags(flagSet *pflag.FlagSet, m *maxUnhealthyOptions) {
	// Table of flags to register; order matches the original registration order.
	specs := []struct {
		target       *string
		name         string
		defaultValue string
		usage        string
	}{
		{
			target:       &m.cpMaxUnhealthy,
			name:         cpMaxUnhealthyFlag,
			defaultValue: constants.DefaultControlPlaneMaxUnhealthy,
			usage:        "Override the default control plane maxUnhealthy count",
		},
		{
			target:       &m.workerMaxUnhealthy,
			name:         workerMaxUnhealthyFlag,
			defaultValue: constants.DefaultWorkerMaxUnhealthy,
			usage:        "Override the default worker machine maxUnhealthy count",
		},
	}

	for _, spec := range specs {
		flagSet.StringVar(spec.target, spec.name, spec.defaultValue, spec.usage)
	}
}

// buildClusterManagerOpts builds options for constructing a ClusterManager from CLI flags.
// datacenterKind is an API kind such as v1alpha1.TinkerbellDatacenterKind.
func buildClusterManagerOpts(t timeoutOptions, datacenterKind string) (*dependencies.ClusterManagerTimeoutOptions, error) {
Expand Down Expand Up @@ -196,6 +207,9 @@ func buildCreateCliConfig(clusterOptions *createClusterOptions) (*config.CreateC
createCliConfig.NodeStartupTimeout = nodeStartupTimeout
createCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout

createCliConfig.ControlPlaneMaxUnhealthy = intstr.Parse(clusterOptions.cpMaxUnhealthy)
createCliConfig.WorkerMaxUnhealthy = intstr.Parse(clusterOptions.workerMaxUnhealthy)

Check warning on line 212 in cmd/eksctl-anywhere/cmd/options.go

View check run for this annotation

Codecov / codecov/patch

cmd/eksctl-anywhere/cmd/options.go#L210-L212

Added lines #L210 - L212 were not covered by tests
return createCliConfig, nil
}

Expand All @@ -222,6 +236,9 @@ func buildUpgradeCliConfig(clusterOptions *upgradeClusterOptions) (*config.Upgra
upgradeCliConfig.NodeStartupTimeout = nodeStartupTimeout
upgradeCliConfig.UnhealthyMachineTimeout = unhealthyMachineTimeout

upgradeCliConfig.ControlPlaneMaxUnhealthy = intstr.Parse(clusterOptions.cpMaxUnhealthy)
upgradeCliConfig.WorkerMaxUnhealthy = intstr.Parse(clusterOptions.workerMaxUnhealthy)

Check warning on line 241 in cmd/eksctl-anywhere/cmd/options.go

View check run for this annotation

Codecov / codecov/patch

cmd/eksctl-anywhere/cmd/options.go#L239-L241

Added lines #L239 - L241 were not covered by tests
return &upgradeCliConfig, nil
}

Expand Down
2 changes: 2 additions & 0 deletions cmd/eksctl-anywhere/cmd/upgradecluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
type upgradeClusterOptions struct {
clusterOptions
timeoutOptions
maxUnhealthyOptions
wConfig string
forceClean bool
hardwareCSVPath string
Expand Down Expand Up @@ -68,6 +69,7 @@ func init() {
upgradeCmd.AddCommand(upgradeClusterCmd)
applyClusterOptionFlags(upgradeClusterCmd.Flags(), &uc.clusterOptions)
applyTimeoutFlags(upgradeClusterCmd.Flags(), &uc.timeoutOptions)
applyMaxUnhealthyFlags(upgradeClusterCmd.Flags(), &uc.maxUnhealthyOptions)
applyTinkerbellHardwareFlag(upgradeClusterCmd.Flags(), &uc.hardwareCSVPath)
upgradeClusterCmd.Flags().StringVarP(&uc.wConfig, "w-config", "w", "", "Kubeconfig file to use when upgrading a workload cluster")
upgradeClusterCmd.Flags().BoolVar(&uc.forceClean, "force-cleanup", false, "Force deletion of previously created bootstrap cluster")
Expand Down
54 changes: 54 additions & 0 deletions config/crd/bases/anywhere.eks.amazonaws.com_clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ spec:
name:
type: string
type: object
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the control
plane machine-level override for the node startup timeout in
machine health checks. If not configured, the nodeStartupTimeout
in the top-level MHC config will be used.
type: string
skipLoadBalancerDeployment:
description: SkipLoadBalancerDeployment skip deploying control
plane load balancer. Make sure your infrastructure can handle
Expand Down Expand Up @@ -221,6 +227,12 @@ spec:
- key
type: object
type: array
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure the
control plane machine-level override for the unhealthy machine
timeout in machine health checks. If not configured, the unhealthyMachineTimeout
in the top-level MHC config will be used.
type: string
upgradeRolloutStrategy:
description: UpgradeRolloutStrategy determines the rollout strategy
to use for rolling upgrades and related parameters/knobs
Expand Down Expand Up @@ -342,6 +354,17 @@ spec:
to wait to remediate unhealthy machine or determine health of nodes'
machines.
properties:
controlPlaneMaxUnhealthy:
anyOf:
- type: integer
- type: string
description: ControlPlaneMaxUnhealthy is used to configure the
maximum number of unhealthy control plane machines in machine
health checks. If the number of unhealthy control plane machines
exceeds the limit set by controlPlaneMaxUnhealthy, further remediation
will not be performed. If not configured, the default value
is set to "100%".
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines how
Expand All @@ -357,6 +380,16 @@ spec:
the machines are considered unhealthy. If not configured, the
default value is set to "5m0s" (5 minutes).
type: string
workerMaxUnhealthy:
anyOf:
- type: integer
- type: string
description: WorkerMaxUnhealthy is used to configure the maximum
number of unhealthy worker machines in machine health checks.
If the number of unhealthy worker machines exceeds the limit
set by workerMaxUnhealthy, further remediation will not be performed.
If not configured, the default value is set to "40%".
x-kubernetes-int-or-string: true
type: object
managementCluster:
properties:
Expand Down Expand Up @@ -534,9 +567,24 @@ spec:
name:
type: string
type: object
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the worker machine-level
override for maximum unhealthy count in machine health checks.
If not configured, the workerMaxUnhealthy in the top-level
MHC config will be used.
x-kubernetes-int-or-string: true
name:
description: Name refers to the name of the worker node group
type: string
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the worker
machine-level override for the node startup timeout in machine
health checks. If not configured, the nodeStartupTimeout in
the top-level MHC config will be used.
type: string
taints:
description: Taints define the set of taints to be applied on
worker nodes
Expand Down Expand Up @@ -567,6 +615,12 @@ spec:
- key
type: object
type: array
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure the
worker machine-level override for the unhealthy machine timeout
in machine health checks. If not configured, the unhealthyMachineTimeout
in the top-level MHC config will be used.
type: string
upgradeRolloutStrategy:
description: UpgradeRolloutStrategy determines the rollout strategy
to use for rolling upgrades and related parameters/knobs
Expand Down
2 changes: 0 additions & 2 deletions config/crd/bases/anywhere.eks.amazonaws.com_nodeupgrades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ spec:
nodes and ignored for worker nodes.
type: boolean
kubernetesVersion:
description: 'TODO(in-place): Determine if there''s a way to get these
dynamically instead of expecting it from the CRD.'
type: string
machine:
description: Machine is a reference to the CAPI Machine that needs
Expand Down
85 changes: 81 additions & 4 deletions config/manifest/eksa-components.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3740,24 +3740,51 @@ spec:
in the cluster
properties:
cilium:
description: CiliumConfig contains configuration specific
to the Cilium CNI.
properties:
egressMasqueradeInterfaces:
description: EgressMasqueradeInterfaces determines which
network interfaces are used for masquerading. Accepted
values are a valid interface name or interface prefix.
type: string
ipv4NativeRoutingCIDR:
description: IPv4NativeRoutingCIDR specifies the CIDR
to use when RoutingMode is set to direct. When specified,
Cilium assumes networking for this CIDR is preconfigured
and hands traffic destined for that range to the Linux
network stack without applying any SNAT. If this is
not set autoDirectNodeRoutes will be set to true
type: string
ipv6NativeRoutingCIDR:
description: IPv6NativeRoutingCIDR specifies the IPv6
CIDR to use when RoutingMode is set to direct. When
specified, Cilium assumes networking for this CIDR is
preconfigured and hands traffic destined for that range
to the Linux network stack without applying any SNAT.
If this is not set autoDirectNodeRoutes will be set
to true
type: string
policyEnforcementMode:
description: PolicyEnforcementMode determines communication
allowed between pods. Accepted values are default, always,
never.
type: string
routingMode:
description: RoutingMode indicates the routing tunnel
mode to use for Cilium. Accepted values are overlay
(geneve tunnel with overlay) or direct (tunneling disabled
with direct routing) Defaults to overlay.
type: string
skipUpgrade:
description: SkipUpgrade indicates that Cilium maintenance
should be skipped during upgrades. This can be used
when operators wish to self manage the Cilium installation.
type: boolean
type: object
kindnetd:
description: KindnetdConfig contains configuration specific
to the Kindnetd CNI.
type: object
type: object
dns:
Expand Down Expand Up @@ -3833,6 +3860,12 @@ spec:
name:
type: string
type: object
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the control
plane machine-level override for the node startup timeout in
machine health checks. If not configured, the nodeStartupTimeout
in the top-level MHC config will be used.
type: string
skipLoadBalancerDeployment:
description: SkipLoadBalancerDeployment skip deploying control
plane load balancer. Make sure your infrastructure can handle
Expand Down Expand Up @@ -3868,6 +3901,12 @@ spec:
- key
type: object
type: array
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure the
control plane machine-level override for the unhealthy machine
timeout in machine health checks. If not configured, the unhealthyMachineTimeout
in the top-level MHC config will be used.
type: string
upgradeRolloutStrategy:
description: UpgradeRolloutStrategy determines the rollout strategy
to use for rolling upgrades and related parameters/knobs
Expand Down Expand Up @@ -3989,6 +4028,17 @@ spec:
to wait to remediate unhealthy machine or determine health of nodes'
machines.
properties:
controlPlaneMaxUnhealthy:
anyOf:
- type: integer
- type: string
description: ControlPlaneMaxUnhealthy is used to configure the
maximum number of unhealthy control plane machines in machine
health checks. If the number of unhealthy control plane machines
exceeds the limit set by controlPlaneMaxUnhealthy, further remediation
will not be performed. If not configured, the default value
is set to "100%".
x-kubernetes-int-or-string: true
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the node
startup timeout in machine health checks. It determines how
Expand All @@ -4004,6 +4054,16 @@ spec:
the machines are considered unhealthy. If not configured, the
default value is set to "5m0s" (5 minutes).
type: string
workerMaxUnhealthy:
anyOf:
- type: integer
- type: string
description: WorkerMaxUnhealthy is used to configure the maximum
number of unhealthy worker machines in machine health checks.
If the number of unhealthy worker machines exceeds the limit
set by workerMaxUnhealthy, further remediation will not be performed.
If not configured, the default value is set to "40%".
x-kubernetes-int-or-string: true
type: object
managementCluster:
properties:
Expand Down Expand Up @@ -4181,9 +4241,24 @@ spec:
name:
type: string
type: object
maxUnhealthy:
anyOf:
- type: integer
- type: string
description: MaxUnhealthy is used to configure the worker machine-level
override for maximum unhealthy count in machine health checks.
If not configured, the workerMaxUnhealthy in the top-level
MHC config will be used.
x-kubernetes-int-or-string: true
name:
description: Name refers to the name of the worker node group
type: string
nodeStartupTimeout:
description: NodeStartupTimeout is used to configure the worker
machine-level override for the node startup timeout in machine
health checks. If not configured, the nodeStartupTimeout in
the top-level MHC config will be used.
type: string
taints:
description: Taints define the set of taints to be applied on
worker nodes
Expand Down Expand Up @@ -4214,6 +4289,12 @@ spec:
- key
type: object
type: array
unhealthyMachineTimeout:
description: UnhealthyMachineTimeout is used to configure the
worker machine-level override for the unhealthy machine timeout
in machine health checks. If not configured, the unhealthyMachineTimeout
in the top-level MHC config will be used.
type: string
upgradeRolloutStrategy:
description: UpgradeRolloutStrategy determines the rollout strategy
to use for rolling upgrades and related parameters/knobs
Expand Down Expand Up @@ -4440,10 +4521,6 @@ spec:
upgraded:
format: int64
type: integer
required:
- ready
- requireUpgrade
- upgraded
type: object
type: object
served: true
Expand Down
Loading

0 comments on commit 484581d

Please sign in to comment.