Skip to content

Commit

Permalink
Merge pull request #406 from buildkite/clean-up-pending-cancelled
Browse files Browse the repository at this point in the history
Clean up pending pods for cancelled jobs
  • Loading branch information
DrJosh9000 authored Oct 30, 2024
2 parents 54a1888 + 4a41f29 commit 637da43
Show file tree
Hide file tree
Showing 10 changed files with 326 additions and 80 deletions.
14 changes: 13 additions & 1 deletion charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@
"image": {
"type": "string",
"default": "",
"title": "The image Schema",
"title": "The container image used to obtain buildkite-agent, and for running commands without any k8s-specific configuration. The default for each release of agent-stack-k8s is set to a version tag of ghcr.io/buildkite/agent matching agent-stack-k8s's go.mod file",
"examples": [""]
},
"debug": {
Expand Down Expand Up @@ -232,6 +232,18 @@
},
"examples": [["SECRET_RECIPE"]]
},
"image-pull-backoff-grace-period": {
"type": "string",
"default": "30s",
"title": "Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled). Must be a Go duration string",
"examples": ["60s"]
},
"job-cancel-checker-poll-interval": {
"type": "string",
"default": "5s",
"title": "Controls the interval between job state queries while a pod is still Pending. Must be a Go duration string",
"examples": ["10s"]
},
"prohibit-kubernetes-plugin": {
"type": "boolean",
"default": false,
Expand Down
5 changes: 5 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ func AddConfigFlags(cmd *cobra.Command) {
config.DefaultImagePullBackOffGracePeriod,
"Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled)",
)
cmd.Flags().Duration(
"job-cancel-checker-poll-interval",
config.DefaultJobCancelCheckerPollInterval,
"Controls the interval between job state queries while a pod is still Pending",
)
cmd.Flags().Bool(
"prohibit-kubernetes-plugin",
false,
Expand Down
29 changes: 15 additions & 14 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,21 @@ func ptr[T any](v T) *T {

func TestReadAndParseConfig(t *testing.T) {
expected := config.Config{
Debug: true,
AgentTokenSecret: "my-kubernetes-secret",
BuildkiteToken: "my-graphql-enabled-token",
Image: "my.registry.dev/buildkite-agent:latest",
JobTTL: 300 * time.Second,
ImagePullBackOffGradePeriod: 60 * time.Second,
PollInterval: 5 * time.Second,
MaxInFlight: 100,
Namespace: "my-buildkite-ns",
Org: "my-buildkite-org",
Tags: []string{"queue=my-queue", "priority=high"},
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
ProhibitKubernetesPlugin: true,
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
Debug: true,
AgentTokenSecret: "my-kubernetes-secret",
BuildkiteToken: "my-graphql-enabled-token",
Image: "my.registry.dev/buildkite-agent:latest",
JobTTL: 300 * time.Second,
ImagePullBackOffGracePeriod: 60 * time.Second,
JobCancelCheckerPollInterval: 10 * time.Second,
PollInterval: 5 * time.Second,
MaxInFlight: 100,
Namespace: "my-buildkite-ns",
Org: "my-buildkite-org",
Tags: []string{"queue=my-queue", "priority=high"},
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
ProhibitKubernetesPlugin: true,
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
AgentConfig: &config.AgentConfig{
Endpoint: ptr("http://agent.buildkite.localhost/v3"),
},
Expand Down
1 change: 1 addition & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ debug: true
image: my.registry.dev/buildkite-agent:latest
job-ttl: 5m
image-pull-backoff-grace-period: 60s
job-cancel-checker-poll-interval: 10s
poll-interval: 5s
max-in-flight: 100
namespace: my-buildkite-ns
Expand Down
23 changes: 13 additions & 10 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ import (
)

const (
UUIDLabel = "buildkite.com/job-uuid"
BuildURLAnnotation = "buildkite.com/build-url"
JobURLAnnotation = "buildkite.com/job-url"
DefaultNamespace = "default"
DefaultImagePullBackOffGracePeriod = 30 * time.Second
UUIDLabel = "buildkite.com/job-uuid"
BuildURLAnnotation = "buildkite.com/build-url"
JobURLAnnotation = "buildkite.com/job-url"
DefaultNamespace = "default"
DefaultImagePullBackOffGracePeriod = 30 * time.Second
DefaultJobCancelCheckerPollInterval = 5 * time.Second
)

var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
Expand All @@ -40,10 +41,11 @@ type Config struct {

// ClusterUUID field is mandatory for most new orgs.
// Some old orgs allow an unclustered setup.
ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"`
PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`
ImagePullBackOffGradePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"`
ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"`
PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`
ImagePullBackOffGracePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"`
JobCancelCheckerPollInterval time.Duration `json:"job-cancel-checker-poll-interval" validate:"omitempty"`

AgentConfig *AgentConfig `json:"agent-config" validate:"omitempty"`
DefaultCheckoutParams *CheckoutParams `json:"default-checkout-params" validate:"omitempty"`
Expand Down Expand Up @@ -83,7 +85,8 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddReflected("pod-spec-patch", c.PodSpecPatch); err != nil {
return err
}
enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGradePeriod)
enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGracePeriod)
enc.AddDuration("job-cancel-checker-poll-interval", c.JobCancelCheckerPollInterval)
if err := enc.AddReflected("agent-config", c.AgentConfig); err != nil {
return err
}
Expand Down
8 changes: 4 additions & 4 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,13 @@ func Run(
logger.Fatal("failed to register completions informer", zap.Error(err))
}

imagePullBackOffWatcher := scheduler.NewImagePullBackOffWatcher(
logger.Named("imagePullBackoffWatcher"),
podWatcher := scheduler.NewPodWatcher(
logger.Named("podWatcher"),
k8sClient,
cfg,
)
if err := imagePullBackOffWatcher.RegisterInformer(ctx, informerFactory); err != nil {
logger.Fatal("failed to register imagePullBackoffWatcher informer", zap.Error(err))
if err := podWatcher.RegisterInformer(ctx, informerFactory); err != nil {
logger.Fatal("failed to register podWatcher informer", zap.Error(err))
}

select {
Expand Down
Loading

0 comments on commit 637da43

Please sign in to comment.