diff --git a/charts/agent-stack-k8s/values.schema.json b/charts/agent-stack-k8s/values.schema.json
index 0f23fc7a..77b80fb7 100644
--- a/charts/agent-stack-k8s/values.schema.json
+++ b/charts/agent-stack-k8s/values.schema.json
@@ -174,7 +174,7 @@
       "image": {
         "type": "string",
         "default": "",
-        "title": "The image Schema",
+        "title": "The container image used to obtain buildkite-agent, and for running commands without any k8s-specific configuration. The default for each release of agent-stack-k8s is set to a version tag of ghcr.io/buildkite/agent matching agent-stack-k8s's go.mod file",
         "examples": [""]
       },
       "debug": {
@@ -232,6 +232,18 @@
         },
         "examples": [["SECRET_RECIPE"]]
       },
+      "image-pull-backoff-grace-period": {
+        "type": "string",
+        "default": "30s",
+        "title": "Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled). Must be a Go duration string",
+        "examples": ["60s"]
+      },
+      "job-cancel-checker-poll-interval": {
+        "type": "string",
+        "default": "5s",
+        "title": "Controls the interval between job state queries while a pod is still Pending. Must be a Go duration string",
+        "examples": ["10s"]
+      },
       "prohibit-kubernetes-plugin": {
         "type": "boolean",
         "default": false,
diff --git a/cmd/controller/controller.go b/cmd/controller/controller.go
index a2f666af..d091d9b1 100644
--- a/cmd/controller/controller.go
+++ b/cmd/controller/controller.go
@@ -90,6 +90,11 @@ func AddConfigFlags(cmd *cobra.Command) {
         config.DefaultImagePullBackOffGracePeriod,
         "Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled)",
     )
+    cmd.Flags().Duration(
+        "job-cancel-checker-poll-interval",
+        config.DefaultJobCancelCheckerPollInterval,
+        "Controls the interval between job state queries while a pod is still Pending",
+    )
     cmd.Flags().Bool(
         "prohibit-kubernetes-plugin",
         false,
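Editorial note, not part of the patch: both new settings are Go duration strings, as the schema titles say, and the cobra/pflag Duration flag added above accepts the same syntax. A minimal sketch of how such values behave with the standard library's time.ParseDuration — a unit suffix is required, so a bare "30" is rejected:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        for _, v := range []string{"30s", "5s", "1m30s", "30"} {
            d, err := time.ParseDuration(v)
            if err != nil {
                // "30" lands here: time.ParseDuration reports a missing unit.
                fmt.Printf("%q is not a valid Go duration: %v\n", v, err)
                continue
            }
            fmt.Printf("%q parses as %v\n", v, d)
        }
    }
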
ptr("http://agent.buildkite.localhost/v3"), }, diff --git a/examples/config.yaml b/examples/config.yaml index 0143f797..f2d1c25b 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -3,6 +3,7 @@ debug: true image: my.registry.dev/buildkite-agent:latest job-ttl: 5m image-pull-backoff-grace-period: 60s +job-cancel-checker-poll-interval: 10s poll-interval: 5s max-in-flight: 100 namespace: my-buildkite-ns diff --git a/internal/controller/config/config.go b/internal/controller/config/config.go index 985ea5fb..087500c6 100644 --- a/internal/controller/config/config.go +++ b/internal/controller/config/config.go @@ -11,11 +11,12 @@ import ( ) const ( - UUIDLabel = "buildkite.com/job-uuid" - BuildURLAnnotation = "buildkite.com/build-url" - JobURLAnnotation = "buildkite.com/job-url" - DefaultNamespace = "default" - DefaultImagePullBackOffGracePeriod = 30 * time.Second + UUIDLabel = "buildkite.com/job-uuid" + BuildURLAnnotation = "buildkite.com/build-url" + JobURLAnnotation = "buildkite.com/job-url" + DefaultNamespace = "default" + DefaultImagePullBackOffGracePeriod = 30 * time.Second + DefaultJobCancelCheckerPollInterval = 5 * time.Second ) var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version() @@ -40,10 +41,11 @@ type Config struct { // ClusterUUID field is mandatory for most new orgs. // Some old orgs allows unclustered setup. - ClusterUUID string `json:"cluster-uuid" validate:"omitempty"` - AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"` - PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"` - ImagePullBackOffGradePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"` + ClusterUUID string `json:"cluster-uuid" validate:"omitempty"` + AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"` + PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"` + ImagePullBackOffGracePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"` + JobCancelCheckerPollInterval time.Duration `json:"job-cancel-checker-poll-interval" validate:"omitempty"` AgentConfig *AgentConfig `json:"agent-config" validate:"omitempty"` DefaultCheckoutParams *CheckoutParams `json:"default-checkout-params" validate:"omitempty"` @@ -83,7 +85,8 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error { if err := enc.AddReflected("pod-spec-patch", c.PodSpecPatch); err != nil { return err } - enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGradePeriod) + enc.AddDuration("image-pull-backoff-grace-period", c.ImagePullBackOffGracePeriod) + enc.AddDuration("job-cancel-checker-poll-interval", c.JobCancelCheckerPollInterval) if err := enc.AddReflected("agent-config", c.AgentConfig); err != nil { return err } diff --git a/internal/controller/controller.go b/internal/controller/controller.go index d496052b..8f042914 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -80,13 +80,13 @@ func Run( logger.Fatal("failed to register completions informer", zap.Error(err)) } - imagePullBackOffWatcher := scheduler.NewImagePullBackOffWatcher( - logger.Named("imagePullBackoffWatcher"), + podWatcher := scheduler.NewPodWatcher( + logger.Named("podWatcher"), k8sClient, cfg, ) - if err := imagePullBackOffWatcher.RegisterInformer(ctx, informerFactory); err != nil { - logger.Fatal("failed to register imagePullBackoffWatcher informer", zap.Error(err)) + if err := podWatcher.RegisterInformer(ctx, 
diff --git a/internal/controller/controller.go b/internal/controller/controller.go
index d496052b..8f042914 100644
--- a/internal/controller/controller.go
+++ b/internal/controller/controller.go
@@ -80,13 +80,13 @@ func Run(
         logger.Fatal("failed to register completions informer", zap.Error(err))
     }
 
-    imagePullBackOffWatcher := scheduler.NewImagePullBackOffWatcher(
-        logger.Named("imagePullBackoffWatcher"),
+    podWatcher := scheduler.NewPodWatcher(
+        logger.Named("podWatcher"),
         k8sClient,
         cfg,
     )
-    if err := imagePullBackOffWatcher.RegisterInformer(ctx, informerFactory); err != nil {
-        logger.Fatal("failed to register imagePullBackoffWatcher informer", zap.Error(err))
+    if err := podWatcher.RegisterInformer(ctx, informerFactory); err != nil {
+        logger.Fatal("failed to register podWatcher informer", zap.Error(err))
     }
 
     select {
diff --git a/internal/controller/scheduler/imagePullBackOffWatcher.go b/internal/controller/scheduler/pod_watcher.go
similarity index 51%
rename from internal/controller/scheduler/imagePullBackOffWatcher.go
rename to internal/controller/scheduler/pod_watcher.go
index 14a8a76a..3744c551 100644
--- a/internal/controller/scheduler/imagePullBackOffWatcher.go
+++ b/internal/controller/scheduler/pod_watcher.go
@@ -7,6 +7,7 @@ import (
     "regexp"
     "slices"
     "strings"
+    "sync"
     "time"
 
     "github.com/buildkite/agent-stack-k8s/v2/api"
@@ -19,111 +20,174 @@ import (
     "go.uber.org/zap"
     corev1 "k8s.io/api/core/v1"
     policyv1 "k8s.io/api/policy/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/client-go/informers"
     "k8s.io/client-go/kubernetes"
     _ "k8s.io/client-go/tools/cache"
 )
 
-type imagePullBackOffWatcher struct {
+type podWatcher struct {
     logger *zap.Logger
     k8s    kubernetes.Interface
     gql    graphql.Client
     cfg    *config.Config
 
-    // The imagePullBackOffWatcher waits at least this duration after pod
+    // ImagePullBackOff detection waits at least this duration after pod
     // creation before it cancels the job.
-    gracePeriod time.Duration
+    imagePullBackOffGracePeriod time.Duration
 
     // Jobs that we've failed, cancelled, or were found to be in a terminal
     // state.
-    ignoreJobs map[uuid.UUID]struct{}
+    ignoreJobsMu sync.RWMutex
+    ignoreJobs   map[uuid.UUID]struct{}
+
+    // The job cancel checkers query the job state every so often.
+    jobCancelCheckerInterval time.Duration
+
+    // Channels that are closed when a cancel checker should stop.
+    cancelCheckerChsMu sync.Mutex
+    cancelCheckerChs   map[uuid.UUID]*onceChan
+
+    // This is the context passed to RegisterInformer.
+    // It's being stored here (grrrr!) because the k8s ResourceEventHandler
+    // interface doesn't have context args. (Working around an interface in a
+    // library outside of our control is a carve-out from the usual rule.)
+    // The context is needed to ensure job cancel checkers are cleaned up.
+    resourceEventHandlerCtx context.Context
 }
 
-// NewImagePullBackOffWatcher creates an informer that will use the Buildkite
-// GraphQL API to cancel jobs that have pods with containers in the
-// ImagePullBackOff state
-func NewImagePullBackOffWatcher(
-    logger *zap.Logger,
-    k8s kubernetes.Interface,
-    cfg *config.Config,
-) *imagePullBackOffWatcher {
-    return &imagePullBackOffWatcher{
-        logger:      logger,
-        k8s:         k8s,
-        gql:         api.NewClient(cfg.BuildkiteToken, cfg.GraphQLEndpoint),
-        cfg:         cfg,
-        gracePeriod: cfg.ImagePullBackOffGradePeriod,
-        ignoreJobs:  make(map[uuid.UUID]struct{}),
+// NewPodWatcher creates an informer that does various things with pods and
+// Buildkite jobs:
+//   - If a container stays in ImagePullBackOff state for too long, the Buildkite
+//     Agent REST API will be used to fail the job and the pod will be evicted.
+//   - If a container stays in ImagePullBackOff, and the pod somehow got through
+//     all the init containers (including the image pull checks...) the Buildkite
+//     GraphQL API will be used to cancel the job instead.
+//   - If a pod is pending, every so often Buildkite will be checked to see if
+//     the corresponding job has been cancelled so that the pod can be evicted
+//     early.
+func NewPodWatcher(logger *zap.Logger, k8s kubernetes.Interface, cfg *config.Config) *podWatcher {
+    imagePullBackOffGracePeriod := cfg.ImagePullBackOffGracePeriod
+    if imagePullBackOffGracePeriod <= 0 {
+        imagePullBackOffGracePeriod = config.DefaultImagePullBackOffGracePeriod
+    }
+    jobCancelCheckerInterval := cfg.JobCancelCheckerPollInterval
+    if jobCancelCheckerInterval <= 0 {
+        jobCancelCheckerInterval = config.DefaultJobCancelCheckerPollInterval
+    }
+    return &podWatcher{
+        logger:                      logger,
+        k8s:                         k8s,
+        gql:                         api.NewClient(cfg.BuildkiteToken, cfg.GraphQLEndpoint),
+        cfg:                         cfg,
+        imagePullBackOffGracePeriod: imagePullBackOffGracePeriod,
+        jobCancelCheckerInterval:    jobCancelCheckerInterval,
+        ignoreJobs:                  make(map[uuid.UUID]struct{}),
+        cancelCheckerChs:            make(map[uuid.UUID]*onceChan),
+    }
 }
 
 // Creates a Pods informer and registers the handler on it
-func (w *imagePullBackOffWatcher) RegisterInformer(
-    ctx context.Context,
-    factory informers.SharedInformerFactory,
-) error {
+func (w *podWatcher) RegisterInformer(ctx context.Context, factory informers.SharedInformerFactory) error {
     informer := factory.Core().V1().Pods().Informer()
     if _, err := informer.AddEventHandler(w); err != nil {
         return err
     }
+    w.resourceEventHandlerCtx = ctx // 😡
     go factory.Start(ctx.Done())
     return nil
 }
 
-func (w *imagePullBackOffWatcher) OnDelete(obj any) {}
+func (w *podWatcher) OnDelete(maybePod any) {
+    pod, wasPod := maybePod.(*corev1.Pod)
+    if !wasPod {
+        return
+    }
 
-func (w *imagePullBackOffWatcher) OnAdd(maybePod any, isInInitialList bool) {
+    jobUUID, _, err := w.jobUUIDAndLogger(pod)
+    if err != nil {
+        return
+    }
+
+    w.stopJobCancelChecker(jobUUID)
+}
+
+func (w *podWatcher) OnAdd(maybePod any, isInInitialList bool) {
     pod, wasPod := maybePod.(*corev1.Pod)
     if !wasPod {
         return
     }
-    w.cancelImagePullBackOff(context.Background(), pod)
+    w.runChecks(w.resourceEventHandlerCtx, pod)
 }
 
-func (w *imagePullBackOffWatcher) OnUpdate(oldMaybePod, newMaybePod any) {
+func (w *podWatcher) OnUpdate(oldMaybePod, newMaybePod any) {
     oldPod, oldWasPod := newMaybePod.(*corev1.Pod)
     newPod, newWasPod := newMaybePod.(*corev1.Pod)
 
     // This nonsense statement is only necessary because the types are too loose.
     // Most likely both old and new are going to be Pods.
-    if newWasPod {
-        w.cancelImagePullBackOff(context.Background(), newPod)
-    } else if oldWasPod {
-        w.cancelImagePullBackOff(context.Background(), oldPod)
+    switch {
+    case newWasPod:
+        w.runChecks(w.resourceEventHandlerCtx, newPod)
+
+    case oldWasPod:
+        w.runChecks(w.resourceEventHandlerCtx, oldPod)
     }
 }
 
-func (w *imagePullBackOffWatcher) cancelImagePullBackOff(ctx context.Context, pod *corev1.Pod) {
-    log := w.logger.With(zap.String("namespace", pod.Namespace), zap.String("podName", pod.Name))
-    log.Debug("Checking pod for ImagePullBackOff")
-
-    if pod.Status.StartTime == nil {
-        // Status could be unpopulated, or it hasn't started yet.
-        return
-    }
-    startedAt := pod.Status.StartTime.Time
-    if startedAt.IsZero() || time.Since(startedAt) < w.gracePeriod {
-        // Not started yet, or started recently
+func (w *podWatcher) runChecks(ctx context.Context, pod *corev1.Pod) {
+    jobUUID, log, err := w.jobUUIDAndLogger(pod)
+    if err != nil {
         return
     }
 
+    // Check for a container stuck in ImagePullBackOff, and fail or cancel
+    // the job accordingly.
+    w.cancelImagePullBackOff(ctx, log, pod, jobUUID)
+
+    // Check whether the agent container has started yet, and start or stop the
+    // job cancel checker accordingly.
+    w.startOrStopJobCancelChecker(ctx, log, pod, jobUUID)
+}
+
+func (w *podWatcher) jobUUIDAndLogger(pod *corev1.Pod) (uuid.UUID, *zap.Logger, error) {
+    log := w.logger.With(zap.String("namespace", pod.Namespace), zap.String("podName", pod.Name))
+
     rawJobUUID, exists := pod.Labels[config.UUIDLabel]
     if !exists {
-        log.Info("Job UUID label not present. Skipping.")
-        return
+        log.Debug("Job UUID label not present. Skipping.")
+        return uuid.UUID{}, log, errors.New("no job UUID label")
     }
 
     jobUUID, err := uuid.Parse(rawJobUUID)
     if err != nil {
         log.Warn("Job UUID label was not a UUID!", zap.String("jobUUID", rawJobUUID))
-        return
+        return uuid.UUID{}, log, err
     }
 
     log = log.With(zap.String("jobUUID", jobUUID.String()))
 
-    if _, done := w.ignoreJobs[jobUUID]; done {
+    w.ignoreJobsMu.RLock()
+    defer w.ignoreJobsMu.RUnlock()
+
+    if _, ignore := w.ignoreJobs[jobUUID]; ignore {
         log.Debug("Job already failed, canceled, or wasn't in a failable/cancellable state")
+        return jobUUID, log, errors.New("job ignored")
+    }
+    return jobUUID, log, nil
+}
+
+func (w *podWatcher) cancelImagePullBackOff(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
+    log.Debug("Checking pod for ImagePullBackOff")
+
+    if pod.Status.StartTime == nil {
+        // Status could be unpopulated, or it hasn't started yet.
+        return
+    }
+    startedAt := pod.Status.StartTime.Time
+    if startedAt.IsZero() || time.Since(startedAt) < w.imagePullBackOffGracePeriod {
+        // Not started yet, or started recently
         return
     }
@@ -188,7 +252,7 @@ func (w *imagePullBackOffWatcher) cancelImagePullBackOff(ctx context.Context, po
         // If the job is in one of these states, we can neither acquire nor
         // cancel it (now or in the future).
         log.Debug("Job not acquirable or cancelable")
-        w.ignoreJobs[jobUUID] = struct{}{}
+        w.ignoreJob(jobUUID)
 
     default:
         // Most states don't make sense for a command job that we've started
@@ -199,7 +263,7 @@ func (w *imagePullBackOffWatcher) cancelImagePullBackOff(ctx context.Context, po
     }
 }
 
-func (w *imagePullBackOffWatcher) failJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID, images map[string]struct{}) {
+func (w *podWatcher) failJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID, images map[string]struct{}) {
     agentToken, err := fetchAgentToken(ctx, w.logger, w.k8s, w.cfg.Namespace, w.cfg.AgentTokenSecret)
     if err != nil {
         log.Error("Couldn't fetch agent token in order to fail the job", zap.Error(err))
@@ -252,10 +316,10 @@ func (w *imagePullBackOffWatcher) failJob(ctx context.Context, log *zap.Logger,
     // Because eviction isn't instantaneous, the pod can continue to exist
     // for a bit. Record that we've failed the job to avoid trying to fail
     // it again.
-    w.ignoreJobs[jobUUID] = struct{}{}
+    w.ignoreJob(jobUUID)
 }
 
-func (w *imagePullBackOffWatcher) cancelJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
+func (w *podWatcher) cancelJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
     _, err := api.CancelCommandJob(ctx, w.gql, api.JobTypeCommandCancelInput{
         ClientMutationId: pod.Name,
         Id:               jobUUID.String(),
@@ -276,9 +340,118 @@ func (w *imagePullBackOffWatcher) cancelJob(ctx context.Context, log *zap.Logger
 
     // We can avoid repeating the GraphQL queries to fetch and cancel the job
     // (between cancelling and Kubernetes cleaning up the pod) if we got here.
+    w.ignoreJob(jobUUID)
 }
+
+func (w *podWatcher) startOrStopJobCancelChecker(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
+    switch pod.Status.Phase {
+    case corev1.PodPending:
+        w.startJobCancelChecker(ctx, log, pod.ObjectMeta, jobUUID)
+
+    default:
+        // Running: the agent container has started or is about to start, and it
+        // can handle the cancellation and exit.
+        // Succeeded, Failed: it's already over.
+        // Unknown: probably shouldn't interfere.
+        w.stopJobCancelChecker(jobUUID)
+    }
+}
+
+func (w *podWatcher) startJobCancelChecker(ctx context.Context, log *zap.Logger, podMeta metav1.ObjectMeta, jobUUID uuid.UUID) {
+    w.cancelCheckerChsMu.Lock()
+    defer w.cancelCheckerChsMu.Unlock()
+
+    if w.cancelCheckerChs[jobUUID] != nil {
+        // The checker is already running or has run.
+        return
+    }
+    stopCh := make(chan struct{})
+    w.cancelCheckerChs[jobUUID] = &onceChan{ch: stopCh}
+    go w.jobCancelChecker(ctx, stopCh, log, podMeta, jobUUID)
+}
+
+func (w *podWatcher) stopJobCancelChecker(jobUUID uuid.UUID) {
+    w.cancelCheckerChsMu.Lock()
+    defer w.cancelCheckerChsMu.Unlock()
+    w.cancelCheckerChs[jobUUID].closeOnce()
+    delete(w.cancelCheckerChs, jobUUID)
+}
+
+// jobCancelChecker runs a loop that queries Buildkite for the job state, and
+// evicts the pod if the job becomes cancelled. This should only be used for
+// pods that are still pending: stopCh should be closed as soon as the agent
+// container starts running.
+func (w *podWatcher) jobCancelChecker(ctx context.Context, stopCh <-chan struct{}, log *zap.Logger, podMeta metav1.ObjectMeta, jobUUID uuid.UUID) {
+    log.Debug("Checking job state for cancellation")
+    defer log.Debug("Stopped checking job state for cancellation")
+
+    ticker := time.NewTicker(w.jobCancelCheckerInterval)
+    defer ticker.Stop()
+
+    for {
+        select {
+        case <-ctx.Done():
+            return
+
+        case <-stopCh:
+            return
+
+        case <-ticker.C:
+            resp, err := api.GetCommandJob(ctx, w.gql, jobUUID.String())
+            if err != nil {
+                // *shrug* Check again soon.
+                continue
+            }
+            job, ok := resp.Job.(*api.GetCommandJobJobJobTypeCommand)
+            if !ok {
+                log.Warn("Job was not a command job")
+                continue
+            }
+            log = log.With(zap.String("job_state", string(job.State)))
+
+            switch job.State {
+            case api.JobStatesCanceled, api.JobStatesCanceling:
+                log.Info("Evicting pending pod for cancelled job")
+                eviction := &policyv1.Eviction{ObjectMeta: podMeta}
+                if err := w.k8s.PolicyV1().Evictions(w.cfg.Namespace).Evict(ctx, eviction); err != nil {
+                    log.Error("Couldn't evict pod", zap.Error(err))
+                }
+                return
+
+            case api.JobStatesScheduled:
+                // The pod can continue waiting for resources / initializing.
+
+            default:
+                // Assigned, Accepted, Running: Too late. Let the agent within
+                // the pod handle cancellation. Finished, etc: it's already over.
+                // If it's any other state, we probably shouldn't interfere.
+                log.Debug("Ending job cancel checker due to job state")
+                return
+            }
+        }
+    }
+}
+
+func (w *podWatcher) ignoreJob(jobUUID uuid.UUID) {
+    w.ignoreJobsMu.Lock()
+    defer w.ignoreJobsMu.Unlock()
     w.ignoreJobs[jobUUID] = struct{}{}
 }
 
+// onceChan stores a channel and a [sync.Once] to be used for closing the
+// channel at most once.
+type onceChan struct {
+    once sync.Once
+    ch   chan struct{}
+}
+
+func (oc *onceChan) closeOnce() {
+    if oc == nil {
+        return
+    }
+    oc.once.Do(func() { close(oc.ch) })
+}
+
 func shouldCancel(containerStatus *corev1.ContainerStatus) bool {
     return containerStatus.State.Waiting != nil &&
         containerStatus.State.Waiting.Reason == "ImagePullBackOff"
diff --git a/internal/integration/fixtures/cancel-checker.yaml b/internal/integration/fixtures/cancel-checker.yaml
new file mode 100644
index 00000000..96d74244
--- /dev/null
+++ b/internal/integration/fixtures/cancel-checker.yaml
@@ -0,0 +1,17 @@
+steps:
+  - label: ":stopwatch:"
+    agents:
+      queue: "{{.queue}}"
+    plugins:
+      - kubernetes:
+          podSpec:
+            initContainers:
+              - name: snorlax
+                image: buildkite/agent:latest
+                command:
+                  - "sleep 20"
+            containers: # one command needed to make a valid podspec
+              - name: load-bearing
+                image: buildkite/agent:latest
+                command:
+                  - "echo romeo romeo oscar romeo"
diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go
index d84655b5..5e6534ed 100644
--- a/internal/integration/integration_test.go
+++ b/internal/integration/integration_test.go
@@ -4,11 +4,13 @@ import (
     "context"
     "fmt"
     "strconv"
+    "strings"
     "testing"
     "time"
 
     "github.com/buildkite/agent-stack-k8s/v2/api"
     "github.com/buildkite/agent-stack-k8s/v2/internal/controller/scheduler"
+    "github.com/google/uuid"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
     corev1 "k8s.io/api/core/v1"
@@ -470,3 +472,31 @@ func TestInterposerVector(t *testing.T) {
     assert.Contains(t, logs, "Hello World!")
     assert.Contains(t, logs, "Goodbye World!")
 }
+
+func TestCancelCheckerEvictsPod(t *testing.T) {
+    tc := testcase{
+        T:       t,
+        Fixture: "cancel-checker.yaml",
+        Repo:    repoHTTP,
+        GraphQL: api.NewClient(cfg.BuildkiteToken, cfg.GraphQLEndpoint),
+    }.Init()
+    ctx := context.Background()
+    pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
+    tc.StartController(ctx, cfg)
+    build := tc.TriggerBuild(ctx, pipelineID)
+
+    time.Sleep(10 * time.Second)
+
+    _, err := api.BuildCancel(ctx, tc.GraphQL, api.BuildCancelInput{
+        ClientMutationId: uuid.New().String(),
+        Id:               build.Id,
+    })
+    if err != nil {
+        t.Errorf("api.BuildCancel(... %q) error: %v", build.Id, err)
+    }
+    tc.AssertCancelled(ctx, build)
+    logs := tc.FetchLogs(build)
+    if strings.Contains(logs, "Received cancellation signal, interrupting") {
+        t.Error("The agent ran and handled cancellation")
+    }
+}
diff --git a/internal/integration/testcase_test.go b/internal/integration/testcase_test.go
index 95748556..e930b74f 100644
--- a/internal/integration/testcase_test.go
+++ b/internal/integration/testcase_test.go
@@ -274,10 +274,14 @@ func (t testcase) AssertArtifactsContain(build api.Build, expected ...string) {
 
 func (t testcase) AssertFail(ctx context.Context, build api.Build) {
     t.Helper()
-
     assert.Equal(t, api.BuildStatesFailed, t.waitForBuild(ctx, build))
 }
 
+func (t testcase) AssertCancelled(ctx context.Context, build api.Build) {
+    t.Helper()
+    assert.Equal(t, api.BuildStatesCanceled, t.waitForBuild(ctx, build))
+}
+
 func (t testcase) waitForBuild(ctx context.Context, build api.Build) api.BuildStates {
     t.Helper()
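
Editorial note, not part of the patch: the onceChan type introduced in pod_watcher.go exists because closing a Go channel twice panics, and a job's stop channel can be closed from more than one path (OnDelete, a pod phase change in startOrStopJobCancelChecker, and map cleanup). A small, self-contained sketch of the pattern; the main function and names below are illustrative only.

    package main

    import "sync"

    // onceChan mirrors the type added above: a stop channel that can be
    // closed safely from multiple code paths.
    type onceChan struct {
        once sync.Once
        ch   chan struct{}
    }

    func (oc *onceChan) closeOnce() {
        if oc == nil { // a missing map entry yields a nil *onceChan
            return
        }
        oc.once.Do(func() { close(oc.ch) })
    }

    func main() {
        stop := &onceChan{ch: make(chan struct{})}
        go func() { <-stop.ch }() // stands in for a cancel-checker goroutine selecting on stopCh
        stop.closeOnce()
        stop.closeOnce() // second close is a no-op instead of a panic
        var missing *onceChan
        missing.closeOnce() // nil receiver is safe, as in stopJobCancelChecker's map lookup
    }

The nil-receiver check is what lets stopJobCancelChecker call closeOnce on a map lookup that may return nil without a separate existence check.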