From d2122d4e3f0b10ef69ee86b457c6f58e1945171b Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 11:07:59 +0300 Subject: [PATCH 01/30] Add pipeline-scheduler pipeline in catalog.info (#39254) As a follow up to PR#39206 and PR#39171, this commit adds a new generic scheduling pipeline in catalog-info that serves as a central point for scheduling any other pipeline. Unfortunately, it's not possible to specify a custom agent (k8s image) yet at the catalog-info level[^1], therefore we still need a small static pipeline -- empty for now -- that uploads the needed steps. [^1]: https://github.com/elastic/ci/blob/71e83d340e3b93ab43fcf16a7a70ac33bdeec6e9/terrazzo/terrazzo/constructs/buildkite/pipelines.py#L787-L842 --- .buildkite/pipeline-scheduler.yml | 0 catalog-info.yaml | 47 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 .buildkite/pipeline-scheduler.yml diff --git a/.buildkite/pipeline-scheduler.yml b/.buildkite/pipeline-scheduler.yml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog-info.yaml b/catalog-info.yaml index 116e5024663..ae37200762b 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1113,3 +1113,50 @@ spec: access_level: BUILD_AND_READ everyone: access_level: READ_ONLY + +--- +# yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json +apiVersion: backstage.io/v1alpha1 +kind: Resource +metadata: + name: beats-pipeline-scheduler + description: 'Scheduled runs of various Beats pipelines per release branch' + links: + - title: 'Scheduled runs of Beats pipelines per release branch' + url: https://buildkite.com/elastic/logstash-pipeline-scheduler +spec: + type: buildkite-pipeline + owner: group:ingest-fp + system: buildkite + implementation: + apiVersion: buildkite.elastic.dev/v1 + kind: Pipeline + metadata: + name: beats-pipeline-scheduler + description: ':alarm_clock: Scheduled runs of various Beats pipelines per release branch' + spec: + repository: elastic/beats + pipeline_file: ".buildkite/pipeline-scheduler.yml" + maximum_timeout_in_minutes: 240 + schedules: + Daily Snapshot DRA: + branch: main + cronline: 30 02 * * * + message: Daily trigger of Iron Bank validation Pipeline per branch + env: + PIPELINES_TO_TRIGGER: 'beats-ironbank-validation' + skip_intermediate_builds: true + provider_settings: + trigger_mode: none + env: + # TODO enable slack notifications when it's tested + ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'false' + SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' + SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' + teams: + ingest-fp: + access_level: MANAGE_BUILD_AND_READ + release-eng: + access_level: BUILD_AND_READ + everyone: + access_level: READ_ONLY From a4b21dcd8435b80a97063e0e69a6d98aaba17d1a Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 14:29:41 +0300 Subject: [PATCH 02/30] Add IronBank validation to cron schedule (#39255) This commit is a follow up to #39254 and adds a schedule for the IronBank validation pipeline to the centralized scheduling pipeline. 
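As a rough illustration (an assumption, not part of this commit), the steps that `.buildkite/pipeline-scheduler.py` writes to `steps.yml` are expected to be ordinary Buildkite trigger steps, one per pipeline named in `PIPELINES_TO_TRIGGER` and per release branch, along these lines:

```yaml
# Hypothetical output of .buildkite/pipeline-scheduler.py; the real script
# determines the pipeline slugs and branches at run time.
steps:
  - label: ":rocket: Trigger beats-ironbank-validation for main"
    trigger: beats-ironbank-validation   # downstream pipeline slug (assumed)
    build:
      branch: "main"
      message: "Daily trigger of Iron Bank validation"
```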
Relates: https://github.com/elastic/ingest-dev/issues/3235 --- .buildkite/pipeline-scheduler.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.buildkite/pipeline-scheduler.yml b/.buildkite/pipeline-scheduler.yml index e69de29bb2d..3f9b628bc63 100644 --- a/.buildkite/pipeline-scheduler.yml +++ b/.buildkite/pipeline-scheduler.yml @@ -0,0 +1,17 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/buildkite/pipeline-schema/main/schema.json + +# this intermediate pipeline is required because we can't specify a custom agent (k8s image) yet +# in catalog-info: https://github.com/elastic/ci/blob/71e83d340e3b93ab43fcf16a7a70ac33bdeec6e9/terrazzo/terrazzo/constructs/buildkite/pipelines.py#L787-L842 + +steps: + - label: ":pipeline: Generate trigger steps for $PIPELINES_TO_TRIGGER" + command: | + set -eo pipefail + .buildkite/pipeline-scheduler.py >steps.yml + echo "~~~ Printing pipeline steps" + yq . steps.yml + echo "~~~ Uploading steps" + buildkite-agent pipeline upload steps.yml + agents: + image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-beats-ci-with-hooks:0.1" + useCustomGlobalHooks: true From e588628b24946595a9c6123cf57b5597b534c72b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 29 Apr 2024 08:40:02 -0400 Subject: [PATCH 03/30] Fix concurrency bugs that could cause data loss in the `aws-s3` input (#39131) This is a cleanup of concurrency and error handling in the `aws-s3` input that could cause several known bugs: - Memory leaks ([1](https://github.com/elastic/integrations/issues/9463), [2](https://github.com/elastic/beats/issues/39052)). This issue was caused because the input could run several scans of its s3 bucket simultaneously, which led to the cleanup routine `s3Poller.Purge` being called many times concurrently. Inefficiencies in this function caused it to accumulate over time, creating many copies of the state data which could overload process memory. Fixed by: * Changing the `s3Poller` run loop to only run one scan at a time, and wait for it to complete before starting the next one. * Having each object persist its own state after completing, instead of waiting until the end of a scan and writing an entire bucket worth of metadata at once. - This also allowed the removal of other metadata: there is no longer any reason to track the detailed acknowledgment state of each "listing" (page of ~1K events during bucket enumeration), so the `states` helper object is now much simpler. - Skipped data due to buggy last-modified calculations ([3](https://github.com/elastic/beats/issues/39065)). The most recent scanned timestamp was calculated incorrectly, causing the input to skip a growing number of events as ingestion progressed. * Fixed by removing the bucket-wide last modified check entirely. This feature was already risky, since objects with earlier creation timestamps can appear after ones with later timestamps, so there is always the possibility to miss objects. Since the value was calculated incorrectly and was discarded between runs, we can remove it without breaking compatibility and reimplement it more safely in the future if needed. - Skipped data because rate limiting is treated as permanent failure ([4](https://github.com/elastic/beats/issues/39114)). The input treats all error types the same, which causes many objects to be skipped for ephemeral errors. * Fixed by creating an error, `errS3DownloadFailure`, that is returned when processing failure is caused by a download error. 
In this case, the S3 workers will not persist the failure to the `states` table, so the object will be retried on the next bucket scan. When this happens the worker also sleeps (using an exponential backoff) before trying the next object. * Exponential backoff was also added to the bucket scanning loop for page listing errors, so the bucket scan is not restarted needlessly. --- x-pack/filebeat/input/awss3/input.go | 40 +- .../input/awss3/input_benchmark_test.go | 14 +- x-pack/filebeat/input/awss3/s3.go | 321 ++++----------- x-pack/filebeat/input/awss3/s3_objects.go | 15 +- .../filebeat/input/awss3/s3_objects_test.go | 9 +- x-pack/filebeat/input/awss3/s3_test.go | 20 +- x-pack/filebeat/input/awss3/state.go | 66 +--- x-pack/filebeat/input/awss3/state_test.go | 2 +- x-pack/filebeat/input/awss3/states.go | 368 +++--------------- x-pack/filebeat/input/awss3/states_test.go | 306 +++------------ 10 files changed, 246 insertions(+), 915 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 733de949f29..bb4a5c15bda 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -13,6 +13,7 @@ import ( "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/smithy-go" @@ -21,7 +22,6 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/feature" - "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/go-concert/unison" @@ -99,21 +99,6 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error - - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - - defer persistentStore.Close() - - states := newStates(inputContext) - err = states.readStatesFrom(persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) if in.config.QueueURL != "" { @@ -158,8 +143,20 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { } defer client.Close() + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) + poller, err := in.createS3Lister(inputContext, ctx, client, states) if err != nil { return fmt.Errorf("failed to initialize s3 poller: %w", err) } @@ -230,7 +227,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -250,6 +247,12 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled } o.UsePathStyle = in.config.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) }) regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) if err != nil { @@ -295,7 +298,6 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli client, s3EventHandlerFactory, states, - persistentStore, bucketID, in.config.BucketListPrefix, in.awsConfig.Region, diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index e05e5b461ca..5d22d141168 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -8,7 +8,6 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" "path/filepath" "runtime" @@ -16,6 +15,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" @@ -132,7 +133,7 @@ type constantS3 struct { var _ s3API = (*constantS3)(nil) func newConstantS3(t testing.TB) *constantS3 { - data, err := ioutil.ReadFile(cloudtrailTestFile) + data, err := os.ReadFile(cloudtrailTestFile) if err != nil { t.Fatal(err) } @@ -342,14 +343,11 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - err = store.Set(awsS3WriteCommitPrefix+"bucket"+listPrefix, &commitWriteState{time.Time{}}) - if err != nil { - errChan <- err - return - } + states, err := newStates(inputCtx, store) + assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}, numberOfWorkers) - s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) if err := s3Poller.Poll(ctx); err != nil { if !errors.Is(err, context.DeadlineExceeded) { diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 5aa8d31e95d..8909f78bb39 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,34 +11,22 
@@ import ( "sync" "time" - "github.com/gofrs/uuid" - "go.uber.org/multierr" + "github.com/aws/aws-sdk-go-v2/aws/ratelimit" "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/timed" ) -const maxCircuitBreaker = 5 - -type commitWriteState struct { - time.Time -} - -type s3ObjectInfo struct { - name string - key string - etag string - lastModified time.Time - listingID string -} +// var instead of const so it can be reduced during unit tests (instead of waiting +// through 10 minutes of retry backoff) +var readerLoopMaxCircuitBreaker = 10 type s3ObjectPayload struct { s3ObjectHandler s3ObjectHandler - s3ObjectInfo s3ObjectInfo - s3ObjectEvent s3EventV2 + objectState state } type s3Poller struct { @@ -48,15 +36,12 @@ type s3Poller struct { region string provider string bucketPollInterval time.Duration - workerSem *awscommon.Sem s3 s3API log *logp.Logger metrics *inputMetrics client beat.Client s3ObjectHandler s3ObjectHandlerFactory states *states - store *statestore.Store - workersListingMap *sync.Map workersProcessingMap *sync.Map } @@ -66,7 +51,6 @@ func newS3Poller(log *logp.Logger, client beat.Client, s3ObjectHandler s3ObjectHandlerFactory, states *states, - store *statestore.Store, bucket string, listPrefix string, awsRegion string, @@ -85,41 +69,17 @@ func newS3Poller(log *logp.Logger, region: awsRegion, provider: provider, bucketPollInterval: bucketPollInterval, - workerSem: awscommon.NewSem(numberOfWorkers), s3: s3, log: log, metrics: metrics, client: client, s3ObjectHandler: s3ObjectHandler, states: states, - store: store, - workersListingMap: new(sync.Map), workersProcessingMap: new(sync.Map), } } -func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, isStored bool) { - id := stateID(info.name, info.key, info.etag, info.lastModified) - previousState := p.states.FindPreviousByID(id) - if !previousState.IsEmpty() { - if isStored { - previousState.MarkAsStored() - } else { - previousState.MarkAsError() - } - - p.states.Update(previousState, info.listingID) - } - - // Manage locks for purging. - if p.states.IsListingFullyStored(info.listingID) { - // locked on processing we unlock when all the object were ACKed - lock, _ := p.workersListingMap.Load(info.listingID) - lock.(*sync.Mutex).Unlock() - } -} - -func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3ObjectHandler, s3EventV2) { +func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { event := s3EventV2{} event.AWSRegion = p.region event.Provider = p.provider @@ -129,275 +89,126 @@ func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3 acker := awscommon.NewEventACKTracker(ctx) - return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event), event + return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event) } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { - var errs []error +func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s3ObjectPayload) { + rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) for s3ObjectPayload := range s3ObjectPayloadChan { - // Process S3 object (download, parse, create events). 
- err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + objHandler := s3ObjectPayload.s3ObjectHandler + state := s3ObjectPayload.objectState - // Wait for all events to be ACKed before proceeding. - s3ObjectPayload.s3ObjectHandler.Wait() + // Process S3 object (download, parse, create events). + err := objHandler.ProcessS3Object() + if errors.Is(err, errS3DownloadFailed) { + // Download errors are ephemeral. Add a backoff delay, then skip to the + // next iteration so we don't mark the object as permanently failed. + rateLimitWaiter.Wait() + continue + } + // Reset the rate limit delay on results that aren't download errors. + rateLimitWaiter.Reset() - info := s3ObjectPayload.s3ObjectInfo + // Wait for downloaded objects to be ACKed. + objHandler.Wait() if err != nil { - event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, - fmt.Errorf( - fmt.Sprintf("failed processing S3 event for object key %q in bucket %q: %%w", - event.S3.Object.Key, event.S3.Bucket.Name), - err)) - - p.handlePurgingLock(info, false) - continue + p.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", + state.Key, state.Bucket, err.Error()) + + // Non-retryable error. + state.Failed = true + } else { + state.Stored = true } - p.handlePurgingLock(info, true) + // Persist the result + p.states.AddState(state) // Metrics p.metrics.s3ObjectsAckedTotal.Inc() } - - return multierr.Combine(errs...) } -func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { +func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { defer close(s3ObjectPayloadChan) bucketName := getBucketNameFromARN(p.bucket) + errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) circuitBreaker := 0 paginator := p.s3.ListObjectsPaginator(bucketName, p.listPrefix) for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) - if err != nil { - if !paginator.HasMorePages() { - break - } + if err != nil { p.log.Warnw("Error when paginating listing.", "error", err) - circuitBreaker++ - if circuitBreaker >= maxCircuitBreaker { - p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) - break + // QuotaExceededError is client-side rate limiting in the AWS sdk, + // don't include it in the circuit breaker count + if !errors.As(err, &ratelimit.QuotaExceededError{}) { + circuitBreaker++ + if circuitBreaker >= readerLoopMaxCircuitBreaker { + p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) + break + } } + // add a backoff delay and try again + errorBackoff.Wait() continue } + // Reset the circuit breaker and the error backoff if a read is successful + circuitBreaker = 0 + errorBackoff.Reset() - listingID, err := uuid.NewV4() - if err != nil { - p.log.Warnw("Error generating UUID for listing page.", "error", err) - continue - } - - // lock for the listing page and state in workersListingMap - // this map is shared with the storedOp and will be unlocked there - lock := new(sync.Mutex) - lock.Lock() - p.workersListingMap.Store(listingID.String(), lock) - - totProcessableObjects := 0 totListedObjects := len(page.Contents) - s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, totListedObjects) // Metrics p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) for _, object := range page.Contents { - state := newState(bucketName, *object.Key, *object.ETag, p.listPrefix, 
*object.LastModified) - if p.states.MustSkip(state, p.store) { + state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) + if p.states.IsProcessed(state) { p.log.Debugw("skipping state.", "state", state) continue } - // we have no previous state or the previous state - // is not stored: refresh the state - previousState := p.states.FindPrevious(state) - if previousState.IsEmpty() || !previousState.IsProcessed() { - p.states.Update(state, "") - } - - s3Processor, event := p.createS3ObjectProcessor(ctx, state) + s3Processor := p.createS3ObjectProcessor(ctx, state) if s3Processor == nil { p.log.Debugw("empty s3 processor.", "state", state) continue } - totProcessableObjects++ - - s3ObjectPayloadChanByPage <- &s3ObjectPayload{ + s3ObjectPayloadChan <- &s3ObjectPayload{ s3ObjectHandler: s3Processor, - s3ObjectInfo: s3ObjectInfo{ - name: bucketName, - key: *object.Key, - etag: *object.ETag, - lastModified: *object.LastModified, - listingID: listingID.String(), - }, - s3ObjectEvent: event, - } - } - - if totProcessableObjects == 0 { - p.log.Debugw("0 processable objects on bucket pagination.", "bucket", p.bucket, "listPrefix", p.listPrefix, "listingID", listingID) - // nothing to be ACKed, unlock here - p.states.DeleteListing(listingID.String()) - lock.Unlock() - } else { - listingInfo := &listingInfo{totObjects: totProcessableObjects} - p.states.AddListing(listingID.String(), listingInfo) - - // Metrics - p.metrics.s3ObjectsProcessedTotal.Add(uint64(totProcessableObjects)) - } - - close(s3ObjectPayloadChanByPage) - for s3ObjectPayload := range s3ObjectPayloadChanByPage { - s3ObjectPayloadChan <- s3ObjectPayload - } - } -} - -func (p *s3Poller) Purge(ctx context.Context) { - listingIDs := p.states.GetListingIDs() - p.log.Debugw("purging listing.", "listingIDs", listingIDs) - for _, listingID := range listingIDs { - // we lock here in order to process the purge only after - // full listing page is ACKed by all the workers - lock, loaded := p.workersListingMap.Load(listingID) - if !loaded { - // purge calls can overlap, GetListingIDs can return - // an outdated snapshot with listing already purged - p.states.DeleteListing(listingID) - p.log.Debugw("deleting already purged listing from states.", "listingID", listingID) - continue - } - - lock.(*sync.Mutex).Lock() - - states := map[string]*state{} - latestStoredTimeByBucketAndListPrefix := make(map[string]time.Time, 0) - - listingStates := p.states.GetStatesByListingID(listingID) - for i, state := range listingStates { - // it is not stored, keep - if !state.IsProcessed() { - p.log.Debugw("state not stored or with error, skip purge", "state", state) - continue + objectState: state, } - var latestStoredTime time.Time - states[state.ID] = &listingStates[i] - latestStoredTime, ok := latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] - if !ok { - var commitWriteState commitWriteState - err := p.store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil { - // we have no entry in the map, and we have no entry in the store - // set zero time - latestStoredTime = time.Time{} - p.log.Debugw("last stored time is zero time", "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } else { - latestStoredTime = commitWriteState.Time - p.log.Debugw("last stored time is commitWriteState", "commitWriteState", commitWriteState, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - } else { - p.log.Debugw("last stored time from memory", "latestStoredTime", latestStoredTime, 
"bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - - if state.LastModified.After(latestStoredTime) { - p.log.Debugw("last stored time updated", "state.LastModified", state.LastModified, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] = state.LastModified - } - } - - for key := range states { - p.states.Delete(key) - } - - if err := p.states.writeStates(p.store); err != nil { - p.log.Errorw("Failed to write states to the registry", "error", err) - } - - for bucketAndListPrefix, latestStoredTime := range latestStoredTimeByBucketAndListPrefix { - if err := p.store.Set(awsS3WriteCommitPrefix+bucketAndListPrefix, commitWriteState{latestStoredTime}); err != nil { - p.log.Errorw("Failed to write commit time to the registry", "error", err) - } - } - - // purge is done, we can unlock and clean - lock.(*sync.Mutex).Unlock() - p.workersListingMap.Delete(listingID) - p.states.DeleteListing(listingID) - - // Listing is removed from all states, we can finalize now - for _, state := range states { - processor, _ := p.createS3ObjectProcessor(ctx, *state) - if err := processor.FinalizeS3Object(); err != nil { - p.log.Errorw("Failed to finalize S3 object", "key", state.Key, "error", err) - } + p.metrics.s3ObjectsProcessedTotal.Inc() } } } func (p *s3Poller) Poll(ctx context.Context) error { - // This loop tries to keep the workers busy as much as possible while - // honoring the number in config opposed to a simpler loop that does one - // listing, sequentially processes every object and then does another listing - workerWg := new(sync.WaitGroup) for ctx.Err() == nil { - // Determine how many S3 workers are available. - workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) - if err != nil { - break - } - - if workers == 0 { - continue - } + var workerWg sync.WaitGroup + workChan := make(chan *s3ObjectPayload) - s3ObjectPayloadChan := make(chan *s3ObjectPayload) - - workerWg.Add(1) - go func() { - defer func() { - workerWg.Done() - }() - - p.GetS3Objects(ctx, s3ObjectPayloadChan) - p.Purge(ctx) - }() - - workerWg.Add(workers) - for i := 0; i < workers; i++ { + // Start the worker goroutines to listen on the work channel + for i := 0; i < p.numberOfWorkers; i++ { + workerWg.Add(1) go func() { - defer func() { - workerWg.Done() - p.workerSem.Release(1) - }() - if err := p.ProcessObject(s3ObjectPayloadChan); err != nil { - p.log.Warnw("Failed processing S3 listing.", "error", err) - } + defer workerWg.Done() + p.workerLoop(ctx, workChan) }() } - err = timed.Wait(ctx, p.bucketPollInterval) - if err != nil { - if errors.Is(err, context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } + // Start reading data and wait for its processing to be done + p.readerLoop(ctx, workChan) + workerWg.Wait() - return err - } + _ = timed.Wait(ctx, p.bucketPollInterval) } - // Wait for all workers to finish. - workerWg.Wait() - if errors.Is(ctx.Err(), context.Canceled) { // A canceled context is a normal shutdown. return nil diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 32911778336..21dfa2243e7 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -43,6 +43,11 @@ type s3ObjectProcessorFactory struct { backupConfig backupConfig } +// errS3DownloadFailed reports problems downloading an S3 object. 
Download errors +// should never treated as permanent, they are just an indication to apply a +// retry backoff until the connection is healthy again. +var errS3DownloadFailed = errors.New("S3 download failure") + func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory { if metrics == nil { // Metrics are optional. Initialize a stub. @@ -135,8 +140,9 @@ func (p *s3ObjectProcessor) ProcessS3Object() error { // Request object (download). contentType, meta, body, err := p.download() if err != nil { - return fmt.Errorf("failed to get s3 object (elapsed_time_ns=%d): %w", - time.Since(start).Nanoseconds(), err) + // Wrap downloadError in the result so the caller knows it's not a + // permanent failure. + return fmt.Errorf("%w: %w", errS3DownloadFailed, err) } defer body.Close() p.s3Metadata = meta @@ -434,10 +440,7 @@ func (p *s3ObjectProcessor) FinalizeS3Object() error { if bucketName == "" { return nil } - backupKey := p.s3Obj.S3.Object.Key - if p.backupConfig.BackupToBucketPrefix != "" { - backupKey = fmt.Sprintf("%s%s", p.backupConfig.BackupToBucketPrefix, backupKey) - } + backupKey := p.backupConfig.BackupToBucketPrefix + p.s3Obj.S3.Object.Key _, err := p.s3.CopyObject(p.ctx, p.s3Obj.S3.Bucket.Name, bucketName, p.s3Obj.S3.Object.Key, backupKey) if err != nil { return fmt.Errorf("failed to copy object to backup bucket: %w", err) diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 6732c12e057..28e8f4f42a5 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -8,7 +8,8 @@ import ( "bytes" "context" "errors" - "io/ioutil" + "io" + "os" "path/filepath" "strings" "testing" @@ -27,7 +28,7 @@ import ( ) func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectOutput) { - data, err := ioutil.ReadFile(filename) + data, err := os.ReadFile(filename) if err != nil { t.Fatal(err) } @@ -39,7 +40,7 @@ func newS3GetObjectResponse(filename string, data []byte, contentType string) *s r := bytes.NewReader(data) getObjectOutput := s3.GetObjectOutput{} getObjectOutput.ContentLength = int64(r.Len()) - getObjectOutput.Body = ioutil.NopCloser(r) + getObjectOutput.Body = io.NopCloser(r) if contentType != "" { getObjectOutput.ContentType = &contentType } @@ -157,7 +158,7 @@ func TestS3ObjectProcessor(t *testing.T) { ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) - assert.True(t, errors.Is(err, errFakeConnectivityFailure), "expected errFakeConnectivityFailure error") + assert.True(t, errors.Is(err, errS3DownloadFailed), "expected errS3DownloadFailed") }) t.Run("no error empty result in download", func(t *testing.T) { diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b94ba7cfb09..be1d65b796e 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/statestore" @@ -134,12 +133,16 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := 
newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) - t.Run("retry after Poll error", func(t *testing.T) { + t.Run("restart bucket scan after paging errors", func(t *testing.T) { + // Change the restart limit to 2 consecutive errors, so the test doesn't + // take too long to run + readerLoopMaxCircuitBreaker = 2 storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) store, err := storeReg.Get("test") if err != nil { @@ -176,13 +179,13 @@ func TestS3Poller(t *testing.T) { // Initial Next gets an error. mockPagerFirst.EXPECT(). HasMorePages(). - Times(10). + Times(2). DoAndReturn(func() bool { return true }) mockPagerFirst.EXPECT(). NextPage(gomock.Any()). - Times(5). + Times(2). DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { return nil, errFakeConnectivityFailure }) @@ -257,8 +260,9 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) } diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index 97fb8d538cd..4b7e09f9e7f 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -5,84 +5,52 @@ package awss3 import ( - "fmt" "time" ) // state is used to communicate the publishing state of a s3 object type state struct { - // ID is used to identify the state in the store, and it is composed by - // Bucket + Key + Etag + LastModified.String(): changing this value or how it is - // composed will break backward compatibilities with entries already in the store. - ID string `json:"id" struct:"id"` Bucket string `json:"bucket" struct:"bucket"` Key string `json:"key" struct:"key"` Etag string `json:"etag" struct:"etag"` LastModified time.Time `json:"last_modified" struct:"last_modified"` - // ListPrefix is used for unique of the key in the store for awsS3WriteCommitPrefix - ListPrefix string `json:"list_prefix" struct:"list_prefix"` - // A state has Stored = true when all events are ACKed. 
Stored bool `json:"stored" struct:"stored"` - // A state has Error = true when ProcessS3Object returned an error - Error bool `json:"error" struct:"error"` + + // Failed is true when ProcessS3Object returned an error other than + // s3DownloadError. + // Before 8.14, this field was called "error". However, that field was + // set for many ephemeral reasons including client-side rate limiting + // (see https://github.com/elastic/beats/issues/39114). Now that we + // don't treat download errors as permanent, the field name was changed + // so that users upgrading from old versions aren't prevented from + // retrying old download failures. + Failed bool `json:"failed" struct:"failed"` } +// ID is used to identify the state in the store, and it is composed by +// Bucket + Key + Etag + LastModified.String(): changing this value or how it is +// composed will break backward compatibilities with entries already in the store. func stateID(bucket, key, etag string, lastModified time.Time) string { return bucket + key + etag + lastModified.String() } // newState creates a new s3 object state -func newState(bucket, key, etag, listPrefix string, lastModified time.Time) state { - s := state{ +func newState(bucket, key, etag string, lastModified time.Time) state { + return state{ Bucket: bucket, Key: key, LastModified: lastModified, Etag: etag, - ListPrefix: listPrefix, - Stored: false, - Error: false, } - - s.ID = stateID(s.Bucket, s.Key, s.Etag, s.LastModified) - - return s } -// MarkAsStored set the stored flag to true -func (s *state) MarkAsStored() { - s.Stored = true -} - -// MarkAsError set the error flag to true -func (s *state) MarkAsError() { - s.Error = true -} - -// IsProcessed checks if the state is either Stored or Error -func (s *state) IsProcessed() bool { - return s.Stored || s.Error +func (s *state) ID() string { + return stateID(s.Bucket, s.Key, s.Etag, s.LastModified) } // IsEqual checks if the two states point to the same s3 object. 
func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) } - -// IsEmpty checks if the state is empty -func (s *state) IsEmpty() bool { - c := state{} - return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) -} - -// String returns string representation of the struct -func (s *state) String() string { - return fmt.Sprintf( - "{ID: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", - s.ID, - s.Bucket, - s.Key, - s.Etag, - s.LastModified) -} diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go index 24a5e9d81b4..375a44ce79e 100644 --- a/x-pack/filebeat/input/awss3/state_test.go +++ b/x-pack/filebeat/input/awss3/state_test.go @@ -61,7 +61,7 @@ func TestStateIsEqual(t *testing.T) { Key: "/key/to/this/file/1", Etag: "etag", LastModified: lastModifed, - Error: true, + Failed: true, }, { Bucket: "bucket a", diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 449219a867f..edbbcc73793 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -15,278 +15,64 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore" ) -const ( - awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" - awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::" -) - -type listingInfo struct { - totObjects int - - mu sync.Mutex - storedObjects int - errorObjects int - finalCheck bool -} +const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" // states handles list of s3 object state. One must use newStates to instantiate a // file states registry. Using the zero-value is not safe. type states struct { - sync.RWMutex - log *logp.Logger - // states store - states []state - - // idx maps state IDs to state indexes for fast lookup and modifications. - idx map[string]int + // Completed S3 object states, indexed by state ID. + // statesLock must be held to access states. + states map[string]state + statesLock sync.Mutex - listingIDs map[string]struct{} - listingInfo *sync.Map - statesByListingID map[string][]state + // The store used to persist state changes to the registry. + // storeLock must be held to access store. + store *statestore.Store + storeLock sync.Mutex } // newStates generates a new states registry. -func newStates(ctx v2.Context) *states { - return &states{ - log: ctx.Logger.Named("states"), - states: nil, - idx: map[string]int{}, - listingInfo: new(sync.Map), - listingIDs: map[string]struct{}{}, - statesByListingID: map[string][]state{}, - } -} - -func (s *states) MustSkip(state state, store *statestore.Store) bool { - if !s.IsNew(state) { - s.log.Debugw("not new state in must skip", "state", state) - return true - } - - previousState := s.FindPrevious(state) - - // status is forgotten. 
if there is no previous state and - // the state.LastModified is before the last cleanStore - // write commit we can remove - var commitWriteState commitWriteState - err := store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil && previousState.IsEmpty() && - (state.LastModified.Before(commitWriteState.Time) || state.LastModified.Equal(commitWriteState.Time)) { - s.log.Debugw("state.LastModified older than writeCommitState in must skip", "state", state, "commitWriteState", commitWriteState) - return true - } - - // the previous state is stored or has error: let's skip - if !previousState.IsEmpty() && previousState.IsProcessed() { - s.log.Debugw("previous state is stored or has error", "state", state) - return true - } - - return false -} - -func (s *states) Delete(id string) { - s.Lock() - defer s.Unlock() - - index := s.findPrevious(id) - if index >= 0 { - last := len(s.states) - 1 - s.states[last], s.states[index] = s.states[index], s.states[last] - s.states = s.states[:last] - - s.idx = map[string]int{} - for i, state := range s.states { - s.idx[state.ID] = i - } - } -} - -// IsListingFullyStored check if listing if fully stored -// After first time the condition is met it will always return false -func (s *states) IsListingFullyStored(listingID string) bool { - info, ok := s.listingInfo.Load(listingID) - if !ok { - return false - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return false - } - - listingInfo.mu.Lock() - defer listingInfo.mu.Unlock() - if listingInfo.finalCheck { - return false - } - - listingInfo.finalCheck = (listingInfo.storedObjects + listingInfo.errorObjects) == listingInfo.totObjects - - if (listingInfo.storedObjects + listingInfo.errorObjects) > listingInfo.totObjects { - s.log.Warnf("unexepected mixmatch between storedObjects (%d), errorObjects (%d) and totObjects (%d)", - listingInfo.storedObjects, listingInfo.errorObjects, listingInfo.totObjects) - } - - return listingInfo.finalCheck -} - -// AddListing add listing info -func (s *states) AddListing(listingID string, listingInfo *listingInfo) { - s.Lock() - defer s.Unlock() - s.listingIDs[listingID] = struct{}{} - s.listingInfo.Store(listingID, listingInfo) -} - -// DeleteListing delete listing info -func (s *states) DeleteListing(listingID string) { - s.Lock() - defer s.Unlock() - delete(s.listingIDs, listingID) - delete(s.statesByListingID, listingID) - s.listingInfo.Delete(listingID) -} - -// Update updates a state. 
If previous state didn't exist, new one is created -func (s *states) Update(newState state, listingID string) { - s.Lock() - defer s.Unlock() - - id := newState.ID - index := s.findPrevious(id) - - if index >= 0 { - s.states[index] = newState - } else { - // No existing state found, add new one - s.idx[id] = len(s.states) - s.states = append(s.states, newState) - s.log.Debug("New state added for ", newState.ID) - } - - if listingID == "" || !newState.IsProcessed() { - return - } - - // here we increase the number of stored object - info, ok := s.listingInfo.Load(listingID) - if !ok { - return - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return - } - - listingInfo.mu.Lock() - - if newState.Stored { - listingInfo.storedObjects++ - } - - if newState.Error { - listingInfo.errorObjects++ - } - - listingInfo.mu.Unlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - s.statesByListingID[listingID] = make([]state, 0) +func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { + states := &states{ + log: ctx.Logger.Named("states"), + states: map[string]state{}, + store: store, } - - s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState) + return states, states.loadFromRegistry() } -// FindPrevious lookups a registered state, that matching the new state. -// Returns a zero-state if no match is found. -func (s *states) FindPrevious(newState state) state { - s.RLock() - defer s.RUnlock() - id := newState.ID - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] +func (s *states) IsProcessed(state state) bool { + s.statesLock.Lock() + defer s.statesLock.Unlock() + // Our in-memory table only stores completed objects + _, ok := s.states[state.ID()] + return ok } -// FindPreviousByID lookups a registered state, that matching the id. -// Returns a zero-state if no match is found. -func (s *states) FindPreviousByID(id string) state { - s.RLock() - defer s.RUnlock() - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] -} - -func (s *states) IsNew(state state) bool { - s.RLock() - defer s.RUnlock() - id := state.ID - i := s.findPrevious(id) - - if i < 0 { - return true - } +func (s *states) AddState(state state) { - return !s.states[i].IsEqual(&state) -} + id := state.ID() + // Update in-memory copy + s.statesLock.Lock() + s.states[id] = state + s.statesLock.Unlock() -// findPrevious returns the previous state for the file. -// In case no previous state exists, index -1 is returned -func (s *states) findPrevious(id string) int { - if i, exists := s.idx[id]; exists { - return i + // Persist to the registry + s.storeLock.Lock() + key := awsS3ObjectStatePrefix + id + if err := s.store.Set(key, state); err != nil { + s.log.Errorw("Failed to write states to the registry", "error", err) } - return -1 -} - -// GetStates creates copy of the file states. 
-func (s *states) GetStates() []state { - s.RLock() - defer s.RUnlock() - - newStates := make([]state, len(s.states)) - copy(newStates, s.states) - - return newStates -} - -// GetListingIDs return a of the listing IDs -func (s *states) GetListingIDs() []string { - s.RLock() - defer s.RUnlock() - listingIDs := make([]string, 0, len(s.listingIDs)) - for listingID := range s.listingIDs { - listingIDs = append(listingIDs, listingID) - } - - return listingIDs -} - -// GetStatesByListingID return a copy of the states by listing ID -func (s *states) GetStatesByListingID(listingID string) []state { - s.RLock() - defer s.RUnlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - return nil - } - - newStates := make([]state, len(s.statesByListingID[listingID])) - copy(newStates, s.statesByListingID[listingID]) - return newStates + s.storeLock.Unlock() } -func (s *states) readStatesFrom(store *statestore.Store) error { - var states []state +func (s *states) loadFromRegistry() error { + states := map[string]state{} - err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + s.storeLock.Lock() + err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { return true, nil } @@ -294,78 +80,30 @@ func (s *states) readStatesFrom(store *statestore.Store) error { // try to decode. Ignore faulty/incompatible values. var st state if err := dec.Decode(&st); err != nil { - // XXX: Do we want to log here? In case we start to store other - // state types in the registry, then this operation will likely fail - // quite often, producing some false-positives in the logs... - return false, err + // Skip this key but continue iteration + s.log.Warnf("invalid S3 state loading object key %v", key) + //nolint:nilerr // One bad object shouldn't stop iteration + return true, nil + } + if !st.Stored && !st.Failed { + // This is from an older version where state could be stored in the + // registry even if the object wasn't processed, or if it encountered + // ephemeral download errors. We don't add these to the in-memory cache, + // so if we see them during a bucket scan we will still retry them. + return true, nil } - st.ID = key[len(awsS3ObjectStatePrefix):] - states = append(states, st) + states[st.ID()] = st return true, nil }) + s.storeLock.Unlock() if err != nil { return err } - states = fixStates(states) - - for _, state := range states { - s.Update(state, "") - } - - return nil -} - -// fixStates cleans up the registry states when updating from an older version -// of filebeat potentially writing invalid entries. -func fixStates(states []state) []state { - if len(states) == 0 { - return states - } - - // we use a map of states here, so to identify and merge duplicate entries. - idx := map[string]*state{} - for i := range states { - state := &states[i] - - old, exists := idx[state.ID] - if !exists { - idx[state.ID] = state - } else { - mergeStates(old, state) // overwrite the entry in 'old' - } - } - - if len(idx) == len(states) { - return states - } - - i := 0 - newStates := make([]state, len(idx)) - for _, state := range idx { - newStates[i] = *state - i++ - } - return newStates -} - -// mergeStates merges 2 states by trying to determine the 'newer' state. -// The st state is overwritten with the updated fields. -func mergeStates(st, other *state) { - // update file meta-data. As these are updated concurrently by the - // inputs, select the newer state based on the update timestamp. 
- if st.LastModified.Before(other.LastModified) { - st.LastModified = other.LastModified - } -} + s.statesLock.Lock() + s.states = states + s.statesLock.Unlock() -func (s *states) writeStates(store *statestore.Store) error { - for _, state := range s.GetStates() { - key := awsS3ObjectStatePrefix + state.ID - if err := store.Set(key, state); err != nil { - return err - } - } return nil } diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 39dc4cf82e6..2f8bbf58fdf 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -14,6 +14,7 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore/storetest" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/elastic-agent-libs/logp" @@ -46,287 +47,92 @@ var inputCtx = v2.Context{ Cancelation: context.Background(), } -func TestStatesIsNewAndMustSkip(t *testing.T) { +func TestStatesAddStateAndIsProcessed(t *testing.T) { type stateTestCase struct { - states func() *states - state state - mustBeNew bool - persistentStoreKV map[string]interface{} - expectedMustSkip bool - expectedIsNew bool + // An initialization callback to invoke on the (initially empty) states. + statesEdit func(states *states) + + // The state to call IsProcessed on and the expected result + state state + expectedIsProcessed bool + + // If true, the test will run statesEdit, then create a new states + // object from the same persistent store before calling IsProcessed + // (to test persistence between restarts). + shouldReload bool } lastModified := time.Date(2022, time.June, 30, 14, 13, 00, 0, time.UTC) + testState1 := newState("bucket", "key", "etag", lastModified) + testState2 := newState("bucket1", "key1", "etag1", lastModified) tests := map[string]stateTestCase{ "with empty states": { - states: func() *states { - return newStates(inputCtx) - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states + statesEdit: func(states *states) { + states.AddState(testState2) }, - state: newState("bucket1", "key1", "etag1", "listPrefix1", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: false, - }, - "with different etag": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag1", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag2", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, - }, - "with different lastmodified": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(1*time.Second)), - expectedMustSkip: false, - 
expectedIsNew: true, - }, - "with stored state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Stored = true - states.Update(aState, "") - return states + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: true, }, - "with error state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Error = true - states.Update(aState, "") - return states + "existing stored state is persisted": { + statesEdit: func(states *states) { + state := testState1 + state.Stored = true + states.AddState(state) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "before commit write": { - states: func() *states { - return newStates(inputCtx) + "existing failed state is persisted": { + statesEdit: func(states *states) { + state := testState1 + state.Failed = true + states.AddState(state) }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(-1*time.Second)), - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "same commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, + "existing unprocessed state is not persisted": { + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: true, - }, - "after commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(time.Second)), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: false, }, } for name, test := range tests { test := test t.Run(name, func(t *testing.T) { - states := test.states() store := openTestStatestore() persistentStore, err := store.Access() if err != nil { t.Fatalf("unexpected err: %v", err) } - for key, value := range test.persistentStoreKV { - _ = persistentStore.Set(key, value) + states, err := newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") + if test.statesEdit != nil { + test.statesEdit(states) } - - if test.mustBeNew { - test.state.LastModified = test.state.LastModified.Add(1 * time.Second) + if test.shouldReload { + states, err = newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") } - isNew := states.IsNew(test.state) - assert.Equal(t, test.expectedIsNew, isNew) - - mustSkip := states.MustSkip(test.state, persistentStore) - assert.Equal(t, test.expectedMustSkip, mustSkip) - }) - } -} - -func TestStatesDelete(t *testing.T) 
{ - type stateTestCase struct { - states func() *states - deleteID string - expected []state - } - - lastModified := time.Date(2021, time.July, 22, 18, 38, 00, 0, time.UTC) - tests := map[string]stateTestCase{ - "delete empty states": { - states: func() *states { - return newStates(inputCtx) - }, - deleteID: "an id", - expected: []state{}, - }, - "delete not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: "an id", - expected: []state{ - { - ID: stateID("bucket", "key", "etag", lastModified), - Bucket: "bucket", - Key: "key", - Etag: "etag", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete only one existing": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: stateID("bucket", "key", "etag", lastModified), - expected: []state{}, - }, - "delete first": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey1etag1" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete last": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey3etag3" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete any": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey2etag2" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - } - - for name, test := range tests { - test := test - t.Run(name, func(t *testing.T) { - states := test.states() - 
states.Delete(test.deleteID) - assert.Equal(t, test.expected, states.GetStates()) + isProcessed := states.IsProcessed(test.state) + assert.Equal(t, test.expectedIsProcessed, isProcessed) }) } } From c2c5fea524317e5724fc9114d3766ca42cf0fccd Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 16:52:51 +0300 Subject: [PATCH 04/30] Fix cron description for Iron Bank validation (#39260) This commit fixes the schedule description for the Iron Bank validation and removes the old static schedule, now that we have a centralized scheduling job (#39254). Additionally, now that the job has been tested ([^1]) it enables slack alerts as well. [^1]: https://github.com/elastic/beats/pull/39255#issuecomment-2082368821 --- catalog-info.yaml | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index ae37200762b..bc22fbc905d 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1087,25 +1087,6 @@ spec: skip_intermediate_builds: false provider_settings: trigger_mode: none - # TODO uncomment out after https://github.com/elastic/ingest-dev/issues/3235 - # schedules: - # # TODO to be replaced with a generic scheduler similar to https://github.com/elastic/logstash/pull/15705 - # Daily run of ironbank validation / main: - # branch: main - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on main - # Daily run of ironbank validation / 8.14: - # branch: 8.14 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 8.14 - # Daily run of ironbank validation / 8.13: - # branch: 8.13 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 8.13 - # Daily run of ironbank validation / 7.17: - # branch: 7.17 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 7.17 teams: ingest-fp: access_level: MANAGE_BUILD_AND_READ @@ -1139,7 +1120,7 @@ spec: pipeline_file: ".buildkite/pipeline-scheduler.yml" maximum_timeout_in_minutes: 240 schedules: - Daily Snapshot DRA: + Daily run of Iron Bank validation: branch: main cronline: 30 02 * * * message: Daily trigger of Iron Bank validation Pipeline per branch @@ -1149,8 +1130,7 @@ spec: provider_settings: trigger_mode: none env: - # TODO enable slack notifications when it's tested - ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'false' + ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' teams: From 37816dd7150029c8cf99ed9ed962db25d8c3e519 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Mon, 29 Apr 2024 17:00:58 +0300 Subject: [PATCH 05/30] Update the Beats packaging pipeline settings (#39263) This commits changes the settings of the packaging pipeline to make it execute only on the selected branches which currently is only main. 
Signed-off-by: Alexandros Sapranidis --- catalog-info.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/catalog-info.yaml b/catalog-info.yaml index bc22fbc905d..0e5a5d864d3 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1046,6 +1046,10 @@ spec: cancel_intermediate_builds: false skip_intermediate_builds: false provider_settings: + build_branches: false + build_pull_request_forks: false + build_pull_requests: false + build_tags: false trigger_mode: code env: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' From 11998672ace0b4e652bfe2edb437bc851880ec7f Mon Sep 17 00:00:00 2001 From: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:46:01 -0400 Subject: [PATCH 06/30] Mark add_docker-metadata process as unsupported in packetbeat (#39241) * Mark add_docker-metadata process as unsupported in packetbeat * Update libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc --- .../add_docker_metadata/docs/add_docker_metadata.asciidoc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc index 53292667f13..61658210173 100644 --- a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc +++ b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc @@ -5,6 +5,11 @@ add_docker_metadata ++++ +ifeval::["{beatname_lc}"=="packetbeat"] +There is currently extremely limited capability for using {beatname_lc} to monitor and coexist with containers, for example Docker, Podman, or Kubernetes. Using the `add_docker_metadata` processor with {beatname_lc} is not recommended nor supported. +endif::[] + +ifeval::["{beatname_lc}"!="packetbeat"] The `add_docker_metadata` processor annotates each event with relevant metadata from Docker containers. At startup it detects a docker environment and caches the metadata. The events are annotated with Docker metadata, only if a valid configuration @@ -88,3 +93,4 @@ forget metadata for a container, 60s by default. `labels.dedot`:: (Optional) Default to be false. If set to true, replace dots in labels with `_`. +endif::[] \ No newline at end of file From 59421bb12602eab337cee0fe6e689262cba89763 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 29 Apr 2024 20:24:21 +0200 Subject: [PATCH 07/30] Document havester_limit for Filestream input and fix typo (#39244) This commit documents `harvester_limit` for the filestream input and replaces `close_*` by the correct key `close.on_state_change.*`. --- .../input-filestream-file-options.asciidoc | 24 +++++++++++++++++++ .../docs/inputs/input-filestream.asciidoc | 5 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index 47a8c819d9e..5436d3863dc 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -517,6 +517,30 @@ less than or equal to `prospector.scanner.check_interval` If `backoff.max` needs to be higher, it is recommended to close the file handler instead and let {beatname_uc} pick up the file again. +[float] +[id="{beatname_lc}-input-{type}-harvester-limit"] +===== `harvester_limit` + +The `harvester_limit` option limits the number of harvesters that are started in +parallel for one input. 
This directly relates to the maximum number of file +handlers that are opened. The default for `harvester_limit` is 0, which means +there is no limit. This configuration is useful if the number of files to be +harvested exceeds the open file handler limit of the operating system. + +Setting a limit on the number of harvesters means that potentially not all files +are opened in parallel. Therefore we recommended that you use this option in +combination with the `close.on_state_change.*` options to make sure +harvesters are stopped more often so that new files can be picked up. + +Currently if a new harvester can be started again, the harvester is picked +randomly. This means it's possible that the harvester for a file that was just +closed and then updated again might be started instead of the harvester for a +file that hasn't been harvested for a longer period of time. + +This configuration option applies per input. You can use this option to +indirectly set higher priorities on certain inputs by assigning a higher +limit of harvesters. + [float] ===== `file_identity` diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index 47d1b24a8e8..54283d6cce7 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -11,8 +11,9 @@ Use the `filestream` input to read lines from active log files. It is the new, improved alternative to the `log` input. It comes with various improvements to the existing input: -1. Checking of `close_*` options happens out of band. Thus, if an output is blocked, -{beatname_uc} can close the reader and avoid keeping too many files open. +1. Checking of `close.on_state_change.*` options happens out of +band. Thus, if an output is blocked, {beatname_uc} can close the +reader and avoid keeping too many files open. 2. Detailed metrics are available for all files that match the `paths` configuration regardless of the `harvester_limit`. 
This way, you can keep track of all files, From 54fb91ed4ee4a697ed3c5cd6a08c5d6671b6f73e Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:20:14 -0400 Subject: [PATCH 08/30] chore: Update snapshot.yml (#39268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index a031c2184e5..b531cf78a51 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-81021969-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-81021969-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-81021969-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 81fc73e634c8f8b49d00c0cc3afc378039a1438e Mon Sep 17 00:00:00 2001 From: Maurizio Branca Date: Mon, 29 Apr 2024 22:48:15 +0200 Subject: [PATCH 09/30] Fix Azure Monitor support for multiple aggregation types (#39204) * Add aggregation type to the MetricRegistry key The MetricRegistry wasn't using the aggregation type in the cache key, returning the wrong answer to the 'needs update?' question. * Handle multiple aggregation types Restores support for multiple aggregation types for the same metric name. Adding tests for the known use cases so we don't miss this feature again in future updates. --- CHANGELOG.next.asciidoc | 1 + x-pack/metricbeat/module/azure/azure_test.go | 39 +++++ x-pack/metricbeat/module/azure/client_test.go | 156 ++++++++++++++++++ x-pack/metricbeat/module/azure/data.go | 103 ++++++++---- x-pack/metricbeat/module/azure/data_test.go | 107 +++++++++++- .../module/azure/metric_registry.go | 9 +- .../module/azure/metric_registry_test.go | 138 +++++++++++++++- .../metricbeat/module/azure/mock_service.go | 2 +- .../module/azure/service_interface.go | 13 +- 9 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 x-pack/metricbeat/module/azure/azure_test.go diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index c10e5eb08fa..f57b7100077 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -159,6 +159,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix fields not being parsed correctly in postgresql/database {issue}25301[25301] {pull}37720[37720] - rabbitmq/queue - Change the mapping type of `rabbitmq.queue.consumers.utilisation.pct` to `scaled_float` from `long` because the values fall within the range of `[0.0, 1.0]`. 
Previously, conversion to integer resulted in reporting either `0` or `1`. - Fix timeout caused by the retrival of which indices are hidden {pull}39165[39165] +- Fix Azure Monitor support for multiple aggregation types {issue}39192[39192] {pull}39204[39204] *Osquerybeat* diff --git a/x-pack/metricbeat/module/azure/azure_test.go b/x-pack/metricbeat/module/azure/azure_test.go new file mode 100644 index 00000000000..c3d67525ddb --- /dev/null +++ b/x-pack/metricbeat/module/azure/azure_test.go @@ -0,0 +1,39 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package azure + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGroupMetricsDefinitionsByResourceId(t *testing.T) { + + t.Run("Group metrics definitions by resource ID", func(t *testing.T) { + metrics := []Metric{ + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-1"}, + }, + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-2"}, + }, + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-3"}, + }, + } + + metricsByResourceId := groupMetricsDefinitionsByResourceId(metrics) + + assert.Equal(t, 1, len(metricsByResourceId)) + assert.Equal(t, 3, len(metricsByResourceId["resource-1"])) + }) +} diff --git a/x-pack/metricbeat/module/azure/client_test.go b/x-pack/metricbeat/module/azure/client_test.go index 79b1742ded0..c23326ac82b 100644 --- a/x-pack/metricbeat/module/azure/client_test.go +++ b/x-pack/metricbeat/module/azure/client_test.go @@ -9,10 +9,12 @@ import ( "testing" "time" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" ) var ( @@ -35,6 +37,7 @@ var ( }, }}}, } + countUnit = armmonitor.MetricUnit("Count") ) func mockMapResourceMetrics(client *Client, resources []*armresources.GenericResourceExpanded, resourceConfig ResourceConfig) ([]Metric, error) { @@ -112,4 +115,157 @@ func TestGetMetricValues(t *testing.T) { assert.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 0) m.AssertExpectations(t) }) + + t.Run("multiple aggregation types", func(t *testing.T) { + client := NewMockClient() + referenceTime := time.Now().UTC() + client.ResourceConfigurations = ResourceConfiguration{ + Metrics: []Metric{ + { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Maximum,Minimum,Average", + TimeGrain: "PT1M", + }, + }, + } + + m := &MockService{} + m.On( + "GetMetricValues", + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + ).Return( + []armmonitor.Metric{{ + ID: to.Ptr("test"), + Name: &armmonitor.LocalizableString{ + Value: to.Ptr("ActiveConnections"), + LocalizedValue: to.Ptr("ActiveConnections"), + }, + Timeseries: []*armmonitor.TimeSeriesElement{{ + Data: []*armmonitor.MetricValue{{ + Average: to.Ptr(1.0), + Maximum: to.Ptr(2.0), + Minimum: to.Ptr(3.0), + TimeStamp: to.Ptr(time.Now()), + }}, + }}, + Type: to.Ptr("Microsoft.Insights/metrics"), + Unit: &countUnit, + DisplayDescription: to.Ptr("Total Active Connections for 
Microsoft.EventHub."), + ErrorCode: to.Ptr("Success"), + }}, + "PT1M", + nil, + ) + + client.AzureMonitorService = m + mr := MockReporterV2{} + + metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr) + + require.Equal(t, len(metricValues), 1) + require.Equal(t, len(metricValues[0].Values), 1) + + assert.Equal(t, *metricValues[0].Values[0].avg, 1.0) + assert.Equal(t, *metricValues[0].Values[0].max, 2.0) + assert.Equal(t, *metricValues[0].Values[0].min, 3.0) + + require.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 1) + + m.AssertExpectations(t) + }) + + t.Run("single aggregation types", func(t *testing.T) { + client := NewMockClient() + referenceTime := time.Now().UTC() + timestamp := time.Now().UTC() + client.ResourceConfigurations = ResourceConfiguration{ + Metrics: []Metric{ + { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Maximum", + TimeGrain: "PT1M", + }, { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Minimum", + TimeGrain: "PT1M", + }, { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Average", + TimeGrain: "PT1M", + }, + }, + } + + m := &MockService{} + + x := []struct { + aggregation string + data []*armmonitor.MetricValue + }{ + {aggregation: "Maximum", data: []*armmonitor.MetricValue{{Maximum: to.Ptr(3.0), TimeStamp: to.Ptr(timestamp)}}}, + {aggregation: "Minimum", data: []*armmonitor.MetricValue{{Minimum: to.Ptr(1.0), TimeStamp: to.Ptr(timestamp)}}}, + {aggregation: "Average", data: []*armmonitor.MetricValue{{Average: to.Ptr(2.0), TimeStamp: to.Ptr(timestamp)}}}, + } + + for _, v := range x { + m.On( + "GetMetricValues", + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + v.aggregation, + mock.Anything, + ).Return( + []armmonitor.Metric{{ + ID: to.Ptr("test"), + Name: &armmonitor.LocalizableString{ + Value: to.Ptr("ActiveConnections"), + LocalizedValue: to.Ptr("ActiveConnections"), + }, + Timeseries: []*armmonitor.TimeSeriesElement{{ + Data: v.data, + }}, + Type: to.Ptr("Microsoft.Insights/metrics"), + Unit: &countUnit, + DisplayDescription: to.Ptr("Total Active Connections for Microsoft.EventHub."), + ErrorCode: to.Ptr("Success"), + }}, + "PT1M", + nil, + ).Once() + } + + client.AzureMonitorService = m + mr := MockReporterV2{} + + metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr) + + require.Equal(t, 3, len(metricValues)) + + require.Equal(t, 1, len(metricValues[0].Values)) + require.Equal(t, 1, len(metricValues[1].Values)) + require.Equal(t, 1, len(metricValues[2].Values)) + + require.NotNil(t, metricValues[0].Values[0].max, "max value is nil") + require.NotNil(t, metricValues[1].Values[0].min, "min value is nil") + require.NotNil(t, metricValues[2].Values[0].avg, "avg value is nil") + + assert.Equal(t, *metricValues[0].Values[0].max, 3.0) + assert.Equal(t, *metricValues[1].Values[0].min, 1.0) + assert.Equal(t, *metricValues[2].Values[0].avg, 2.0) + + m.AssertExpectations(t) + }) } diff --git a/x-pack/metricbeat/module/azure/data.go b/x-pack/metricbeat/module/azure/data.go index c46aee9da24..b2fffb40426 100644 --- a/x-pack/metricbeat/module/azure/data.go +++ b/x-pack/metricbeat/module/azure/data.go @@ -133,41 +133,8 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint { var points []KeyValuePoint for _, metric := range metrics { for _, value := range 
metric.Values { - point := KeyValuePoint{ - Timestamp: value.timestamp, - Dimensions: mapstr.M{}, - } - metricName := managePropertyName(value.name) - switch { - case value.min != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "min") - point.Value = value.min - case value.max != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "max") - point.Value = value.max - case value.avg != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "avg") - point.Value = value.avg - case value.total != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "total") - point.Value = value.total - case value.count != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "count") - point.Value = value.count - } - - point.Namespace = metric.Namespace - point.ResourceId = metric.ResourceId - point.ResourceSubId = metric.ResourceSubId - point.TimeGrain = metric.TimeGrain - - // The number of dimensions in the metric definition and the - // number of dimensions in the metric values should be the same. - // - // But, since definitions and values are retrieved from different - // API endpoints, we need to make sure that we don't panic if the - // number of dimensions is different. + dimensions := mapstr.M{} if len(metric.Dimensions) == len(value.dimensions) { // Take the dimension name from the metric definition and the // dimension value from the metric value. @@ -180,11 +147,75 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint { // Dimensions from metric definition and metric value are // not guaranteed to be in the same order, so we need to // find by name the right value for each dimension. - _, _ = point.Dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) + // _, _ = point.Dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) + _, _ = dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) } } - points = append(points, point) + if value.min != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: value.min, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.max != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "max"), + Value: value.max, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.avg != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "avg"), + Value: value.avg, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.total != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "total"), + Value: value.total, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.count != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "count"), + Value: value.count, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + 
Timestamp: value.timestamp, + }) + } } } diff --git a/x-pack/metricbeat/module/azure/data_test.go b/x-pack/metricbeat/module/azure/data_test.go index 85b781ed64e..1519f78982d 100644 --- a/x-pack/metricbeat/module/azure/data_test.go +++ b/x-pack/metricbeat/module/azure/data_test.go @@ -62,7 +62,37 @@ func TestMapToKeyValuePoints(t *testing.T) { resourceSubId := "test" timeGrain := "PT1M" - t.Run("test aggregation types", func(t *testing.T) { + t.Run("test single aggregation type (single config)", func(t *testing.T) { + + metrics := []Metric{{ + Namespace: namespace, + Names: []string{"test"}, + Aggregations: "min", + Values: []MetricValue{{name: metricName, min: &minValue, timestamp: timestamp}}, + TimeGrain: timeGrain, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + }} + + actual := mapToKeyValuePoints(metrics) + + expected := []KeyValuePoint{ + { + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: &minValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + } + + assert.Equal(t, expected, actual) + }) + + t.Run("test single aggregation types (multiple configs)", func(t *testing.T) { metrics := []Metric{{ Namespace: namespace, @@ -161,4 +191,79 @@ func TestMapToKeyValuePoints(t *testing.T) { assert.Equal(t, expected, actual) }) + + t.Run("test multiple aggregation types (multiple configs)", func(t *testing.T) { + metrics := []Metric{{ + Namespace: namespace, + Names: []string{"test"}, + Aggregations: "Minimum,Maximum,Average,Total,Count", + Values: []MetricValue{ + {name: metricName, min: &minValue, timestamp: timestamp}, + {name: metricName, max: &maxValue, timestamp: timestamp}, + {name: metricName, avg: &avgValue, timestamp: timestamp}, + {name: metricName, total: &totalValue, timestamp: timestamp}, + {name: metricName, count: &countValue, timestamp: timestamp}, + }, + TimeGrain: timeGrain, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + }} + + actual := mapToKeyValuePoints(metrics) + + expected := []KeyValuePoint{ + { + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: &minValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "max"), + Value: &maxValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "avg"), + Value: &avgValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "total"), + Value: &totalValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "count"), + Value: &countValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + } + + assert.Equal(t, expected, actual) + }) } diff --git a/x-pack/metricbeat/module/azure/metric_registry.go b/x-pack/metricbeat/module/azure/metric_registry.go index cdaa9496b5d..c127701c996 100644 
--- a/x-pack/metricbeat/module/azure/metric_registry.go +++ b/x-pack/metricbeat/module/azure/metric_registry.go @@ -5,6 +5,7 @@ package azure import ( + "fmt" "strings" "time" @@ -118,8 +119,14 @@ func (m *MetricRegistry) buildMetricKey(metric Metric) string { keyComponents := []string{ metric.Namespace, metric.ResourceId, + metric.Aggregations, + metric.TimeGrain, + strings.Join(metric.Names, ","), + } + + for _, dim := range metric.Dimensions { + keyComponents = append(keyComponents, fmt.Sprintf("%s=%s", dim.Name, dim.Value)) } - keyComponents = append(keyComponents, metric.Names...) return strings.Join(keyComponents, ",") } diff --git a/x-pack/metricbeat/module/azure/metric_registry_test.go b/x-pack/metricbeat/module/azure/metric_registry_test.go index a0ecdc84b85..63984aa6b59 100644 --- a/x-pack/metricbeat/module/azure/metric_registry_test.go +++ b/x-pack/metricbeat/module/azure/metric_registry_test.go @@ -13,7 +13,7 @@ import ( "github.com/elastic/elastic-agent-libs/logp" ) -func TestNewMetricRegistry(t *testing.T) { +func TestMetricRegistry(t *testing.T) { logger := logp.NewLogger("test azure monitor") t.Run("Collect metrics with a regular 5 minutes period", func(t *testing.T) { @@ -90,4 +90,140 @@ func TestNewMetricRegistry(t *testing.T) { assert.True(t, needsUpdate, "metric should not need update") }) + + t.Run("Metrics with different aggregation types", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "test", + Namespace: "test", + Aggregations: "Maximum", + } + metric2 := Metric{ + ResourceId: "test", + Namespace: "test", + Aggregations: "Minimum", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT5M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT5M", + timestamp: referenceTime, + }) + + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) + + t.Run("Metrics with different dimensions", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT1M", + } + metric2 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-2", Value: "*"}, + }, + TimeGrain: "PT1M", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: referenceTime, + }) 
+ + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) + + t.Run("Metrics with different timegrain", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT1M", + } + metric2 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT5M", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: referenceTime, + }) + + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) } diff --git a/x-pack/metricbeat/module/azure/mock_service.go b/x-pack/metricbeat/module/azure/mock_service.go index 9626952fa6d..293adc7c9a7 100644 --- a/x-pack/metricbeat/module/azure/mock_service.go +++ b/x-pack/metricbeat/module/azure/mock_service.go @@ -43,7 +43,7 @@ func (client *MockService) GetMetricNamespaces(resourceId string) (armmonitor.Me // GetMetricValues is a mock function for the azure service func (client *MockService) GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error) { - args := client.Called(resourceId, namespace) + args := client.Called(resourceId, namespace, timegrain, timespan, metricNames, aggregations, filter) return args.Get(0).([]armmonitor.Metric), args.String(1), args.Error(2) } diff --git a/x-pack/metricbeat/module/azure/service_interface.go b/x-pack/metricbeat/module/azure/service_interface.go index cb524c7f6ea..75ae48d3d6e 100644 --- a/x-pack/metricbeat/module/azure/service_interface.go +++ b/x-pack/metricbeat/module/azure/service_interface.go @@ -15,5 +15,16 @@ type Service interface { GetResourceDefinitions(id []string, group []string, rType string, query string) ([]*armresources.GenericResourceExpanded, error) GetMetricDefinitionsWithRetry(resourceId string, namespace string) (armmonitor.MetricDefinitionCollection, error) GetMetricNamespaces(resourceId string) (armmonitor.MetricNamespaceCollection, error) - GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error) + // GetMetricValues returns the metric values for the given resource ID, namespace, timegrain, timespan, metricNames, aggregations and filter. 
+ // + // If the timegrain is empty, the default timegrain for the metric is used and returned. + GetMetricValues( + resourceId string, // resourceId is the ID of the resource to query (e.g. "/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/{resourceProviderNamespace}/{resourceType}/{resourceName}") + namespace string, // namespace is the metric namespace to query (e.g. "Microsoft.Compute/virtualMachines") + timegrain string, // timegrain is the timegrain to use for the metric query (e.g. "PT1M"); if empty, returns the default timegrain for the metric. + timespan string, // timespan is the time interval to query (e.g. 2024-04-29T14:03:00Z/2024-04-29T14:04:00Z) + metricNames []string, // metricNames is the list of metric names to query (e.g. ["ServiceApiLatency", "Availability"]) + aggregations string, // aggregations is the comma-separated list of aggregations to use for the metric query (e.g. "Average,Maximum,Minimum") + filter string, // filter is the filter to query for dimensions (e.g. "ActivityType eq '*' AND ActivityName eq '*' AND StatusCode eq '*' AND StatusCodeClass eq '*'") + ) ([]armmonitor.Metric, string, error) } From 6bb2a82b684922419561fd3a935ddd0573fcb762 Mon Sep 17 00:00:00 2001 From: Yi Song <166383463+goodfirm@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:23:31 +0800 Subject: [PATCH 10/30] chore: fix function names in comment (#38800) Signed-off-by: goodfirm Co-authored-by: Pierre HILBERT --- dev-tools/mage/kubernetes/kuberemote.go | 2 +- filebeat/input/filestream/environment_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-tools/mage/kubernetes/kuberemote.go b/dev-tools/mage/kubernetes/kuberemote.go index 8e9d9897d44..e3062f00d1a 100644 --- a/dev-tools/mage/kubernetes/kuberemote.go +++ b/dev-tools/mage/kubernetes/kuberemote.go @@ -250,7 +250,7 @@ func (r *KubeRemote) waitForPod(wait time.Duration, condition watchtools.Conditi return nil, err } -// portFoward runs the port forwarding so SSH rsync can be ran into the pod. +// portForward runs the port forwarding so SSH rsync can be ran into the pod. func (r *KubeRemote) portForward(ports []string, stopChannel, readyChannel chan struct{}, stdout, stderr io.Writer) (*portforward.PortForwarder, error) { roundTripper, upgrader, err := spdy.RoundTripperFor(r.cfg) if err != nil { diff --git a/filebeat/input/filestream/environment_test.go b/filebeat/input/filestream/environment_test.go index 7c3c8ccd4d3..88163258938 100644 --- a/filebeat/input/filestream/environment_test.go +++ b/filebeat/input/filestream/environment_test.go @@ -448,7 +448,7 @@ func (e *inputTestingEnvironment) waitUntilHarvesterIsDone() { } } -// requireEventReceived requires that the list of messages has made it into the output. +// requireEventsReceived requires that the list of messages has made it into the output. 
func (e *inputTestingEnvironment) requireEventsReceived(events []string) { foundEvents := make([]bool, len(events)) checkedEventCount := 0 From cfffc1ce552565fa1ae7c8d13abd04fca3d4fa7e Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 30 Apr 2024 10:27:44 +0200 Subject: [PATCH 11/30] DRA: Disable summary reports for dry runs (#39240) --- .buildkite/scripts/dra.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/dra.sh b/.buildkite/scripts/dra.sh index ec9d523bf3c..4b6a94ffa2c 100755 --- a/.buildkite/scripts/dra.sh +++ b/.buildkite/scripts/dra.sh @@ -70,11 +70,13 @@ docker run --rm \ --artifact-set "main" \ ${DRY_RUN} | tee rm-output.txt -# extract the summary URL from a release manager output line like: -# Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html -SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}') -rm rm-output.txt +if [[ "$DRY_RUN" != "--dry-run" ]]; then + # extract the summary URL from a release manager output line like: + # Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html + SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}') + rm rm-output.txt -# and make it easily clickable as a Builkite annotation -printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success + # and make it easily clickable as a Builkite annotation + printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success +fi From d275f2768c0be2b8c2d53c9649a3f263d8b18d64 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 11:36:42 +0300 Subject: [PATCH 12/30] Ensure ordered DRA artifacts (#39270) As things are now we allow parallel builds on the packaging pipeline, which could result in out of order artifacts (depending on which one takes longer to finish). This commit implements two "queues" (snapshot/staging) to ensure ordered builds of DRA artifacts. 
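For illustration, the gate pattern used here amounts to a pair of lightweight steps that share a `concurrency_group` with `concurrency: 1`, with the packaging work fenced between them. A rough sketch (labels and keys are illustrative, not the exact steps added by this commit):

```yaml
steps:
  # Entering the gate: only one build per concurrency group may proceed at a time.
  - label: "Start of concurrency gate dra-snapshot"
    command: echo "--> start of gate"
    concurrency_group: "dra-gate-snapshot"
    concurrency: 1
    key: start-gate-snapshot

  - wait

  # ... packaging and DRA publish steps run here, one build at a time per group ...

  # Leaving the gate releases the slot to the next queued build.
  - label: "End of concurrency gate dra-snapshot"
    command: echo "end of gate <--"
    concurrency_group: "dra-gate-snapshot"
    concurrency: 1
```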
Relates https://github.com/elastic/ingest-dev/issues/3095 --- .buildkite/packaging.pipeline.yml | 41 ++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index a7fdabb2268..1dd2aaf60b0 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -12,11 +12,32 @@ env: PLATFORMS_ARM: "linux/arm64" steps: + # we use concurrency gates (https://buildkite.com/blog/concurrency-gates) + # to implement two FIFO queues for DRA-snapshot and DRA-staging + # this prevents parallel builds and possibility of publishing out of order DRA artifacts if the first job takes longer than the second + + - name: Start of concurrency group for DRA Snapshot + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + command: echo "--> Start of concurrency gate dra-snapshot" + concurrency_group: "dra-gate-snapshot" + concurrency: 1 + key: start-gate-snapshot + + - name: Start of concurrency group for DRA Staging + if: build.branch =~ /^\d+\.\d+$$/ + command: echo "--> Start of concurrency gate dra-staging" + concurrency_group: "dra-gate-staging" + concurrency: 1 + key: start-gate-staging + + - wait + - group: Beats dashboards key: dashboards steps: - label: Snapshot dashboards if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + depends_on: start-gate-snapshot key: dashboards-snapshot # TODO: container with go and make agents: @@ -34,6 +55,7 @@ steps: - label: Staging dashboards if: build.branch =~ /^\d+\.\d+$$/ + depends_on: start-gate-staging key: dashboards-staging # TODO: container with go and make agents: @@ -52,6 +74,7 @@ steps: - group: Packaging snapshot if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" key: packaging-snapshot + depends_on: start-gate-snapshot steps: - label: "SNAPSHOT: {{matrix}}" env: @@ -123,8 +146,8 @@ steps: - build/distributions/**/* - group: Packaging Staging - key: packaging-staging + depends_on: start-gate-staging ## Only for release if: build.branch =~ /^\d+\.\d+$$/ steps: @@ -207,6 +230,7 @@ steps: env: DRA_WORKFLOW: snapshot depends_on: + - start-gate-snapshot - packaging-snapshot - dashboards-snapshot command: | @@ -225,6 +249,7 @@ steps: env: DRA_WORKFLOW: staging depends_on: + - start-gate-staging - packaging-staging - dashboards-staging command: | @@ -235,3 +260,17 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + + - wait + + - command: echo "End of concurrency gate dra-snapshot <--" + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + concurrency_group: "dra-gate-snapshot" + concurrency: 1 + key: end-gate-snapshot + + - command: echo "End of concurrency gate dra-staging <--" + if: build.branch =~ /^\d+\.\d+$$/ + concurrency_group: "dra-gate-staging" + concurrency: 1 + key: end-gate-staging From b7e3fa27ce564e6d0b8ae9b12c17fcb860ef4cc8 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 12:11:23 +0300 Subject: [PATCH 13/30] Fix auto triggered packaging builds (#39291) PR#39263 introduced a bug causing on packaging DRA builds to be triggered. 
This commit fixes the issue and also allowed triggered builds for `8.14` --- catalog-info.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index 0e5a5d864d3..59a89559a79 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1040,16 +1040,19 @@ spec: spec: repository: elastic/beats pipeline_file: ".buildkite/packaging.pipeline.yml" - branch_configuration: "main" + branch_configuration: "main 8.14" # TODO enable after packaging backports for release branches # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false provider_settings: - build_branches: false + build_branches: true build_pull_request_forks: false build_pull_requests: false build_tags: false + filter_condition: >- + build.branch =~ /^[0-9]+\.[0-9]+$$/ || build.branch == "main" + filter_enabled: true trigger_mode: code env: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' From d3eaed50b3156664a701f36eda74f2f5e1b3f3a4 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 12:19:13 +0300 Subject: [PATCH 14/30] Add timeout for DRA builds (#39293) --- catalog-info.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/catalog-info.yaml b/catalog-info.yaml index 59a89559a79..420d9c1c16a 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1045,6 +1045,7 @@ spec: # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false + maximum_timeout_in_minutes: 60 provider_settings: build_branches: true build_pull_request_forks: false From 85c9d146ebc454fc18819aed430334ce1b78f7ce Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 14:41:35 +0300 Subject: [PATCH 15/30] Fix missing docker staging DRA artifacts (#39297) The DRA staging release is failing because the Buildkite step isn't capturing the right artifacts. This commit fixes the issue by adjusting the artifact_paths to match the other steps. --- .buildkite/packaging.pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index 1dd2aaf60b0..36cbed29fcf 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -192,7 +192,7 @@ steps: imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" artifact_paths: - - build/distributions/** + - build/distributions/**/* matrix: - auditbeat - filebeat From 40c68cf2e16d2c4fe9db903822430736b73d88b8 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 14:53:42 +0300 Subject: [PATCH 16/30] Branch specific concurrency gates (#39298) PR #39293 introduced one concurrency queue per staging/snapshot but this slows down unnecessarily concurrent DRA builds for main and other release branches. This commit makes the concurrency gates (additionally) specific per branch. 
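Concretely, the gate's `concurrency_group` gains a branch suffix, so builds on `main` queue behind `dra-gate-snapshot-main` while, say, `8.14` builds queue independently behind `dra-gate-snapshot-8.14`. A minimal sketch of the relevant step attributes:

```yaml
concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH"  # expands per branch, e.g. dra-gate-snapshot-main
concurrency: 1
```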
--- .buildkite/packaging.pipeline.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index 36cbed29fcf..c01428100ec 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -19,14 +19,14 @@ steps: - name: Start of concurrency group for DRA Snapshot if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" command: echo "--> Start of concurrency gate dra-snapshot" - concurrency_group: "dra-gate-snapshot" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" concurrency: 1 key: start-gate-snapshot - name: Start of concurrency group for DRA Staging if: build.branch =~ /^\d+\.\d+$$/ command: echo "--> Start of concurrency gate dra-staging" - concurrency_group: "dra-gate-staging" + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" concurrency: 1 key: start-gate-staging @@ -265,12 +265,12 @@ steps: - command: echo "End of concurrency gate dra-snapshot <--" if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" - concurrency_group: "dra-gate-snapshot" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" concurrency: 1 key: end-gate-snapshot - command: echo "End of concurrency gate dra-staging <--" if: build.branch =~ /^\d+\.\d+$$/ - concurrency_group: "dra-gate-staging" + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" concurrency: 1 key: end-gate-staging From f6bad74ef6edec0a3d0221d875651aaed75f95d6 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Tue, 30 Apr 2024 14:58:10 +0300 Subject: [PATCH 17/30] Fix the annotating for snapshot and staging (#39299) * Fix the annotating for snapshot and staging This commit adds the `--append` flag to the buildkite-annotate so that when it is called by the snapshot and staging steps, it will not overwrite the other annotation. Signed-off-by: Alexandros Sapranidis --- .buildkite/scripts/dra.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/dra.sh b/.buildkite/scripts/dra.sh index 4b6a94ffa2c..5ce6e5884b9 100755 --- a/.buildkite/scripts/dra.sh +++ b/.buildkite/scripts/dra.sh @@ -78,5 +78,5 @@ if [[ "$DRY_RUN" != "--dry-run" ]]; then rm rm-output.txt # and make it easily clickable as a Builkite annotation - printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success + printf "**${DRA_WORKFLOW} summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success --append fi From 2fa1123b8f1fb2eef6a96b23ccb7d460cbb6163b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 12:39:19 -0400 Subject: [PATCH 18/30] Cleanup: organizing code in awss3/input.go (#38958) Cleanups in `x-pack/filebeat/input/awss3/input.go`. - Split up the two main configuration cases, SQS queues versus bare S3 buckets, into two explicit helper functions (`s3Input.runQueueReader` and `s3Input.runS3Poller`) instead of handling them inline in `s3Input.Run`. - Simplify region-detection logic in `getRegionFromQueueURL` (`regionMismatchError` is no longer needed) - Rename `createS3Lister` to `createS3Poller` (since it creates an `s3Poller`) This is only a cleanup / reorganization, it does not change any behavior. 
--- x-pack/filebeat/input/awss3/input.go | 155 +++++++++++----------- x-pack/filebeat/input/awss3/input_test.go | 4 +- 2 files changed, 76 insertions(+), 83 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index bb4a5c15bda..51e8c9808ed 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -102,72 +102,85 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { ctx := v2.GoContextFromCanceler(inputContext.Cancelation) if in.config.QueueURL != "" { - regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint, in.config.RegionName) - if err != nil && in.config.RegionName == "" { - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } - var warn regionMismatchError - if errors.As(err, &warn) { - // Warn of mismatch, but go ahead with configured region name. - inputContext.Logger.Warnf("%v: using %q", err, regionName) - } - in.awsConfig.Region = regionName + return in.runQueueReader(ctx, inputContext, pipeline) + } - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() + if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { + return in.runS3Poller(ctx, inputContext, pipeline) + } - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) + return nil +} - if err := receiver.Receive(ctx); err != nil { - return err - } +func (in *s3Input) runQueueReader( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) } - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() + in.awsConfig.Region = urlRegion - // Connect to the registry and create our states lookup - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - defer persistentStore.Close() + // Create SQS receiver and S3 notification processor. 
+ receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() - states, err := newStates(inputContext, persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Lister(inputContext, ctx, client, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() + return receiver.Receive(ctx) +} - if err := poller.Poll(ctx); err != nil { - return err - } +func (in *s3Input) runS3Poller( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) } + defer client.Close() - return nil + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + + // Create S3 receiver and S3 notification processor. 
+ poller, err := in.createS3Poller(inputContext, ctx, client, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) + } + defer poller.metrics.Close() + + return poller.Poll(ctx) } func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { @@ -212,8 +225,11 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s return nil, err } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) return sqsReader, nil @@ -227,7 +243,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -310,7 +326,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (region string, err error) { +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue @@ -323,11 +339,7 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg // check for sqs queue url if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - region = queueHostSplit[1] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplit[1], nil } } @@ -335,30 +347,13 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - region = queueHostSplitVPC[2] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplitVPC[2], nil } } - - if defaultRegion != "" { - return defaultRegion, nil - } } return "", errBadQueueURL } -type 
regionMismatchError struct { - queueURLRegion string - defaultRegion string -} - -func (e regionMismatchError) Error() string { - return fmt.Sprintf("configured region disagrees with queue_url region: %q != %q", e.queueURLRegion, e.defaultRegion) -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index abc9f5c9a6a..0a3053f7f1b 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -54,7 +54,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { name string queueURL string endpoint string - deflt string want string wantErr error }{ @@ -77,7 +76,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { { name: "vpce_endpoint", queueURL: "https://vpce-test.sqs.us-east-2.vpce.amazonaws.com/12345678912/sqs-queue", - deflt: "", want: "us-east-2", }, { @@ -90,7 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint, test.deflt) + got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) if !sameError(err, test.wantErr) { t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) } From e2c652c2d38fa2a7d4b130ce2860cb0c0c98b87d Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 20:10:34 +0300 Subject: [PATCH 19/30] Disable 8.14 DRA on Jenkins (#39322) This commit is complementing PR #39321 and is needed to disable the execution of 8.14 DRA packaging on Jenkins. --- .ci/jobs/packaging.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/jobs/packaging.yml b/.ci/jobs/packaging.yml index 6d4b136a557..50cec32edd8 100644 --- a/.ci/jobs/packaging.yml +++ b/.ci/jobs/packaging.yml @@ -14,7 +14,7 @@ discover-pr-forks-trust: 'permission' discover-pr-origin: 'merge-current' discover-tags: true - head-filter-regex: '(7\.1[6789]|8\.\d+|PR-.*|v\d+\.\d+\.\d+)' + head-filter-regex: '(7\.1[6789]|8\.13|PR-.*|v8\.13\.\d+)' disable-pr-notifications: true notification-context: 'beats-packaging' repo: 'beats' @@ -28,11 +28,11 @@ ignore-tags-older-than: -1 ignore-tags-newer-than: 30 - named-branches: - - regex-name: - regex: '7\.1[6789]' + - exact-name: + name: '8.13' case-sensitive: true - regex-name: - regex: '8\.\d+' + regex: '7\.1[6789]' case-sensitive: true - change-request: ignore-target-only-changes: true From 8c48989a8498f29e8e4dfdcf9d8f6f8bba6fc285 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 30 Apr 2024 13:12:30 -0400 Subject: [PATCH 20/30] Include metricbeat modules directory into agentbeat build. (#39278) --- metricbeat/scripts/mage/package.go | 14 +++++++------- x-pack/agentbeat/magefile.go | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/metricbeat/scripts/mage/package.go b/metricbeat/scripts/mage/package.go index e206881dd3c..43e12652f4a 100644 --- a/metricbeat/scripts/mage/package.go +++ b/metricbeat/scripts/mage/package.go @@ -40,7 +40,7 @@ const ( // not supported. You must declare a dependency on either // PrepareModulePackagingOSS or PrepareModulePackagingXPack. 
func CustomizePackaging() { - mg.Deps(customizeLightModulesPackaging) + mg.Deps(CustomizeLightModulesPackaging) var ( modulesDTarget = "modules.d" @@ -104,7 +104,7 @@ func CustomizePackaging() { // PrepareModulePackagingOSS generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingOSS() error { - err := prepareLightModulesPackaging("module") + err := PrepareLightModulesPackaging("module") if err != nil { return err } @@ -116,7 +116,7 @@ func PrepareModulePackagingOSS() error { // PrepareModulePackagingXPack generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingXPack() error { - err := prepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) + err := PrepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) if err != nil { return err } @@ -201,8 +201,8 @@ func GenerateDirModulesD() error { return nil } -// customizeLightModulesPackaging customizes packaging to add light modules -func customizeLightModulesPackaging() error { +// CustomizeLightModulesPackaging customizes packaging to add light modules +func CustomizeLightModulesPackaging() error { var ( moduleTarget = "module" module = devtools.PackageFile{ @@ -225,8 +225,8 @@ func customizeLightModulesPackaging() error { return nil } -// prepareLightModulesPackaging generates light modules -func prepareLightModulesPackaging(paths ...string) error { +// PrepareLightModulesPackaging generates light modules +func PrepareLightModulesPackaging(paths ...string) error { err := devtools.Clean([]string{dirModulesGenerated}) if err != nil { return err diff --git a/x-pack/agentbeat/magefile.go b/x-pack/agentbeat/magefile.go index 874c79bf7a3..c7e6c561830 100644 --- a/x-pack/agentbeat/magefile.go +++ b/x-pack/agentbeat/magefile.go @@ -20,6 +20,7 @@ import ( devtools "github.com/elastic/beats/v7/dev-tools/mage" "github.com/elastic/beats/v7/dev-tools/mage/target/build" + metricbeat "github.com/elastic/beats/v7/metricbeat/scripts/mage" packetbeat "github.com/elastic/beats/v7/packetbeat/scripts/mage" osquerybeat "github.com/elastic/beats/v7/x-pack/osquerybeat/scripts/mage" @@ -112,11 +113,19 @@ func CrossBuildDeps() error { return callForBeat("crossBuildExt", "osquerybeat") } +// PrepareLightModules prepares the module packaging. +func PrepareLightModules() error { + return metricbeat.PrepareLightModulesPackaging( + filepath.Join("..", "metricbeat", "module"), // x-pack/metricbeat + filepath.Join("..", "..", "metricbeat", "module"), // metricbeat (oss) + ) +} + // Package packages the Beat for distribution. // Use SNAPSHOT=true to build snapshots. // Use PLATFORMS to control the target platforms. // Use VERSION_QUALIFIER to control the version qualifier. -func Package() { +func Package() error { start := time.Now() defer func() { fmt.Println("package ran for", time.Since(start)) }() @@ -126,7 +135,14 @@ func Package() { // Add osquery distro binaries, required for the osquerybeat subcommand. osquerybeat.CustomizePackaging() - mg.SerialDeps(Update, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + // Add metricbeat lightweight modules. + if err := metricbeat.CustomizeLightModulesPackaging(); err != nil { + return err + } + + mg.SerialDeps(Update, PrepareLightModules, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + + return nil } // TestPackages tests the generated packages (i.e. file modes, owners, groups). 
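
[Editor's note] The patch above exports `CustomizeLightModulesPackaging` and `PrepareLightModulesPackaging` from metricbeat's mage scripts so that other Beats' magefiles can stage light modules during packaging, as agentbeat now does. The sketch below is a hedged illustration only, not part of the change: it shows how another hypothetical x-pack Beat's magefile might reuse the two exported helpers. The relative module paths and the simplified Package target are assumptions for the example; a real magefile would also wire in the usual Update/CrossBuild/Package dependencies.

//go:build mage

package main

import (
	"path/filepath"

	"github.com/magefile/mage/mg"

	// Exported helpers introduced by the patch above.
	metricbeat "github.com/elastic/beats/v7/metricbeat/scripts/mage"
)

// PrepareLightModules stages metricbeat light modules into the package
// staging directory. The relative paths are hypothetical and depend on
// where the Beat lives in the repository.
func PrepareLightModules() error {
	return metricbeat.PrepareLightModulesPackaging(
		filepath.Join("..", "metricbeat", "module"),       // x-pack modules (assumed location)
		filepath.Join("..", "..", "metricbeat", "module"), // OSS modules (assumed location)
	)
}

// Package registers the light modules with the packaging spec and then
// runs the staging step, mirroring the agentbeat change above.
func Package() error {
	if err := metricbeat.CustomizeLightModulesPackaging(); err != nil {
		return err
	}
	mg.SerialDeps(PrepareLightModules)
	return nil
}

Exporting the previously unexported helpers (note the capitalization change in the patch) is what makes this cross-Beat reuse possible without duplicating the module-staging logic.
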
From 562e48efea1a93eca9087ca03781dfba60cca883 Mon Sep 17 00:00:00 2001 From: Alex K <8418476+fearful-symmetry@users.noreply.github.com> Date: Tue, 30 Apr 2024 12:10:18 -0700 Subject: [PATCH 21/30] Add queue percentage to libbeat metrics (#39205) * add queue full percentage metric * newline * add div by zero check * change name * linter * fix gauge settings * linter... * change name * set percentage when we set queue max * change name * round numbers --- libbeat/monitoring/report/log/log.go | 66 ++++++++++++------------ libbeat/publisher/pipeline/monitoring.go | 29 +++++++++-- 2 files changed, 59 insertions(+), 36 deletions(-) diff --git a/libbeat/monitoring/report/log/log.go b/libbeat/monitoring/report/log/log.go index 886e207593a..e11e8228cf7 100644 --- a/libbeat/monitoring/report/log/log.go +++ b/libbeat/monitoring/report/log/log.go @@ -37,34 +37,36 @@ import ( // TODO: Replace this with a proper solution that uses the metric type from // where it is defined. See: https://github.com/elastic/beats/issues/5433 var gauges = map[string]bool{ - "libbeat.output.events.active": true, - "libbeat.pipeline.events.active": true, - "libbeat.pipeline.clients": true, - "libbeat.config.module.running": true, - "registrar.states.current": true, - "filebeat.events.active": true, - "filebeat.harvester.running": true, - "filebeat.harvester.open_files": true, - "beat.memstats.memory_total": true, - "beat.memstats.memory_alloc": true, - "beat.memstats.rss": true, - "beat.memstats.gc_next": true, - "beat.info.uptime.ms": true, - "beat.cgroup.memory.mem.usage.bytes": true, - "beat.cpu.user.ticks": true, - "beat.cpu.system.ticks": true, - "beat.cpu.total.value": true, - "beat.cpu.total.ticks": true, - "beat.handles.open": true, - "beat.handles.limit.hard": true, - "beat.handles.limit.soft": true, - "beat.runtime.goroutines": true, - "system.load.1": true, - "system.load.5": true, - "system.load.15": true, - "system.load.norm.1": true, - "system.load.norm.5": true, - "system.load.norm.15": true, + "libbeat.output.events.active": true, + "libbeat.pipeline.events.active": true, + "libbeat.pipeline.clients": true, + "libbeat.pipeline.queue.max_events": true, + "libbeat.pipeline.queue.filled.pct.events": true, + "libbeat.config.module.running": true, + "registrar.states.current": true, + "filebeat.events.active": true, + "filebeat.harvester.running": true, + "filebeat.harvester.open_files": true, + "beat.memstats.memory_total": true, + "beat.memstats.memory_alloc": true, + "beat.memstats.rss": true, + "beat.memstats.gc_next": true, + "beat.info.uptime.ms": true, + "beat.cgroup.memory.mem.usage.bytes": true, + "beat.cpu.user.ticks": true, + "beat.cpu.system.ticks": true, + "beat.cpu.total.value": true, + "beat.cpu.total.ticks": true, + "beat.handles.open": true, + "beat.handles.limit.hard": true, + "beat.handles.limit.soft": true, + "beat.runtime.goroutines": true, + "system.load.1": true, + "system.load.5": true, + "system.load.15": true, + "system.load.norm.1": true, + "system.load.norm.5": true, + "system.load.norm.15": true, } // isGauge returns true when the given metric key name represents a gauge value. @@ -249,16 +251,16 @@ func toKeyValuePairs(snaps map[string]monitoring.FlatSnapshot) []interface{} { for name, snap := range snaps { data := make(mapstr.M, snapshotLen(snap)) for k, v := range snap.Bools { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. 
+ data.Put(k, v) } for k, v := range snap.Floats { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Ints { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Strings { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } if len(data) > 0 { args = append(args, logp.Reflect(name, data)) diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index 69a21c2c71c..cda329e0963 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -17,7 +17,11 @@ package pipeline -import "github.com/elastic/elastic-agent-libs/monitoring" +import ( + "math" + + "github.com/elastic/elastic-agent-libs/monitoring" +) type observer interface { pipelineObserver @@ -67,8 +71,9 @@ type metricsObserverVars struct { activeEvents *monitoring.Uint // queue metrics - queueACKed *monitoring.Uint - queueMaxEvents *monitoring.Uint + queueACKed *monitoring.Uint + queueMaxEvents *monitoring.Uint + percentQueueFull *monitoring.Float } func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { @@ -92,7 +97,8 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { queueACKed: monitoring.NewUint(reg, "queue.acked"), queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), - activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge + activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge + percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"), }, } } @@ -121,12 +127,24 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() } func (o *metricsObserver) newEvent() { o.vars.events.Inc() o.vars.activeEvents.Inc() + o.setPercentageFull() +} + +// setPercentageFull is used interally to set the `queue.full` metric +func (o *metricsObserver) setPercentageFull() { + maxEvt := o.vars.queueMaxEvents.Get() + if maxEvt != 0 { + pct := float64(o.vars.activeEvents.Get()) / float64(maxEvt) + pctRound := math.Round(pct/0.0005) * 0.0005 + o.vars.percentQueueFull.Set(pctRound) + } } // (client) event is filtered out (on purpose or failed) func (o *metricsObserver) filteredEvent() { o.vars.filtered.Inc() o.vars.activeEvents.Dec() + o.setPercentageFull() } // (client) managed to push an event into the publisher pipeline @@ -138,6 +156,7 @@ func (o *metricsObserver) publishedEvent() { func (o *metricsObserver) failedPublishEvent() { o.vars.failed.Inc() o.vars.activeEvents.Dec() + o.setPercentageFull() } // @@ -148,11 +167,13 @@ func (o *metricsObserver) failedPublishEvent() { func (o *metricsObserver) queueACKed(n int) { o.vars.queueACKed.Add(uint64(n)) o.vars.activeEvents.Sub(uint64(n)) + o.setPercentageFull() } // (queue) maximum queue event capacity func (o *metricsObserver) queueMaxEvents(n int) { o.vars.queueMaxEvents.Set(uint64(n)) + o.setPercentageFull() } // From 5c684a81beb0f24f30339e442306c41fccb74f58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:37:03 -0400 Subject: [PATCH 22/30] build(deps): bump github.com/elastic/elastic-agent-autodiscover from 0.6.8 to 0.6.14 (#39178) * build(deps): bump github.com/elastic/elastic-agent-autodiscover Bumps 
[github.com/elastic/elastic-agent-autodiscover](https://github.com/elastic/elastic-agent-autodiscover) from 0.6.8 to 0.6.14. - [Release notes](https://github.com/elastic/elastic-agent-autodiscover/releases) - [Changelog](https://github.com/elastic/elastic-agent-autodiscover/blob/main/CHANGELOG.md) - [Commits](https://github.com/elastic/elastic-agent-autodiscover/compare/v0.6.8...v0.6.14) --- updated-dependencies: - dependency-name: github.com/elastic/elastic-agent-autodiscover dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] * Update NOTICE.txt --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] --- NOTICE.txt | 8 ++++---- go.mod | 4 ++-- go.sum | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index f060baf4098..951b7e7785c 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -12525,11 +12525,11 @@ various licenses: -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-autodiscover -Version: v0.6.13 +Version: v0.6.14 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.13/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.14/LICENSE: Apache License Version 2.0, January 2004 @@ -25433,11 +25433,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- Dependency : golang.org/x/net -Version: v0.21.0 +Version: v0.23.0 Licence type (autodetected): BSD-3-Clause -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.23.0/LICENSE: Copyright (c) 2009 The Go Authors. All rights reserved. 
diff --git a/go.mod b/go.mod index 0805e9200c8..ad13afabd8d 100644 --- a/go.mod +++ b/go.mod @@ -154,7 +154,7 @@ require ( golang.org/x/crypto v0.21.0 golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 golang.org/x/mod v0.14.0 - golang.org/x/net v0.21.0 + golang.org/x/net v0.23.0 golang.org/x/oauth2 v0.10.0 golang.org/x/sync v0.5.0 golang.org/x/sys v0.18.0 @@ -203,7 +203,7 @@ require ( github.com/awslabs/kinesis-aggregation/go/v2 v2.0.0-20220623125934-28468a6701b5 github.com/elastic/bayeux v1.0.5 github.com/elastic/ebpfevents v0.6.0 - github.com/elastic/elastic-agent-autodiscover v0.6.13 + github.com/elastic/elastic-agent-autodiscover v0.6.14 github.com/elastic/elastic-agent-libs v0.7.5 github.com/elastic/elastic-agent-shipper-client v0.5.1-0.20230228231646-f04347b666f3 github.com/elastic/elastic-agent-system-metrics v0.9.2 diff --git a/go.sum b/go.sum index 57711b7a9fe..5c45bdee748 100644 --- a/go.sum +++ b/go.sum @@ -551,8 +551,8 @@ github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3 h1:lnDkqiRFKm0rxdljqr github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3/go.mod h1:aPqzac6AYkipvp4hufTyMj5PDIphF3+At8zr7r51xjY= github.com/elastic/ebpfevents v0.6.0 h1:BrL3m7JFK7U6h2jkbk3xAWWs//IZnugCHEDds5u2v68= github.com/elastic/ebpfevents v0.6.0/go.mod h1:ESG9gw7N+n5yCCMgdg1IIJENKWSmX7+X0Fi9GUs9nvU= -github.com/elastic/elastic-agent-autodiscover v0.6.13 h1:zBeTxV+o2efEKntY+o6iMMNJ1AVjDXUqY3o6uzIkKaw= -github.com/elastic/elastic-agent-autodiscover v0.6.13/go.mod h1:7P6YVKxuBT0qE/VxuA87obwZUAEU0O44mCN3r4/6x8w= +github.com/elastic/elastic-agent-autodiscover v0.6.14 h1:0zJYNyv9GKTOiNqCHqEVboP+WioV73ia17Et+UlFbz8= +github.com/elastic/elastic-agent-autodiscover v0.6.14/go.mod h1:39/fHHlnyTK6oUNZfAhxJwBTVahO9tNasEIjzsxGMu8= github.com/elastic/elastic-agent-client/v7 v7.8.1 h1:J9wZc/0mUvSEok0X5iR5+n60Jgb+AWooKddb3XgPWqM= github.com/elastic/elastic-agent-client/v7 v7.8.1/go.mod h1:axl1nkdqc84YRFkeJGD9jExKNPUrOrzf3DFo2m653nY= github.com/elastic/elastic-agent-libs v0.7.5 h1:4UMqB3BREvhwecYTs/L23oQp1hs/XUkcunPlmTZn5yg= @@ -1960,8 +1960,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190130055435-99b60b757ec1/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= From 33ba5f0d9a4493b78d337a22d53d32db390aaeea Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 17:07:51 -0400 Subject: [PATCH 23/30] add change log for S3 fix (#39320) --- CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index f57b7100077..58ce7ac0f65 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -142,6 +142,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Updated Websocket 
input title to align with existing inputs {pull}39006[39006] - Restore netflow input on Windows {pull}39024[39024] - Upgrade azure-event-hubs-go and azure-storage-blob-go dependencies. {pull}38861[38861] +- Fix concurrency/error handling bugs in the AWS S3 input that could drop data and prevent ingestion of large buckets. {pull}39131[39131] *Heartbeat* From 50e173aebb6ef064adb3f1a97dcef52b998af55d Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Tue, 30 Apr 2024 19:04:19 -0400 Subject: [PATCH 24/30] chore: Update snapshot.yml (#39319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index b531cf78a51..bf5fdbc9545 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-bb66fa2a-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-bb66fa2a-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-bb66fa2a-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 246a8bc019676043f19510d57d86853fb5ebbe57 Mon Sep 17 00:00:00 2001 From: Andrew Kroh Date: Wed, 1 May 2024 08:43:14 -0400 Subject: [PATCH 25/30] .github/dependabot.yml - add elastic/ebpfevents (#38695) Add github.com/elastic/ebpfevents to the list of dependencies that are watched. Co-authored-by: Dimitrios Liappis --- .github/dependabot.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7fcaca8ac9e..bbd4255fd87 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -34,6 +34,7 @@ updates: - dependency-name: github.com/elastic/go-perf - dependency-name: github.com/elastic/go-seccomp-bpf - dependency-name: github.com/elastic/toutoumomoma + - dependency-name: github.com/elastic/ebpfevents ignore: # Skip github.com/elastic/mito because it requires documentation updates. - dependency-name: github.com/elastic/mito From 5011cccdc64c8ecd7ebca6dd10574af4a525fa98 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Wed, 1 May 2024 17:38:35 +0300 Subject: [PATCH 26/30] Allow everyone in Elastic to build Beats (#39335) This commit allows everyone under the Elastic org to be able to trigger builds in Buildkite. 
Signed-off-by: Alexandros Sapranidis --- catalog-info.yaml | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index 420d9c1c16a..f81d3a6df1a 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -61,7 +61,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -108,7 +108,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -155,7 +155,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -202,7 +202,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -249,7 +249,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -296,7 +296,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -343,7 +343,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -390,7 +390,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -436,7 +436,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -483,7 +483,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -530,7 +530,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -577,7 +577,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: 
$schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -624,7 +624,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -671,7 +671,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -706,7 +706,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json apiVersion: backstage.io/v1alpha1 @@ -788,7 +788,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -835,7 +835,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -882,7 +882,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -929,7 +929,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -976,7 +976,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1015,7 +1015,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1065,7 +1065,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1101,7 +1101,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1147,4 +1147,4 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ From 726f6e9bdec715f958ba47500e77feb5655b0a48 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Wed, 1 May 2024 18:04:21 +0300 Subject: [PATCH 27/30] More resilient DRA packaging (#39332) Occasionally packaging 
steps from the DRA pipeline may get stuck[^1]. This causes a breach of the global pipeline timeout (currently 1hr) and cancels the job. This commit increases the global timeout to 90min, adds one retry per step and limits the runtime per step to 40min (so that a single stuck step doesn't exhaust the entire global timeout). Finally, we shush slack notifications if the retry recovered the step. In a future PR we will consider also adding a daily DRA build to cover for cases where the retries didn't help and there were no subsequent commits to trigger a new build. [^1]: https://buildkite.com/elastic/beats-packaging-pipeline/builds/114 --- .buildkite/packaging.pipeline.yml | 32 +++++++++++++++++++++++++++++++ catalog-info.yaml | 3 ++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index c01428100ec..5fd559f458d 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -44,6 +44,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -62,6 +66,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -86,6 +94,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -116,6 +128,10 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -142,6 +158,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* @@ -161,6 +181,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -191,6 +215,10 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -217,6 +245,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* diff --git a/catalog-info.yaml b/catalog-info.yaml index f81d3a6df1a..34d9e397ca3 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1045,7 +1045,7 @@ spec: # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false - maximum_timeout_in_minutes: 60 + maximum_timeout_in_minutes: 90 provider_settings: build_branches: true build_pull_request_forks: false @@ -1059,6 +1059,7 @@ spec: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' + SLACK_NOTIFICATIONS_SKIP_FOR_RETRIES: 'true' teams: ingest-fp: access_level: MANAGE_BUILD_AND_READ From 
5a0293ec9f6222e8fbaddc49f5f56e32d1c09096 Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Wed, 1 May 2024 16:01:57 -0400 Subject: [PATCH 28/30] chore: Update snapshot.yml (#39342) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index bf5fdbc9545..30002f9a255 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-aa640648-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-aa640648-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-aa640648-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 02ea29d8cf4078ce30c0acb507ffada149101a9b Mon Sep 17 00:00:00 2001 From: Olga Naydyonock Date: Wed, 1 May 2024 23:04:45 +0300 Subject: [PATCH 29/30] Enabling retries for Beats flaky tests (#39174) * added retries for auditbeat flaky tests * added retries for filebeat flaky tests * added retries * test exitcode * checged exit status code for retries * set larger timeout for pytestOpts * restored timeout for pytests --- .buildkite/auditbeat/auditbeat-pipeline.yml | 33 ++++++++++++++++ .buildkite/filebeat/filebeat-pipeline.yml | 33 ++++++++++++++++ .buildkite/heartbeat/heartbeat-pipeline.yml | 36 +++++++++++++++++ .buildkite/libbeat/pipeline.libbeat.yml | 18 +++++++++ .buildkite/metricbeat/pipeline.yml | 33 ++++++++++++++++ .buildkite/packetbeat/pipeline.packetbeat.yml | 30 ++++++++++++++ .buildkite/winlogbeat/pipeline.winlogbeat.yml | 18 +++++++++ .../x-pack/pipeline.xpack.auditbeat.yml | 30 ++++++++++++++ .../x-pack/pipeline.xpack.dockerlogbeat.yml | 6 +++ .buildkite/x-pack/pipeline.xpack.filebeat.yml | 33 ++++++++++++++++ .../x-pack/pipeline.xpack.heartbeat.yml | 29 +++++++++++++- .buildkite/x-pack/pipeline.xpack.libbeat.yml | 27 +++++++++++++ .../x-pack/pipeline.xpack.metricbeat.yml | 30 ++++++++++++++ .../x-pack/pipeline.xpack.osquerybeat.yml | 27 +++++++++++++ .../x-pack/pipeline.xpack.packetbeat.yml | 39 +++++++++++++++++++ .../x-pack/pipeline.xpack.winlogbeat.yml | 18 +++++++++ 16 files changed, 439 insertions(+), 1 deletion(-) diff --git a/.buildkite/auditbeat/auditbeat-pipeline.yml b/.buildkite/auditbeat/auditbeat-pipeline.yml index 801768c271e..ed19c7d9164 100644 --- a/.buildkite/auditbeat/auditbeat-pipeline.yml +++ b/.buildkite/auditbeat/auditbeat-pipeline.yml @@ -32,6 +32,9 @@ steps: command: | cd auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -47,6 +50,9 @@ steps: command: | cd auditbeat mage build 
unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -62,6 +68,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -79,6 +88,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -97,6 +109,9 @@ steps: make -C auditbeat crosscompile env: GOX_FLAGS: "-arch amd64" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -115,6 +130,9 @@ steps: set -euo pipefail cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -133,6 +151,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -147,6 +168,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -164,6 +188,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -180,6 +207,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -196,6 +226,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" diff --git a/.buildkite/filebeat/filebeat-pipeline.yml b/.buildkite/filebeat/filebeat-pipeline.yml index 7eedd9d76fb..053e8dbec41 100644 --- a/.buildkite/filebeat/filebeat-pipeline.yml +++ b/.buildkite/filebeat/filebeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd filebeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -60,6 +66,9 @@ steps: command: | cd filebeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +85,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -94,6 +106,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +133,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -136,6 +154,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -152,6 +173,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -172,6 +196,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -190,6 +217,9 @@ steps: command: | Set-Location -Path 
filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -208,6 +238,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/heartbeat/heartbeat-pipeline.yml b/.buildkite/heartbeat/heartbeat-pipeline.yml index 8091b2eead1..cadbcec1eca 100644 --- a/.buildkite/heartbeat/heartbeat-pipeline.yml +++ b/.buildkite/heartbeat/heartbeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -61,6 +67,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -78,6 +87,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -94,6 +106,9 @@ steps: command: | cd heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -109,6 +124,9 @@ steps: command: | cd heartbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -130,6 +148,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -151,6 +172,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -168,6 +192,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -205,6 +235,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -222,6 +255,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/libbeat/pipeline.libbeat.yml b/.buildkite/libbeat/pipeline.libbeat.yml index 040ad9b1d66..bc77712c330 100644 --- a/.buildkite/libbeat/pipeline.libbeat.yml +++ b/.buildkite/libbeat/pipeline.libbeat.yml @@ -21,6 +21,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -38,6 +41,9 @@ steps: set -euo pipefail cd libbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -55,6 +61,9 @@ steps: set -euo pipefail cd libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -72,6 +81,9 @@ steps: set -euo pipefail cd libbeat make crosscompile + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -89,6 +101,9 @@ steps: set -euo pipefail cd libbeat make 
STRESS_TEST_OPTIONS='-timeout=20m -race -v -parallel 1' GOTEST_OUTPUT_OPTIONS=' | go-junit-report > libbeat-stress-test.xml' stress-tests + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -108,6 +123,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/metricbeat/pipeline.yml b/.buildkite/metricbeat/pipeline.yml index 1fb6bfcc237..d15212d2ef3 100644 --- a/.buildkite/metricbeat/pipeline.yml +++ b/.buildkite/metricbeat/pipeline.yml @@ -32,6 +32,9 @@ steps: - label: ":linux: Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd metricbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +65,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -92,6 +98,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -106,6 +115,9 @@ steps: - label: ":negative_squared_cross_mark: Cross compile" key: "mandatory-cross-compile" command: "make -C metricbeat crosscompile" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -122,6 +134,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -140,6 +155,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -162,6 +180,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -180,6 +201,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -198,6 +222,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -221,6 +248,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -238,6 +268,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/packetbeat/pipeline.packetbeat.yml b/.buildkite/packetbeat/pipeline.packetbeat.yml index c0f5c1e1a73..d510107a89c 100644 --- a/.buildkite/packetbeat/pipeline.packetbeat.yml +++ b/.buildkite/packetbeat/pipeline.packetbeat.yml @@ -28,6 +28,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -43,6 +46,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -58,6 +64,9 @@ steps: 
command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -75,6 +84,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -96,6 +108,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -114,6 +129,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -132,6 +150,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -156,6 +177,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -174,6 +198,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: key: "linux-arm64-unit-tests-extended" command: "cd packetbeat && mage build unitTest" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/winlogbeat/pipeline.winlogbeat.yml b/.buildkite/winlogbeat/pipeline.winlogbeat.yml index c71858b45b0..ff332791349 100644 --- a/.buildkite/winlogbeat/pipeline.winlogbeat.yml +++ b/.buildkite/winlogbeat/pipeline.winlogbeat.yml @@ -24,6 +24,9 @@ steps: - label: ":ubuntu: Winlogbeat Crossccompile" key: "mandatory-cross-compile" command: "make -C winlogbeat crosscompile" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -40,6 +43,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -58,6 +64,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -76,6 +85,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -99,6 +111,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -117,6 +132,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" diff --git a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml index 36fcb9bebd9..80c298c725d 100644 --- a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml @@ -36,6 +36,9 @@ steps: echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/auditbeat mage update build test + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -52,6 
+55,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -68,6 +74,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -108,6 +120,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -167,6 +188,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -183,6 +207,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -201,6 +228,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml index 05aee81e4d8..a64f7851913 100644 --- a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml @@ -28,6 +28,9 @@ steps: - label: ":ubuntu: Xpack/Dockerlogbeat Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd x-pack/dockerlogbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -44,6 +47,9 @@ steps: command: "cd x-pack/dockerlogbeat && mage goIntegTest" env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.filebeat.yml b/.buildkite/x-pack/pipeline.xpack.filebeat.yml index 795302bc2d9..b7e71e3c3c0 100644 --- a/.buildkite/x-pack/pipeline.xpack.filebeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.filebeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/filebeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/filebeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: 
"x-pack-filebeat-mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -122,6 +137,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -142,6 +160,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -178,6 +202,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -200,6 +227,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -217,6 +247,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml index 107dfa65f1b..136706e698c 100644 --- a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml @@ -39,6 +39,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -59,6 +62,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +82,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +104,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +130,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -136,6 +151,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -153,6 +171,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build test + retry: + automatic: + - limit: 3 key: "extended-win-2019-unit-tests" agents: provider: "gcp" @@ -166,7 +187,7 @@ steps: notify: - github_commit_status: context: "x-pack/heartbeat: Windows 2019 Unit Tests" - + - group: "x-pack/heartbeat MacOS Extended Tests" key: "x-pack-heartbeat-extended-tests-macos" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*macOS.*/ @@ -179,6 +200,9 @@ steps: installNodeJsDependencies 
cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -197,6 +221,9 @@ steps: installNodeJsDependencies cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.libbeat.yml b/.buildkite/x-pack/pipeline.xpack.libbeat.yml index 14316a3ecd7..6bf456f6d83 100644 --- a/.buildkite/x-pack/pipeline.xpack.libbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.libbeat.yml @@ -26,6 +26,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -42,6 +45,9 @@ steps: command: | cd x-pack/libbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -58,6 +64,9 @@ steps: command: | cd x-pack/libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -74,6 +83,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -92,6 +104,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -132,6 +150,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -150,6 +171,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -172,6 +196,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml index 317b9069c55..4c1c31521f9 100644 --- a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/metricbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build 
unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -162,6 +183,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -185,6 +209,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -202,6 +229,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml index 8c9137cb423..c8ecac79735 100644 --- a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/osquerybeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -46,6 +49,9 @@ steps: command: | cd x-pack/osquerybeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +68,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -80,6 +89,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -102,6 +114,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -120,6 +135,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -138,6 +156,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -175,6 +199,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml index 77fdf2af848..1ab71c30d7d 100644 --- a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml +++ 
b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml @@ -29,6 +29,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -61,6 +67,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -77,6 +86,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +107,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -136,6 +154,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -154,6 +175,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -172,6 +196,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -191,6 +218,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -214,6 +244,9 @@ steps: cd x-pack/packetbeat mage build unitTest if: build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -236,6 +269,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -253,6 +289,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml index c07e537adf0..c6b5a6f59fe 100644 --- a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml @@ -29,6 +29,9 @@ steps: mage build unitTest env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -47,6 +50,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -65,6 +71,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -88,6 +97,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" 
image: "${IMAGE_WIN_10}" @@ -106,6 +118,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -124,6 +139,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" From ffcd1814666645a5d7a644911ecf6e2b7d8db3f5 Mon Sep 17 00:00:00 2001 From: Michael Wolf Date: Wed, 1 May 2024 14:52:27 -0700 Subject: [PATCH 30/30] [Auditbeat][add_session_metadata processor] Fix more potential enrichment failures (#39243) Fix two more cases that could cause unenriched processes in the add_session_metadata processor. It was possible for auditd events to arrive before the ebpf event added processes to the process DB, now the enrichment will wait for the process to be inserted into the DB, if it's not already before enrichment is run on it. Also stop attempting to enrich failed syscall events, and modifying the DB based on these. Changes: With the ebpf backend, when an event is processed wait for a process to be added to the DB before enriching, if it's not already in the DB before the event is received. Do not enrich failed syscall auditd events. Since failed syscalls don't actually cause a process to be created, they should not be enriched, or inserted to the process Remove scrapeAncestors from DB. The intention of this was to fill in missed processes, but now processes should not be missed with epbf, and ineffective with procfs, as the process will most likely already be ended. This was causing DB inconsistancies when run on failed syscall events, and I haven't ever seen any cases where it's helpful now. 
--- CHANGELOG.next.asciidoc | 3 +- .../sessionmd/add_session_metadata.go | 19 ++++- .../processors/sessionmd/processdb/db.go | 39 ++------- .../provider/ebpf_provider/ebpf_provider.go | 80 ++++++++++++++++++- .../procfs_provider/procfs_provider.go | 21 ++--- .../procfs_provider/procfs_provider_test.go | 10 +-- .../processors/sessionmd/provider/provider.go | 2 +- 7 files changed, 114 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 58ce7ac0f65..68eb43677ea 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -94,8 +94,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Auditbeat* - Set field types to correctly match ECS in sessionmd processor {issue}38955[38955] {pull}38994[38994] -- Keep process info on exited processes, to avoid failing to enrich events in sessionmd processor {pull}39173[39173] - +- Fix failing to enrich process events in sessionmd processor {issue}38955[38955] {pull}39173[39173] {pull}39243[39243] - Prevent scenario of losing children-related file events in a directory for recursive fsnotify backend of auditbeat file integrity module {pull}39133[39133] diff --git a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go index ff9fa54e556..766e9623b9e 100644 --- a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go +++ b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go @@ -96,13 +96,24 @@ func New(cfg *cfg.C) (beat.Processor, error) { } func (p *addSessionMetadata) Run(ev *beat.Event) (*beat.Event, error) { - _, err := ev.GetValue(p.config.PIDField) + pi, err := ev.GetValue(p.config.PIDField) if err != nil { // Do not attempt to enrich events without PID; it's not a supported event return ev, nil //nolint:nilerr // Running on events without PID is expected } - err = p.provider.UpdateDB(ev) + // Do not enrich failed syscalls, as there was no actual process change related to it + v, err := ev.GetValue("auditd.result") + if err == nil && v == "fail" { + return ev, nil + } + + pid, err := pidToUInt32(pi) + if err != nil { + return ev, nil //nolint:nilerr // Running on events with a different PID type is not a processor error + } + + err = p.provider.UpdateDB(ev, pid) if err != nil { return ev, err } @@ -136,7 +147,9 @@ func (p *addSessionMetadata) enrich(ev *beat.Event) (*beat.Event, error) { fullProcess, err := p.db.GetProcess(pid) if err != nil { - return nil, fmt.Errorf("pid %v not found in db: %w", pid, err) + e := fmt.Errorf("pid %v not found in db: %w", pid, err) + p.logger.Errorf("%v", e) + return nil, e } processMap := fullProcess.ToMap() diff --git a/x-pack/auditbeat/processors/sessionmd/processdb/db.go b/x-pack/auditbeat/processors/sessionmd/processdb/db.go index 2c7c228e2c1..b8c624abe00 100644 --- a/x-pack/auditbeat/processors/sessionmd/processdb/db.go +++ b/x-pack/auditbeat/processors/sessionmd/processdb/db.go @@ -238,7 +238,6 @@ func (db *DB) InsertFork(fork types.ProcessForkEvent) { pid := fork.ChildPIDs.Tgid ppid := fork.ParentPIDs.Tgid - db.scrapeAncestors(db.processes[pid]) if entry, ok := db.processes[ppid]; ok { entry.PIDs = pidInfoFromProto(fork.ChildPIDs) @@ -282,7 +281,6 @@ func (db *DB) InsertExec(exec types.ProcessExecEvent) { } db.processes[exec.PIDs.Tgid] = proc - db.scrapeAncestors(proc) entryLeaderPID := db.evaluateEntryLeader(proc) if entryLeaderPID != nil { db.entryLeaderRelationships[exec.PIDs.Tgid] = *entryLeaderPID @@ -568,6 +566,14 @@ func 
setSameAsProcess(process *types.Process) { } } +func (db *DB) HasProcess(pid uint32) bool { + db.mutex.RLock() + defer db.mutex.RUnlock() + + _, ok := db.processes[pid] + return ok +} + func (db *DB) GetProcess(pid uint32) (types.Process, error) { db.mutex.RLock() defer db.mutex.RUnlock() @@ -585,8 +591,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillParent(&ret, parent) break } - db.logger.Debugf("failed to find %d in DB (parent of %d), attempting to scrape", process.PIDs.Ppid, pid) - db.scrapeAncestors(process) } } @@ -596,8 +600,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillGroupLeader(&ret, groupLeader) break } - db.logger.Debugf("failed to find %d in DB (group leader of %d), attempting to scrape", process.PIDs.Pgid, pid) - db.scrapeAncestors(process) } } @@ -607,8 +609,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillSessionLeader(&ret, sessionLeader) break } - db.logger.Debugf("failed to find %d in DB (session leader of %d), attempting to scrape", process.PIDs.Sid, pid) - db.scrapeAncestors(process) } } @@ -712,29 +712,6 @@ func getTTYType(major uint16, minor uint16) TTYType { return TTYUnknown } -func (db *DB) scrapeAncestors(proc Process) { - for _, pid := range []uint32{proc.PIDs.Pgid, proc.PIDs.Ppid, proc.PIDs.Sid} { - if _, exists := db.processes[pid]; pid == 0 || exists { - continue - } - procInfo, err := db.procfs.GetProcess(pid) - if err != nil { - db.logger.Debugf("couldn't get %v from procfs: %w", pid, err) - continue - } - p := Process{ - PIDs: pidInfoFromProto(procInfo.PIDs), - Creds: credInfoFromProto(procInfo.Creds), - CTTY: ttyDevFromProto(procInfo.CTTY), - Argv: procInfo.Argv, - Cwd: procInfo.Cwd, - Env: procInfo.Env, - Filename: procInfo.Filename, - } - db.insertProcess(p) - } -} - func (db *DB) Close() { close(db.stopChan) } diff --git a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go index 2b9b540e037..f1b8bae0b67 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go @@ -9,6 +9,7 @@ package ebpf_provider import ( "context" "fmt" + "time" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/ebpf" @@ -151,7 +152,80 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB) (pr return &p, nil } -func (s prvdr) UpdateDB(ev *beat.Event) error { - // no-op for ebpf, DB is updated from pushed ebpf events - return nil +const ( + maxWaitLimit = 200 * time.Millisecond // Maximum time UpdateDB will wait for process + combinedWaitLimit = 2 * time.Second // Multiple UpdateDB calls will wait up to this amount within resetDuration + backoffDuration = 10 * time.Second // UpdateDB will stop waiting for processes for this time + resetDuration = 5 * time.Second // After this amount of times with no backoffs, the combinedWait will be reset +) + +var ( + combinedWait = 0 * time.Millisecond + inBackoff = false + backoffStart = time.Now() + since = time.Now() + backoffSkipped = 0 +) + +// With ebpf, process events are pushed to the DB by the above goroutine, so this doesn't actually update the DB. +// It does to try sync the processor and ebpf events, so that the process is in the process db before continuing. +// +// It's possible that the event to enrich arrives before the process is inserted into the DB. 
In that case, this +// will block continuing the enrichment until the process is seen (or the timeout is reached). +// +// If for some reason a lot of time has been spent waiting for missing processes, this also has a backoff timer during +// which it will continue without waiting for missing events to arrive, so the processor doesn't become overly backed-up +// waiting for these processes, at the cost of possibly not enriching some processes. +func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error { + if s.db.HasProcess(pid) { + return nil + } + + now := time.Now() + if inBackoff { + if now.Sub(backoffStart) > backoffDuration { + s.logger.Warnf("ended backoff, skipped %d processes", backoffSkipped) + inBackoff = false + combinedWait = 0 * time.Millisecond + } else { + backoffSkipped += 1 + return nil + } + } else { + if combinedWait > combinedWaitLimit { + s.logger.Warn("starting backoff") + inBackoff = true + backoffStart = now + backoffSkipped = 0 + return nil + } + // maintain a moving window of time for the delays we track + if now.Sub(since) > resetDuration { + since = now + combinedWait = 0 * time.Millisecond + } + } + + start := now + nextWait := 5 * time.Millisecond + for { + waited := time.Since(start) + if s.db.HasProcess(pid) { + s.logger.Debugf("got process that was missing after %v", waited) + combinedWait = combinedWait + waited + return nil + } + if waited >= maxWaitLimit { + e := fmt.Errorf("process %v was not seen after %v", pid, waited) + s.logger.Warnf("%w", e) + combinedWait = combinedWait + waited + return e + } + time.Sleep(nextWait) + if nextWait*2+waited > maxWaitLimit { + nextWait = maxWaitLimit - waited + } else { + nextWait = nextWait * 2 + } + } } diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go index 2f99dd72b1f..6525b860b6d 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go @@ -41,16 +41,7 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB, rea } // UpdateDB will update the process DB with process info from procfs or the event itself -func (s prvdr) UpdateDB(ev *beat.Event) error { - pi, err := ev.Fields.GetValue(s.pidField) - if err != nil { - return fmt.Errorf("event not supported, no pid") - } - pid, ok := pi.(int) - if !ok { - return fmt.Errorf("pid field not int") - } - +func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error { syscall, err := ev.GetValue(syscallField) if err != nil { return fmt.Errorf("event not supported, no syscall data") @@ -59,7 +50,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { switch syscall { case "execveat", "execve": pe := types.ProcessExecEvent{} - proc_info, err := s.reader.GetProcess(uint32(pid)) + proc_info, err := s.reader.GetProcess(pid) if err == nil { pe.PIDs = proc_info.PIDs pe.Creds = proc_info.Creds @@ -72,7 +63,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { s.logger.Warnf("couldn't get process info from proc for pid %v: %w", pid, err) // If process info couldn't be taken from procfs, populate with as much info as // possible from the event - pe.PIDs.Tgid = uint32(pid) + pe.PIDs.Tgid = pid var intr interface{} var i int var ok bool @@ -106,7 +97,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { case "exit_group": pe := types.ProcessExitEvent{ PIDs: types.PIDInfo{ - Tgid: uint32(pid), + Tgid: pid, }, } 
s.db.InsertExit(pe) @@ -122,8 +113,8 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { if result == "success" { setsid_ev := types.ProcessSetsidEvent{ PIDs: types.PIDInfo{ - Tgid: uint32(pid), - Sid: uint32(pid), + Tgid: pid, + Sid: pid, }, } s.db.InsertSetsid(setsid_ev) diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go index 6fd333c4711..c438efcfe1a 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go @@ -124,7 +124,7 @@ func TestExecveEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -234,7 +234,7 @@ func TestExecveatEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -317,7 +317,7 @@ func TestSetSidEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -399,7 +399,7 @@ func TestSetSidEventFailed(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -470,7 +470,7 @@ func TestSetSidSessionLeaderNotScraped(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) diff --git a/x-pack/auditbeat/processors/sessionmd/provider/provider.go b/x-pack/auditbeat/processors/sessionmd/provider/provider.go index e3fa1547806..6452eb9e2bf 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/provider.go @@ -11,5 +11,5 @@ import ( ) type Provider interface { - UpdateDB(*beat.Event) error + UpdateDB(*beat.Event, uint32) error }