From d2122d4e3f0b10ef69ee86b457c6f58e1945171b Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 11:07:59 +0300 Subject: [PATCH 01/30] Add pipeline-scheduler pipeline in catalog.info (#39254) As a follow up to PR#39206 and PR#39171, this commit adds a new generic scheduling pipeline in catalog-info that serves as a central point for scheduling any other pipeline. Unfortunately, it's not possible to specify a custom agent (k8s image) yet at the catalog-info level[^1], therefore we still need a small static pipeline -- empty for now -- that uploads the needed steps. [^1]: https://github.com/elastic/ci/blob/71e83d340e3b93ab43fcf16a7a70ac33bdeec6e9/terrazzo/terrazzo/constructs/buildkite/pipelines.py#L787-L842 --- .buildkite/pipeline-scheduler.yml | 0 catalog-info.yaml | 47 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 .buildkite/pipeline-scheduler.yml diff --git a/.buildkite/pipeline-scheduler.yml b/.buildkite/pipeline-scheduler.yml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog-info.yaml b/catalog-info.yaml index 116e5024663..ae37200762b 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1113,3 +1113,50 @@ spec: access_level: BUILD_AND_READ everyone: access_level: READ_ONLY + +--- +# yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json +apiVersion: backstage.io/v1alpha1 +kind: Resource +metadata: + name: beats-pipeline-scheduler + description: 'Scheduled runs of various Beats pipelines per release branch' + links: + - title: 'Scheduled runs of Beats pipelines per release branch' + url: https://buildkite.com/elastic/logstash-pipeline-scheduler +spec: + type: buildkite-pipeline + owner: group:ingest-fp + system: buildkite + implementation: + apiVersion: buildkite.elastic.dev/v1 + kind: Pipeline + metadata: + name: beats-pipeline-scheduler + description: ':alarm_clock: Scheduled runs of various Beats pipelines per release branch' + spec: + repository: elastic/beats + pipeline_file: ".buildkite/pipeline-scheduler.yml" + maximum_timeout_in_minutes: 240 + schedules: + Daily Snapshot DRA: + branch: main + cronline: 30 02 * * * + message: Daily trigger of Iron Bank validation Pipeline per branch + env: + PIPELINES_TO_TRIGGER: 'beats-ironbank-validation' + skip_intermediate_builds: true + provider_settings: + trigger_mode: none + env: + # TODO enable slack notifications when it's tested + ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'false' + SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' + SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' + teams: + ingest-fp: + access_level: MANAGE_BUILD_AND_READ + release-eng: + access_level: BUILD_AND_READ + everyone: + access_level: READ_ONLY From a4b21dcd8435b80a97063e0e69a6d98aaba17d1a Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 14:29:41 +0300 Subject: [PATCH 02/30] Add IronBank validation to cron schedule (#39255) This commit is a follow up to #39254 and adds a schedule for the IronBank validation pipeline to the centralized scheduling pipeline. 
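As a rough illustration (an assumption, not part of this commit), the steps that `.buildkite/pipeline-scheduler.py` writes to `steps.yml` are expected to be ordinary Buildkite trigger steps, one per pipeline named in `PIPELINES_TO_TRIGGER` and per release branch, along these lines:

```yaml
# Hypothetical output of .buildkite/pipeline-scheduler.py; the real script
# determines the pipeline slugs and branches at run time.
steps:
  - label: ":rocket: Trigger beats-ironbank-validation for main"
    trigger: beats-ironbank-validation   # downstream pipeline slug (assumed)
    build:
      branch: "main"
      message: "Daily trigger of Iron Bank validation"
```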
Relates: https://github.com/elastic/ingest-dev/issues/3235 --- .buildkite/pipeline-scheduler.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.buildkite/pipeline-scheduler.yml b/.buildkite/pipeline-scheduler.yml index e69de29bb2d..3f9b628bc63 100644 --- a/.buildkite/pipeline-scheduler.yml +++ b/.buildkite/pipeline-scheduler.yml @@ -0,0 +1,17 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/buildkite/pipeline-schema/main/schema.json + +# this intermediate pipeline is required because we can't specify a custom agent (k8s image) yet +# in catalog-info: https://github.com/elastic/ci/blob/71e83d340e3b93ab43fcf16a7a70ac33bdeec6e9/terrazzo/terrazzo/constructs/buildkite/pipelines.py#L787-L842 + +steps: + - label: ":pipeline: Generate trigger steps for $PIPELINES_TO_TRIGGER" + command: | + set -eo pipefail + .buildkite/pipeline-scheduler.py >steps.yml + echo "~~~ Printing pipeline steps" + yq . steps.yml + echo "~~~ Uploading steps" + buildkite-agent pipeline upload steps.yml + agents: + image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-beats-ci-with-hooks:0.1" + useCustomGlobalHooks: true From e588628b24946595a9c6123cf57b5597b534c72b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 29 Apr 2024 08:40:02 -0400 Subject: [PATCH 03/30] Fix concurrency bugs that could cause data loss in the `aws-s3` input (#39131) This is a cleanup of concurrency and error handling in the `aws-s3` input that could cause several known bugs: - Memory leaks ([1](https://github.com/elastic/integrations/issues/9463), [2](https://github.com/elastic/beats/issues/39052)). This issue was caused because the input could run several scans of its s3 bucket simultaneously, which led to the cleanup routine `s3Poller.Purge` being called many times concurrently. Inefficiencies in this function caused it to accumulate over time, creating many copies of the state data which could overload process memory. Fixed by: * Changing the `s3Poller` run loop to only run one scan at a time, and wait for it to complete before starting the next one. * Having each object persist its own state after completing, instead of waiting until the end of a scan and writing an entire bucket worth of metadata at once. - This also allowed the removal of other metadata: there is no longer any reason to track the detailed acknowledgment state of each "listing" (page of ~1K events during bucket enumeration), so the `states` helper object is now much simpler. - Skipped data due to buggy last-modified calculations ([3](https://github.com/elastic/beats/issues/39065)). The most recent scanned timestamp was calculated incorrectly, causing the input to skip a growing number of events as ingestion progressed. * Fixed by removing the bucket-wide last modified check entirely. This feature was already risky, since objects with earlier creation timestamps can appear after ones with later timestamps, so there is always the possibility to miss objects. Since the value was calculated incorrectly and was discarded between runs, we can remove it without breaking compatibility and reimplement it more safely in the future if needed. - Skipped data because rate limiting is treated as permanent failure ([4](https://github.com/elastic/beats/issues/39114)). The input treats all error types the same, which causes many objects to be skipped for ephemeral errors. * Fixed by creating an error, `errS3DownloadFailure`, that is returned when processing failure is caused by a download error. 
In this case, the S3 workers will not persist the failure to the `states` table, so the object will be retried on the next bucket scan. When this happens the worker also sleeps (using an exponential backoff) before trying the next object. * Exponential backoff was also added to the bucket scanning loop for page listing errors, so the bucket scan is not restarted needlessly. --- x-pack/filebeat/input/awss3/input.go | 40 +- .../input/awss3/input_benchmark_test.go | 14 +- x-pack/filebeat/input/awss3/s3.go | 321 ++++----------- x-pack/filebeat/input/awss3/s3_objects.go | 15 +- .../filebeat/input/awss3/s3_objects_test.go | 9 +- x-pack/filebeat/input/awss3/s3_test.go | 20 +- x-pack/filebeat/input/awss3/state.go | 66 +--- x-pack/filebeat/input/awss3/state_test.go | 2 +- x-pack/filebeat/input/awss3/states.go | 368 +++--------------- x-pack/filebeat/input/awss3/states_test.go | 306 +++------------ 10 files changed, 246 insertions(+), 915 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 733de949f29..bb4a5c15bda 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -13,6 +13,7 @@ import ( "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/smithy-go" @@ -21,7 +22,6 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/feature" - "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/go-concert/unison" @@ -99,21 +99,6 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error - - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - - defer persistentStore.Close() - - states := newStates(inputContext) - err = states.readStatesFrom(persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) if in.config.QueueURL != "" { @@ -158,8 +143,20 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { } defer client.Close() + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) + poller, err := in.createS3Lister(inputContext, ctx, client, states) if err != nil { return fmt.Errorf("failed to initialize s3 poller: %w", err) } @@ -230,7 +227,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -250,6 +247,12 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled } o.UsePathStyle = in.config.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) }) regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) if err != nil { @@ -295,7 +298,6 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli client, s3EventHandlerFactory, states, - persistentStore, bucketID, in.config.BucketListPrefix, in.awsConfig.Region, diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index e05e5b461ca..5d22d141168 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -8,7 +8,6 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" "path/filepath" "runtime" @@ -16,6 +15,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" @@ -132,7 +133,7 @@ type constantS3 struct { var _ s3API = (*constantS3)(nil) func newConstantS3(t testing.TB) *constantS3 { - data, err := ioutil.ReadFile(cloudtrailTestFile) + data, err := os.ReadFile(cloudtrailTestFile) if err != nil { t.Fatal(err) } @@ -342,14 +343,11 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - err = store.Set(awsS3WriteCommitPrefix+"bucket"+listPrefix, &commitWriteState{time.Time{}}) - if err != nil { - errChan <- err - return - } + states, err := newStates(inputCtx, store) + assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}, numberOfWorkers) - s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) if err := s3Poller.Poll(ctx); err != nil { if !errors.Is(err, context.DeadlineExceeded) { diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 5aa8d31e95d..8909f78bb39 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,34 +11,22 
@@ import ( "sync" "time" - "github.com/gofrs/uuid" - "go.uber.org/multierr" + "github.com/aws/aws-sdk-go-v2/aws/ratelimit" "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/timed" ) -const maxCircuitBreaker = 5 - -type commitWriteState struct { - time.Time -} - -type s3ObjectInfo struct { - name string - key string - etag string - lastModified time.Time - listingID string -} +// var instead of const so it can be reduced during unit tests (instead of waiting +// through 10 minutes of retry backoff) +var readerLoopMaxCircuitBreaker = 10 type s3ObjectPayload struct { s3ObjectHandler s3ObjectHandler - s3ObjectInfo s3ObjectInfo - s3ObjectEvent s3EventV2 + objectState state } type s3Poller struct { @@ -48,15 +36,12 @@ type s3Poller struct { region string provider string bucketPollInterval time.Duration - workerSem *awscommon.Sem s3 s3API log *logp.Logger metrics *inputMetrics client beat.Client s3ObjectHandler s3ObjectHandlerFactory states *states - store *statestore.Store - workersListingMap *sync.Map workersProcessingMap *sync.Map } @@ -66,7 +51,6 @@ func newS3Poller(log *logp.Logger, client beat.Client, s3ObjectHandler s3ObjectHandlerFactory, states *states, - store *statestore.Store, bucket string, listPrefix string, awsRegion string, @@ -85,41 +69,17 @@ func newS3Poller(log *logp.Logger, region: awsRegion, provider: provider, bucketPollInterval: bucketPollInterval, - workerSem: awscommon.NewSem(numberOfWorkers), s3: s3, log: log, metrics: metrics, client: client, s3ObjectHandler: s3ObjectHandler, states: states, - store: store, - workersListingMap: new(sync.Map), workersProcessingMap: new(sync.Map), } } -func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, isStored bool) { - id := stateID(info.name, info.key, info.etag, info.lastModified) - previousState := p.states.FindPreviousByID(id) - if !previousState.IsEmpty() { - if isStored { - previousState.MarkAsStored() - } else { - previousState.MarkAsError() - } - - p.states.Update(previousState, info.listingID) - } - - // Manage locks for purging. - if p.states.IsListingFullyStored(info.listingID) { - // locked on processing we unlock when all the object were ACKed - lock, _ := p.workersListingMap.Load(info.listingID) - lock.(*sync.Mutex).Unlock() - } -} - -func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3ObjectHandler, s3EventV2) { +func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { event := s3EventV2{} event.AWSRegion = p.region event.Provider = p.provider @@ -129,275 +89,126 @@ func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3 acker := awscommon.NewEventACKTracker(ctx) - return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event), event + return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event) } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { - var errs []error +func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s3ObjectPayload) { + rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) for s3ObjectPayload := range s3ObjectPayloadChan { - // Process S3 object (download, parse, create events). 
- err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + objHandler := s3ObjectPayload.s3ObjectHandler + state := s3ObjectPayload.objectState - // Wait for all events to be ACKed before proceeding. - s3ObjectPayload.s3ObjectHandler.Wait() + // Process S3 object (download, parse, create events). + err := objHandler.ProcessS3Object() + if errors.Is(err, errS3DownloadFailed) { + // Download errors are ephemeral. Add a backoff delay, then skip to the + // next iteration so we don't mark the object as permanently failed. + rateLimitWaiter.Wait() + continue + } + // Reset the rate limit delay on results that aren't download errors. + rateLimitWaiter.Reset() - info := s3ObjectPayload.s3ObjectInfo + // Wait for downloaded objects to be ACKed. + objHandler.Wait() if err != nil { - event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, - fmt.Errorf( - fmt.Sprintf("failed processing S3 event for object key %q in bucket %q: %%w", - event.S3.Object.Key, event.S3.Bucket.Name), - err)) - - p.handlePurgingLock(info, false) - continue + p.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", + state.Key, state.Bucket, err.Error()) + + // Non-retryable error. + state.Failed = true + } else { + state.Stored = true } - p.handlePurgingLock(info, true) + // Persist the result + p.states.AddState(state) // Metrics p.metrics.s3ObjectsAckedTotal.Inc() } - - return multierr.Combine(errs...) } -func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { +func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { defer close(s3ObjectPayloadChan) bucketName := getBucketNameFromARN(p.bucket) + errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) circuitBreaker := 0 paginator := p.s3.ListObjectsPaginator(bucketName, p.listPrefix) for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) - if err != nil { - if !paginator.HasMorePages() { - break - } + if err != nil { p.log.Warnw("Error when paginating listing.", "error", err) - circuitBreaker++ - if circuitBreaker >= maxCircuitBreaker { - p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) - break + // QuotaExceededError is client-side rate limiting in the AWS sdk, + // don't include it in the circuit breaker count + if !errors.As(err, &ratelimit.QuotaExceededError{}) { + circuitBreaker++ + if circuitBreaker >= readerLoopMaxCircuitBreaker { + p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) + break + } } + // add a backoff delay and try again + errorBackoff.Wait() continue } + // Reset the circuit breaker and the error backoff if a read is successful + circuitBreaker = 0 + errorBackoff.Reset() - listingID, err := uuid.NewV4() - if err != nil { - p.log.Warnw("Error generating UUID for listing page.", "error", err) - continue - } - - // lock for the listing page and state in workersListingMap - // this map is shared with the storedOp and will be unlocked there - lock := new(sync.Mutex) - lock.Lock() - p.workersListingMap.Store(listingID.String(), lock) - - totProcessableObjects := 0 totListedObjects := len(page.Contents) - s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, totListedObjects) // Metrics p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) for _, object := range page.Contents { - state := newState(bucketName, *object.Key, *object.ETag, p.listPrefix, 
*object.LastModified) - if p.states.MustSkip(state, p.store) { + state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) + if p.states.IsProcessed(state) { p.log.Debugw("skipping state.", "state", state) continue } - // we have no previous state or the previous state - // is not stored: refresh the state - previousState := p.states.FindPrevious(state) - if previousState.IsEmpty() || !previousState.IsProcessed() { - p.states.Update(state, "") - } - - s3Processor, event := p.createS3ObjectProcessor(ctx, state) + s3Processor := p.createS3ObjectProcessor(ctx, state) if s3Processor == nil { p.log.Debugw("empty s3 processor.", "state", state) continue } - totProcessableObjects++ - - s3ObjectPayloadChanByPage <- &s3ObjectPayload{ + s3ObjectPayloadChan <- &s3ObjectPayload{ s3ObjectHandler: s3Processor, - s3ObjectInfo: s3ObjectInfo{ - name: bucketName, - key: *object.Key, - etag: *object.ETag, - lastModified: *object.LastModified, - listingID: listingID.String(), - }, - s3ObjectEvent: event, - } - } - - if totProcessableObjects == 0 { - p.log.Debugw("0 processable objects on bucket pagination.", "bucket", p.bucket, "listPrefix", p.listPrefix, "listingID", listingID) - // nothing to be ACKed, unlock here - p.states.DeleteListing(listingID.String()) - lock.Unlock() - } else { - listingInfo := &listingInfo{totObjects: totProcessableObjects} - p.states.AddListing(listingID.String(), listingInfo) - - // Metrics - p.metrics.s3ObjectsProcessedTotal.Add(uint64(totProcessableObjects)) - } - - close(s3ObjectPayloadChanByPage) - for s3ObjectPayload := range s3ObjectPayloadChanByPage { - s3ObjectPayloadChan <- s3ObjectPayload - } - } -} - -func (p *s3Poller) Purge(ctx context.Context) { - listingIDs := p.states.GetListingIDs() - p.log.Debugw("purging listing.", "listingIDs", listingIDs) - for _, listingID := range listingIDs { - // we lock here in order to process the purge only after - // full listing page is ACKed by all the workers - lock, loaded := p.workersListingMap.Load(listingID) - if !loaded { - // purge calls can overlap, GetListingIDs can return - // an outdated snapshot with listing already purged - p.states.DeleteListing(listingID) - p.log.Debugw("deleting already purged listing from states.", "listingID", listingID) - continue - } - - lock.(*sync.Mutex).Lock() - - states := map[string]*state{} - latestStoredTimeByBucketAndListPrefix := make(map[string]time.Time, 0) - - listingStates := p.states.GetStatesByListingID(listingID) - for i, state := range listingStates { - // it is not stored, keep - if !state.IsProcessed() { - p.log.Debugw("state not stored or with error, skip purge", "state", state) - continue + objectState: state, } - var latestStoredTime time.Time - states[state.ID] = &listingStates[i] - latestStoredTime, ok := latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] - if !ok { - var commitWriteState commitWriteState - err := p.store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil { - // we have no entry in the map, and we have no entry in the store - // set zero time - latestStoredTime = time.Time{} - p.log.Debugw("last stored time is zero time", "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } else { - latestStoredTime = commitWriteState.Time - p.log.Debugw("last stored time is commitWriteState", "commitWriteState", commitWriteState, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - } else { - p.log.Debugw("last stored time from memory", "latestStoredTime", latestStoredTime, 
"bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - - if state.LastModified.After(latestStoredTime) { - p.log.Debugw("last stored time updated", "state.LastModified", state.LastModified, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] = state.LastModified - } - } - - for key := range states { - p.states.Delete(key) - } - - if err := p.states.writeStates(p.store); err != nil { - p.log.Errorw("Failed to write states to the registry", "error", err) - } - - for bucketAndListPrefix, latestStoredTime := range latestStoredTimeByBucketAndListPrefix { - if err := p.store.Set(awsS3WriteCommitPrefix+bucketAndListPrefix, commitWriteState{latestStoredTime}); err != nil { - p.log.Errorw("Failed to write commit time to the registry", "error", err) - } - } - - // purge is done, we can unlock and clean - lock.(*sync.Mutex).Unlock() - p.workersListingMap.Delete(listingID) - p.states.DeleteListing(listingID) - - // Listing is removed from all states, we can finalize now - for _, state := range states { - processor, _ := p.createS3ObjectProcessor(ctx, *state) - if err := processor.FinalizeS3Object(); err != nil { - p.log.Errorw("Failed to finalize S3 object", "key", state.Key, "error", err) - } + p.metrics.s3ObjectsProcessedTotal.Inc() } } } func (p *s3Poller) Poll(ctx context.Context) error { - // This loop tries to keep the workers busy as much as possible while - // honoring the number in config opposed to a simpler loop that does one - // listing, sequentially processes every object and then does another listing - workerWg := new(sync.WaitGroup) for ctx.Err() == nil { - // Determine how many S3 workers are available. - workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) - if err != nil { - break - } - - if workers == 0 { - continue - } + var workerWg sync.WaitGroup + workChan := make(chan *s3ObjectPayload) - s3ObjectPayloadChan := make(chan *s3ObjectPayload) - - workerWg.Add(1) - go func() { - defer func() { - workerWg.Done() - }() - - p.GetS3Objects(ctx, s3ObjectPayloadChan) - p.Purge(ctx) - }() - - workerWg.Add(workers) - for i := 0; i < workers; i++ { + // Start the worker goroutines to listen on the work channel + for i := 0; i < p.numberOfWorkers; i++ { + workerWg.Add(1) go func() { - defer func() { - workerWg.Done() - p.workerSem.Release(1) - }() - if err := p.ProcessObject(s3ObjectPayloadChan); err != nil { - p.log.Warnw("Failed processing S3 listing.", "error", err) - } + defer workerWg.Done() + p.workerLoop(ctx, workChan) }() } - err = timed.Wait(ctx, p.bucketPollInterval) - if err != nil { - if errors.Is(err, context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } + // Start reading data and wait for its processing to be done + p.readerLoop(ctx, workChan) + workerWg.Wait() - return err - } + _ = timed.Wait(ctx, p.bucketPollInterval) } - // Wait for all workers to finish. - workerWg.Wait() - if errors.Is(ctx.Err(), context.Canceled) { // A canceled context is a normal shutdown. return nil diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 32911778336..21dfa2243e7 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -43,6 +43,11 @@ type s3ObjectProcessorFactory struct { backupConfig backupConfig } +// errS3DownloadFailed reports problems downloading an S3 object. 
Download errors +// should never treated as permanent, they are just an indication to apply a +// retry backoff until the connection is healthy again. +var errS3DownloadFailed = errors.New("S3 download failure") + func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory { if metrics == nil { // Metrics are optional. Initialize a stub. @@ -135,8 +140,9 @@ func (p *s3ObjectProcessor) ProcessS3Object() error { // Request object (download). contentType, meta, body, err := p.download() if err != nil { - return fmt.Errorf("failed to get s3 object (elapsed_time_ns=%d): %w", - time.Since(start).Nanoseconds(), err) + // Wrap downloadError in the result so the caller knows it's not a + // permanent failure. + return fmt.Errorf("%w: %w", errS3DownloadFailed, err) } defer body.Close() p.s3Metadata = meta @@ -434,10 +440,7 @@ func (p *s3ObjectProcessor) FinalizeS3Object() error { if bucketName == "" { return nil } - backupKey := p.s3Obj.S3.Object.Key - if p.backupConfig.BackupToBucketPrefix != "" { - backupKey = fmt.Sprintf("%s%s", p.backupConfig.BackupToBucketPrefix, backupKey) - } + backupKey := p.backupConfig.BackupToBucketPrefix + p.s3Obj.S3.Object.Key _, err := p.s3.CopyObject(p.ctx, p.s3Obj.S3.Bucket.Name, bucketName, p.s3Obj.S3.Object.Key, backupKey) if err != nil { return fmt.Errorf("failed to copy object to backup bucket: %w", err) diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 6732c12e057..28e8f4f42a5 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -8,7 +8,8 @@ import ( "bytes" "context" "errors" - "io/ioutil" + "io" + "os" "path/filepath" "strings" "testing" @@ -27,7 +28,7 @@ import ( ) func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectOutput) { - data, err := ioutil.ReadFile(filename) + data, err := os.ReadFile(filename) if err != nil { t.Fatal(err) } @@ -39,7 +40,7 @@ func newS3GetObjectResponse(filename string, data []byte, contentType string) *s r := bytes.NewReader(data) getObjectOutput := s3.GetObjectOutput{} getObjectOutput.ContentLength = int64(r.Len()) - getObjectOutput.Body = ioutil.NopCloser(r) + getObjectOutput.Body = io.NopCloser(r) if contentType != "" { getObjectOutput.ContentType = &contentType } @@ -157,7 +158,7 @@ func TestS3ObjectProcessor(t *testing.T) { ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) - assert.True(t, errors.Is(err, errFakeConnectivityFailure), "expected errFakeConnectivityFailure error") + assert.True(t, errors.Is(err, errS3DownloadFailed), "expected errS3DownloadFailed") }) t.Run("no error empty result in download", func(t *testing.T) { diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b94ba7cfb09..be1d65b796e 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/statestore" @@ -134,12 +133,16 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := 
newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) - t.Run("retry after Poll error", func(t *testing.T) { + t.Run("restart bucket scan after paging errors", func(t *testing.T) { + // Change the restart limit to 2 consecutive errors, so the test doesn't + // take too long to run + readerLoopMaxCircuitBreaker = 2 storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) store, err := storeReg.Get("test") if err != nil { @@ -176,13 +179,13 @@ func TestS3Poller(t *testing.T) { // Initial Next gets an error. mockPagerFirst.EXPECT(). HasMorePages(). - Times(10). + Times(2). DoAndReturn(func() bool { return true }) mockPagerFirst.EXPECT(). NextPage(gomock.Any()). - Times(5). + Times(2). DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { return nil, errFakeConnectivityFailure }) @@ -257,8 +260,9 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) } diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index 97fb8d538cd..4b7e09f9e7f 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -5,84 +5,52 @@ package awss3 import ( - "fmt" "time" ) // state is used to communicate the publishing state of a s3 object type state struct { - // ID is used to identify the state in the store, and it is composed by - // Bucket + Key + Etag + LastModified.String(): changing this value or how it is - // composed will break backward compatibilities with entries already in the store. - ID string `json:"id" struct:"id"` Bucket string `json:"bucket" struct:"bucket"` Key string `json:"key" struct:"key"` Etag string `json:"etag" struct:"etag"` LastModified time.Time `json:"last_modified" struct:"last_modified"` - // ListPrefix is used for unique of the key in the store for awsS3WriteCommitPrefix - ListPrefix string `json:"list_prefix" struct:"list_prefix"` - // A state has Stored = true when all events are ACKed. 
Stored bool `json:"stored" struct:"stored"` - // A state has Error = true when ProcessS3Object returned an error - Error bool `json:"error" struct:"error"` + + // Failed is true when ProcessS3Object returned an error other than + // s3DownloadError. + // Before 8.14, this field was called "error". However, that field was + // set for many ephemeral reasons including client-side rate limiting + // (see https://github.com/elastic/beats/issues/39114). Now that we + // don't treat download errors as permanent, the field name was changed + // so that users upgrading from old versions aren't prevented from + // retrying old download failures. + Failed bool `json:"failed" struct:"failed"` } +// ID is used to identify the state in the store, and it is composed by +// Bucket + Key + Etag + LastModified.String(): changing this value or how it is +// composed will break backward compatibilities with entries already in the store. func stateID(bucket, key, etag string, lastModified time.Time) string { return bucket + key + etag + lastModified.String() } // newState creates a new s3 object state -func newState(bucket, key, etag, listPrefix string, lastModified time.Time) state { - s := state{ +func newState(bucket, key, etag string, lastModified time.Time) state { + return state{ Bucket: bucket, Key: key, LastModified: lastModified, Etag: etag, - ListPrefix: listPrefix, - Stored: false, - Error: false, } - - s.ID = stateID(s.Bucket, s.Key, s.Etag, s.LastModified) - - return s } -// MarkAsStored set the stored flag to true -func (s *state) MarkAsStored() { - s.Stored = true -} - -// MarkAsError set the error flag to true -func (s *state) MarkAsError() { - s.Error = true -} - -// IsProcessed checks if the state is either Stored or Error -func (s *state) IsProcessed() bool { - return s.Stored || s.Error +func (s *state) ID() string { + return stateID(s.Bucket, s.Key, s.Etag, s.LastModified) } // IsEqual checks if the two states point to the same s3 object. 
func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) } - -// IsEmpty checks if the state is empty -func (s *state) IsEmpty() bool { - c := state{} - return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) -} - -// String returns string representation of the struct -func (s *state) String() string { - return fmt.Sprintf( - "{ID: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", - s.ID, - s.Bucket, - s.Key, - s.Etag, - s.LastModified) -} diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go index 24a5e9d81b4..375a44ce79e 100644 --- a/x-pack/filebeat/input/awss3/state_test.go +++ b/x-pack/filebeat/input/awss3/state_test.go @@ -61,7 +61,7 @@ func TestStateIsEqual(t *testing.T) { Key: "/key/to/this/file/1", Etag: "etag", LastModified: lastModifed, - Error: true, + Failed: true, }, { Bucket: "bucket a", diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 449219a867f..edbbcc73793 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -15,278 +15,64 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore" ) -const ( - awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" - awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::" -) - -type listingInfo struct { - totObjects int - - mu sync.Mutex - storedObjects int - errorObjects int - finalCheck bool -} +const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" // states handles list of s3 object state. One must use newStates to instantiate a // file states registry. Using the zero-value is not safe. type states struct { - sync.RWMutex - log *logp.Logger - // states store - states []state - - // idx maps state IDs to state indexes for fast lookup and modifications. - idx map[string]int + // Completed S3 object states, indexed by state ID. + // statesLock must be held to access states. + states map[string]state + statesLock sync.Mutex - listingIDs map[string]struct{} - listingInfo *sync.Map - statesByListingID map[string][]state + // The store used to persist state changes to the registry. + // storeLock must be held to access store. + store *statestore.Store + storeLock sync.Mutex } // newStates generates a new states registry. -func newStates(ctx v2.Context) *states { - return &states{ - log: ctx.Logger.Named("states"), - states: nil, - idx: map[string]int{}, - listingInfo: new(sync.Map), - listingIDs: map[string]struct{}{}, - statesByListingID: map[string][]state{}, - } -} - -func (s *states) MustSkip(state state, store *statestore.Store) bool { - if !s.IsNew(state) { - s.log.Debugw("not new state in must skip", "state", state) - return true - } - - previousState := s.FindPrevious(state) - - // status is forgotten. 
if there is no previous state and - // the state.LastModified is before the last cleanStore - // write commit we can remove - var commitWriteState commitWriteState - err := store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil && previousState.IsEmpty() && - (state.LastModified.Before(commitWriteState.Time) || state.LastModified.Equal(commitWriteState.Time)) { - s.log.Debugw("state.LastModified older than writeCommitState in must skip", "state", state, "commitWriteState", commitWriteState) - return true - } - - // the previous state is stored or has error: let's skip - if !previousState.IsEmpty() && previousState.IsProcessed() { - s.log.Debugw("previous state is stored or has error", "state", state) - return true - } - - return false -} - -func (s *states) Delete(id string) { - s.Lock() - defer s.Unlock() - - index := s.findPrevious(id) - if index >= 0 { - last := len(s.states) - 1 - s.states[last], s.states[index] = s.states[index], s.states[last] - s.states = s.states[:last] - - s.idx = map[string]int{} - for i, state := range s.states { - s.idx[state.ID] = i - } - } -} - -// IsListingFullyStored check if listing if fully stored -// After first time the condition is met it will always return false -func (s *states) IsListingFullyStored(listingID string) bool { - info, ok := s.listingInfo.Load(listingID) - if !ok { - return false - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return false - } - - listingInfo.mu.Lock() - defer listingInfo.mu.Unlock() - if listingInfo.finalCheck { - return false - } - - listingInfo.finalCheck = (listingInfo.storedObjects + listingInfo.errorObjects) == listingInfo.totObjects - - if (listingInfo.storedObjects + listingInfo.errorObjects) > listingInfo.totObjects { - s.log.Warnf("unexepected mixmatch between storedObjects (%d), errorObjects (%d) and totObjects (%d)", - listingInfo.storedObjects, listingInfo.errorObjects, listingInfo.totObjects) - } - - return listingInfo.finalCheck -} - -// AddListing add listing info -func (s *states) AddListing(listingID string, listingInfo *listingInfo) { - s.Lock() - defer s.Unlock() - s.listingIDs[listingID] = struct{}{} - s.listingInfo.Store(listingID, listingInfo) -} - -// DeleteListing delete listing info -func (s *states) DeleteListing(listingID string) { - s.Lock() - defer s.Unlock() - delete(s.listingIDs, listingID) - delete(s.statesByListingID, listingID) - s.listingInfo.Delete(listingID) -} - -// Update updates a state. 
If previous state didn't exist, new one is created -func (s *states) Update(newState state, listingID string) { - s.Lock() - defer s.Unlock() - - id := newState.ID - index := s.findPrevious(id) - - if index >= 0 { - s.states[index] = newState - } else { - // No existing state found, add new one - s.idx[id] = len(s.states) - s.states = append(s.states, newState) - s.log.Debug("New state added for ", newState.ID) - } - - if listingID == "" || !newState.IsProcessed() { - return - } - - // here we increase the number of stored object - info, ok := s.listingInfo.Load(listingID) - if !ok { - return - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return - } - - listingInfo.mu.Lock() - - if newState.Stored { - listingInfo.storedObjects++ - } - - if newState.Error { - listingInfo.errorObjects++ - } - - listingInfo.mu.Unlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - s.statesByListingID[listingID] = make([]state, 0) +func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { + states := &states{ + log: ctx.Logger.Named("states"), + states: map[string]state{}, + store: store, } - - s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState) + return states, states.loadFromRegistry() } -// FindPrevious lookups a registered state, that matching the new state. -// Returns a zero-state if no match is found. -func (s *states) FindPrevious(newState state) state { - s.RLock() - defer s.RUnlock() - id := newState.ID - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] +func (s *states) IsProcessed(state state) bool { + s.statesLock.Lock() + defer s.statesLock.Unlock() + // Our in-memory table only stores completed objects + _, ok := s.states[state.ID()] + return ok } -// FindPreviousByID lookups a registered state, that matching the id. -// Returns a zero-state if no match is found. -func (s *states) FindPreviousByID(id string) state { - s.RLock() - defer s.RUnlock() - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] -} - -func (s *states) IsNew(state state) bool { - s.RLock() - defer s.RUnlock() - id := state.ID - i := s.findPrevious(id) - - if i < 0 { - return true - } +func (s *states) AddState(state state) { - return !s.states[i].IsEqual(&state) -} + id := state.ID() + // Update in-memory copy + s.statesLock.Lock() + s.states[id] = state + s.statesLock.Unlock() -// findPrevious returns the previous state for the file. -// In case no previous state exists, index -1 is returned -func (s *states) findPrevious(id string) int { - if i, exists := s.idx[id]; exists { - return i + // Persist to the registry + s.storeLock.Lock() + key := awsS3ObjectStatePrefix + id + if err := s.store.Set(key, state); err != nil { + s.log.Errorw("Failed to write states to the registry", "error", err) } - return -1 -} - -// GetStates creates copy of the file states. 
-func (s *states) GetStates() []state { - s.RLock() - defer s.RUnlock() - - newStates := make([]state, len(s.states)) - copy(newStates, s.states) - - return newStates -} - -// GetListingIDs return a of the listing IDs -func (s *states) GetListingIDs() []string { - s.RLock() - defer s.RUnlock() - listingIDs := make([]string, 0, len(s.listingIDs)) - for listingID := range s.listingIDs { - listingIDs = append(listingIDs, listingID) - } - - return listingIDs -} - -// GetStatesByListingID return a copy of the states by listing ID -func (s *states) GetStatesByListingID(listingID string) []state { - s.RLock() - defer s.RUnlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - return nil - } - - newStates := make([]state, len(s.statesByListingID[listingID])) - copy(newStates, s.statesByListingID[listingID]) - return newStates + s.storeLock.Unlock() } -func (s *states) readStatesFrom(store *statestore.Store) error { - var states []state +func (s *states) loadFromRegistry() error { + states := map[string]state{} - err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + s.storeLock.Lock() + err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { return true, nil } @@ -294,78 +80,30 @@ func (s *states) readStatesFrom(store *statestore.Store) error { // try to decode. Ignore faulty/incompatible values. var st state if err := dec.Decode(&st); err != nil { - // XXX: Do we want to log here? In case we start to store other - // state types in the registry, then this operation will likely fail - // quite often, producing some false-positives in the logs... - return false, err + // Skip this key but continue iteration + s.log.Warnf("invalid S3 state loading object key %v", key) + //nolint:nilerr // One bad object shouldn't stop iteration + return true, nil + } + if !st.Stored && !st.Failed { + // This is from an older version where state could be stored in the + // registry even if the object wasn't processed, or if it encountered + // ephemeral download errors. We don't add these to the in-memory cache, + // so if we see them during a bucket scan we will still retry them. + return true, nil } - st.ID = key[len(awsS3ObjectStatePrefix):] - states = append(states, st) + states[st.ID()] = st return true, nil }) + s.storeLock.Unlock() if err != nil { return err } - states = fixStates(states) - - for _, state := range states { - s.Update(state, "") - } - - return nil -} - -// fixStates cleans up the registry states when updating from an older version -// of filebeat potentially writing invalid entries. -func fixStates(states []state) []state { - if len(states) == 0 { - return states - } - - // we use a map of states here, so to identify and merge duplicate entries. - idx := map[string]*state{} - for i := range states { - state := &states[i] - - old, exists := idx[state.ID] - if !exists { - idx[state.ID] = state - } else { - mergeStates(old, state) // overwrite the entry in 'old' - } - } - - if len(idx) == len(states) { - return states - } - - i := 0 - newStates := make([]state, len(idx)) - for _, state := range idx { - newStates[i] = *state - i++ - } - return newStates -} - -// mergeStates merges 2 states by trying to determine the 'newer' state. -// The st state is overwritten with the updated fields. -func mergeStates(st, other *state) { - // update file meta-data. As these are updated concurrently by the - // inputs, select the newer state based on the update timestamp. 
- if st.LastModified.Before(other.LastModified) { - st.LastModified = other.LastModified - } -} + s.statesLock.Lock() + s.states = states + s.statesLock.Unlock() -func (s *states) writeStates(store *statestore.Store) error { - for _, state := range s.GetStates() { - key := awsS3ObjectStatePrefix + state.ID - if err := store.Set(key, state); err != nil { - return err - } - } return nil } diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 39dc4cf82e6..2f8bbf58fdf 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -14,6 +14,7 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore/storetest" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/elastic-agent-libs/logp" @@ -46,287 +47,92 @@ var inputCtx = v2.Context{ Cancelation: context.Background(), } -func TestStatesIsNewAndMustSkip(t *testing.T) { +func TestStatesAddStateAndIsProcessed(t *testing.T) { type stateTestCase struct { - states func() *states - state state - mustBeNew bool - persistentStoreKV map[string]interface{} - expectedMustSkip bool - expectedIsNew bool + // An initialization callback to invoke on the (initially empty) states. + statesEdit func(states *states) + + // The state to call IsProcessed on and the expected result + state state + expectedIsProcessed bool + + // If true, the test will run statesEdit, then create a new states + // object from the same persistent store before calling IsProcessed + // (to test persistence between restarts). + shouldReload bool } lastModified := time.Date(2022, time.June, 30, 14, 13, 00, 0, time.UTC) + testState1 := newState("bucket", "key", "etag", lastModified) + testState2 := newState("bucket1", "key1", "etag1", lastModified) tests := map[string]stateTestCase{ "with empty states": { - states: func() *states { - return newStates(inputCtx) - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states + statesEdit: func(states *states) { + states.AddState(testState2) }, - state: newState("bucket1", "key1", "etag1", "listPrefix1", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: false, - }, - "with different etag": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag1", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag2", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, - }, - "with different lastmodified": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(1*time.Second)), - expectedMustSkip: false, - 
expectedIsNew: true, - }, - "with stored state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Stored = true - states.Update(aState, "") - return states + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: true, }, - "with error state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Error = true - states.Update(aState, "") - return states + "existing stored state is persisted": { + statesEdit: func(states *states) { + state := testState1 + state.Stored = true + states.AddState(state) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "before commit write": { - states: func() *states { - return newStates(inputCtx) + "existing failed state is persisted": { + statesEdit: func(states *states) { + state := testState1 + state.Failed = true + states.AddState(state) }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(-1*time.Second)), - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "same commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, + "existing unprocessed state is not persisted": { + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: true, - }, - "after commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(time.Second)), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: false, }, } for name, test := range tests { test := test t.Run(name, func(t *testing.T) { - states := test.states() store := openTestStatestore() persistentStore, err := store.Access() if err != nil { t.Fatalf("unexpected err: %v", err) } - for key, value := range test.persistentStoreKV { - _ = persistentStore.Set(key, value) + states, err := newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") + if test.statesEdit != nil { + test.statesEdit(states) } - - if test.mustBeNew { - test.state.LastModified = test.state.LastModified.Add(1 * time.Second) + if test.shouldReload { + states, err = newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") } - isNew := states.IsNew(test.state) - assert.Equal(t, test.expectedIsNew, isNew) - - mustSkip := states.MustSkip(test.state, persistentStore) - assert.Equal(t, test.expectedMustSkip, mustSkip) - }) - } -} - -func TestStatesDelete(t *testing.T) 
{ - type stateTestCase struct { - states func() *states - deleteID string - expected []state - } - - lastModified := time.Date(2021, time.July, 22, 18, 38, 00, 0, time.UTC) - tests := map[string]stateTestCase{ - "delete empty states": { - states: func() *states { - return newStates(inputCtx) - }, - deleteID: "an id", - expected: []state{}, - }, - "delete not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: "an id", - expected: []state{ - { - ID: stateID("bucket", "key", "etag", lastModified), - Bucket: "bucket", - Key: "key", - Etag: "etag", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete only one existing": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: stateID("bucket", "key", "etag", lastModified), - expected: []state{}, - }, - "delete first": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey1etag1" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete last": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey3etag3" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete any": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey2etag2" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - } - - for name, test := range tests { - test := test - t.Run(name, func(t *testing.T) { - states := test.states() - 
states.Delete(test.deleteID) - assert.Equal(t, test.expected, states.GetStates()) + isProcessed := states.IsProcessed(test.state) + assert.Equal(t, test.expectedIsProcessed, isProcessed) }) } } From c2c5fea524317e5724fc9114d3766ca42cf0fccd Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 29 Apr 2024 16:52:51 +0300 Subject: [PATCH 04/30] Fix cron description for Iron Bank validation (#39260) This commit fixes the schedule description for the Iron Bank validation and removes the old static schedule, now that we have a centralized scheduling job (#39254). Additionally, now that the job has been tested ([^1]) it enables slack alerts as well. [^1]: https://github.com/elastic/beats/pull/39255#issuecomment-2082368821 --- catalog-info.yaml | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index ae37200762b..bc22fbc905d 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1087,25 +1087,6 @@ spec: skip_intermediate_builds: false provider_settings: trigger_mode: none - # TODO uncomment out after https://github.com/elastic/ingest-dev/issues/3235 - # schedules: - # # TODO to be replaced with a generic scheduler similar to https://github.com/elastic/logstash/pull/15705 - # Daily run of ironbank validation / main: - # branch: main - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on main - # Daily run of ironbank validation / 8.14: - # branch: 8.14 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 8.14 - # Daily run of ironbank validation / 8.13: - # branch: 8.13 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 8.13 - # Daily run of ironbank validation / 7.17: - # branch: 7.17 - # cronline: 30 02 * * * - # message: Daily trigger of IronBank validation on 7.17 teams: ingest-fp: access_level: MANAGE_BUILD_AND_READ @@ -1139,7 +1120,7 @@ spec: pipeline_file: ".buildkite/pipeline-scheduler.yml" maximum_timeout_in_minutes: 240 schedules: - Daily Snapshot DRA: + Daily run of Iron Bank validation: branch: main cronline: 30 02 * * * message: Daily trigger of Iron Bank validation Pipeline per branch @@ -1149,8 +1130,7 @@ spec: provider_settings: trigger_mode: none env: - # TODO enable slack notifications when it's tested - ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'false' + ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' teams: From 37816dd7150029c8cf99ed9ed962db25d8c3e519 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Mon, 29 Apr 2024 17:00:58 +0300 Subject: [PATCH 05/30] Update the Beats packaging pipeline settings (#39263) This commits changes the settings of the packaging pipeline to make it execute only on the selected branches which currently is only main. 
Signed-off-by: Alexandros Sapranidis --- catalog-info.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/catalog-info.yaml b/catalog-info.yaml index bc22fbc905d..0e5a5d864d3 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1046,6 +1046,10 @@ spec: cancel_intermediate_builds: false skip_intermediate_builds: false provider_settings: + build_branches: false + build_pull_request_forks: false + build_pull_requests: false + build_tags: false trigger_mode: code env: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' From 11998672ace0b4e652bfe2edb437bc851880ec7f Mon Sep 17 00:00:00 2001 From: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:46:01 -0400 Subject: [PATCH 06/30] Mark add_docker-metadata process as unsupported in packetbeat (#39241) * Mark add_docker-metadata process as unsupported in packetbeat * Update libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc --- .../add_docker_metadata/docs/add_docker_metadata.asciidoc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc index 53292667f13..61658210173 100644 --- a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc +++ b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc @@ -5,6 +5,11 @@ add_docker_metadata ++++ +ifeval::["{beatname_lc}"=="packetbeat"] +There is currently extremely limited capability for using {beatname_lc} to monitor and coexist with containers, for example Docker, Podman, or Kubernetes. Using the `add_docker_metadata` processor with {beatname_lc} is not recommended nor supported. +endif::[] + +ifeval::["{beatname_lc}"!="packetbeat"] The `add_docker_metadata` processor annotates each event with relevant metadata from Docker containers. At startup it detects a docker environment and caches the metadata. The events are annotated with Docker metadata, only if a valid configuration @@ -88,3 +93,4 @@ forget metadata for a container, 60s by default. `labels.dedot`:: (Optional) Default to be false. If set to true, replace dots in labels with `_`. +endif::[] \ No newline at end of file From 59421bb12602eab337cee0fe6e689262cba89763 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 29 Apr 2024 20:24:21 +0200 Subject: [PATCH 07/30] Document havester_limit for Filestream input and fix typo (#39244) This commit documents `harvester_limit` for the filestream input and replaces `close_*` by the correct key `close.on_state_change.*`. --- .../input-filestream-file-options.asciidoc | 24 +++++++++++++++++++ .../docs/inputs/input-filestream.asciidoc | 5 ++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index 47a8c819d9e..5436d3863dc 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -517,6 +517,30 @@ less than or equal to `prospector.scanner.check_interval` If `backoff.max` needs to be higher, it is recommended to close the file handler instead and let {beatname_uc} pick up the file again. +[float] +[id="{beatname_lc}-input-{type}-harvester-limit"] +===== `harvester_limit` + +The `harvester_limit` option limits the number of harvesters that are started in +parallel for one input. 
This directly relates to the maximum number of file +handlers that are opened. The default for `harvester_limit` is 0, which means +there is no limit. This configuration is useful if the number of files to be +harvested exceeds the open file handler limit of the operating system. + +Setting a limit on the number of harvesters means that potentially not all files +are opened in parallel. Therefore we recommended that you use this option in +combination with the `close.on_state_change.*` options to make sure +harvesters are stopped more often so that new files can be picked up. + +Currently if a new harvester can be started again, the harvester is picked +randomly. This means it's possible that the harvester for a file that was just +closed and then updated again might be started instead of the harvester for a +file that hasn't been harvested for a longer period of time. + +This configuration option applies per input. You can use this option to +indirectly set higher priorities on certain inputs by assigning a higher +limit of harvesters. + [float] ===== `file_identity` diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index 47d1b24a8e8..54283d6cce7 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -11,8 +11,9 @@ Use the `filestream` input to read lines from active log files. It is the new, improved alternative to the `log` input. It comes with various improvements to the existing input: -1. Checking of `close_*` options happens out of band. Thus, if an output is blocked, -{beatname_uc} can close the reader and avoid keeping too many files open. +1. Checking of `close.on_state_change.*` options happens out of +band. Thus, if an output is blocked, {beatname_uc} can close the +reader and avoid keeping too many files open. 2. Detailed metrics are available for all files that match the `paths` configuration regardless of the `harvester_limit`. 
This way, you can keep track of all files, From 54fb91ed4ee4a697ed3c5cd6a08c5d6671b6f73e Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:20:14 -0400 Subject: [PATCH 08/30] chore: Update snapshot.yml (#39268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index a031c2184e5..b531cf78a51 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-81021969-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-81021969-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-81021969-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 81fc73e634c8f8b49d00c0cc3afc378039a1438e Mon Sep 17 00:00:00 2001 From: Maurizio Branca Date: Mon, 29 Apr 2024 22:48:15 +0200 Subject: [PATCH 09/30] Fix Azure Monitor support for multiple aggregation types (#39204) * Add aggregation type to the MetricRegistry key The MetricRegistry wasn't using the aggregation type in the cache key, returning the wrong answer to the 'needs update?' question. * Handle multiple aggregation types Restores support for multiple aggregation types for the same metric name. Adding tests for the known use cases so we don't miss this feature again in future updates. --- CHANGELOG.next.asciidoc | 1 + x-pack/metricbeat/module/azure/azure_test.go | 39 +++++ x-pack/metricbeat/module/azure/client_test.go | 156 ++++++++++++++++++ x-pack/metricbeat/module/azure/data.go | 103 ++++++++---- x-pack/metricbeat/module/azure/data_test.go | 107 +++++++++++- .../module/azure/metric_registry.go | 9 +- .../module/azure/metric_registry_test.go | 138 +++++++++++++++- .../metricbeat/module/azure/mock_service.go | 2 +- .../module/azure/service_interface.go | 13 +- 9 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 x-pack/metricbeat/module/azure/azure_test.go diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index c10e5eb08fa..f57b7100077 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -159,6 +159,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix fields not being parsed correctly in postgresql/database {issue}25301[25301] {pull}37720[37720] - rabbitmq/queue - Change the mapping type of `rabbitmq.queue.consumers.utilisation.pct` to `scaled_float` from `long` because the values fall within the range of `[0.0, 1.0]`. 
Previously, conversion to integer resulted in reporting either `0` or `1`. - Fix timeout caused by the retrival of which indices are hidden {pull}39165[39165] +- Fix Azure Monitor support for multiple aggregation types {issue}39192[39192] {pull}39204[39204] *Osquerybeat* diff --git a/x-pack/metricbeat/module/azure/azure_test.go b/x-pack/metricbeat/module/azure/azure_test.go new file mode 100644 index 00000000000..c3d67525ddb --- /dev/null +++ b/x-pack/metricbeat/module/azure/azure_test.go @@ -0,0 +1,39 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package azure + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGroupMetricsDefinitionsByResourceId(t *testing.T) { + + t.Run("Group metrics definitions by resource ID", func(t *testing.T) { + metrics := []Metric{ + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-1"}, + }, + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-2"}, + }, + { + ResourceId: "resource-1", + Namespace: "namespace-1", + Names: []string{"metric-3"}, + }, + } + + metricsByResourceId := groupMetricsDefinitionsByResourceId(metrics) + + assert.Equal(t, 1, len(metricsByResourceId)) + assert.Equal(t, 3, len(metricsByResourceId["resource-1"])) + }) +} diff --git a/x-pack/metricbeat/module/azure/client_test.go b/x-pack/metricbeat/module/azure/client_test.go index 79b1742ded0..c23326ac82b 100644 --- a/x-pack/metricbeat/module/azure/client_test.go +++ b/x-pack/metricbeat/module/azure/client_test.go @@ -9,10 +9,12 @@ import ( "testing" "time" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" ) var ( @@ -35,6 +37,7 @@ var ( }, }}}, } + countUnit = armmonitor.MetricUnit("Count") ) func mockMapResourceMetrics(client *Client, resources []*armresources.GenericResourceExpanded, resourceConfig ResourceConfig) ([]Metric, error) { @@ -112,4 +115,157 @@ func TestGetMetricValues(t *testing.T) { assert.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 0) m.AssertExpectations(t) }) + + t.Run("multiple aggregation types", func(t *testing.T) { + client := NewMockClient() + referenceTime := time.Now().UTC() + client.ResourceConfigurations = ResourceConfiguration{ + Metrics: []Metric{ + { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Maximum,Minimum,Average", + TimeGrain: "PT1M", + }, + }, + } + + m := &MockService{} + m.On( + "GetMetricValues", + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + ).Return( + []armmonitor.Metric{{ + ID: to.Ptr("test"), + Name: &armmonitor.LocalizableString{ + Value: to.Ptr("ActiveConnections"), + LocalizedValue: to.Ptr("ActiveConnections"), + }, + Timeseries: []*armmonitor.TimeSeriesElement{{ + Data: []*armmonitor.MetricValue{{ + Average: to.Ptr(1.0), + Maximum: to.Ptr(2.0), + Minimum: to.Ptr(3.0), + TimeStamp: to.Ptr(time.Now()), + }}, + }}, + Type: to.Ptr("Microsoft.Insights/metrics"), + Unit: &countUnit, + DisplayDescription: to.Ptr("Total Active Connections for 
Microsoft.EventHub."), + ErrorCode: to.Ptr("Success"), + }}, + "PT1M", + nil, + ) + + client.AzureMonitorService = m + mr := MockReporterV2{} + + metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr) + + require.Equal(t, len(metricValues), 1) + require.Equal(t, len(metricValues[0].Values), 1) + + assert.Equal(t, *metricValues[0].Values[0].avg, 1.0) + assert.Equal(t, *metricValues[0].Values[0].max, 2.0) + assert.Equal(t, *metricValues[0].Values[0].min, 3.0) + + require.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 1) + + m.AssertExpectations(t) + }) + + t.Run("single aggregation types", func(t *testing.T) { + client := NewMockClient() + referenceTime := time.Now().UTC() + timestamp := time.Now().UTC() + client.ResourceConfigurations = ResourceConfiguration{ + Metrics: []Metric{ + { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Maximum", + TimeGrain: "PT1M", + }, { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Minimum", + TimeGrain: "PT1M", + }, { + Namespace: "Microsoft.EventHub/Namespaces", + Names: []string{"ActiveConnections"}, + Aggregations: "Average", + TimeGrain: "PT1M", + }, + }, + } + + m := &MockService{} + + x := []struct { + aggregation string + data []*armmonitor.MetricValue + }{ + {aggregation: "Maximum", data: []*armmonitor.MetricValue{{Maximum: to.Ptr(3.0), TimeStamp: to.Ptr(timestamp)}}}, + {aggregation: "Minimum", data: []*armmonitor.MetricValue{{Minimum: to.Ptr(1.0), TimeStamp: to.Ptr(timestamp)}}}, + {aggregation: "Average", data: []*armmonitor.MetricValue{{Average: to.Ptr(2.0), TimeStamp: to.Ptr(timestamp)}}}, + } + + for _, v := range x { + m.On( + "GetMetricValues", + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + mock.Anything, + v.aggregation, + mock.Anything, + ).Return( + []armmonitor.Metric{{ + ID: to.Ptr("test"), + Name: &armmonitor.LocalizableString{ + Value: to.Ptr("ActiveConnections"), + LocalizedValue: to.Ptr("ActiveConnections"), + }, + Timeseries: []*armmonitor.TimeSeriesElement{{ + Data: v.data, + }}, + Type: to.Ptr("Microsoft.Insights/metrics"), + Unit: &countUnit, + DisplayDescription: to.Ptr("Total Active Connections for Microsoft.EventHub."), + ErrorCode: to.Ptr("Success"), + }}, + "PT1M", + nil, + ).Once() + } + + client.AzureMonitorService = m + mr := MockReporterV2{} + + metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr) + + require.Equal(t, 3, len(metricValues)) + + require.Equal(t, 1, len(metricValues[0].Values)) + require.Equal(t, 1, len(metricValues[1].Values)) + require.Equal(t, 1, len(metricValues[2].Values)) + + require.NotNil(t, metricValues[0].Values[0].max, "max value is nil") + require.NotNil(t, metricValues[1].Values[0].min, "min value is nil") + require.NotNil(t, metricValues[2].Values[0].avg, "avg value is nil") + + assert.Equal(t, *metricValues[0].Values[0].max, 3.0) + assert.Equal(t, *metricValues[1].Values[0].min, 1.0) + assert.Equal(t, *metricValues[2].Values[0].avg, 2.0) + + m.AssertExpectations(t) + }) } diff --git a/x-pack/metricbeat/module/azure/data.go b/x-pack/metricbeat/module/azure/data.go index c46aee9da24..b2fffb40426 100644 --- a/x-pack/metricbeat/module/azure/data.go +++ b/x-pack/metricbeat/module/azure/data.go @@ -133,41 +133,8 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint { var points []KeyValuePoint for _, metric := range metrics { for _, value := range 
metric.Values { - point := KeyValuePoint{ - Timestamp: value.timestamp, - Dimensions: mapstr.M{}, - } - metricName := managePropertyName(value.name) - switch { - case value.min != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "min") - point.Value = value.min - case value.max != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "max") - point.Value = value.max - case value.avg != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "avg") - point.Value = value.avg - case value.total != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "total") - point.Value = value.total - case value.count != nil: - point.Key = fmt.Sprintf("%s.%s", metricName, "count") - point.Value = value.count - } - - point.Namespace = metric.Namespace - point.ResourceId = metric.ResourceId - point.ResourceSubId = metric.ResourceSubId - point.TimeGrain = metric.TimeGrain - - // The number of dimensions in the metric definition and the - // number of dimensions in the metric values should be the same. - // - // But, since definitions and values are retrieved from different - // API endpoints, we need to make sure that we don't panic if the - // number of dimensions is different. + dimensions := mapstr.M{} if len(metric.Dimensions) == len(value.dimensions) { // Take the dimension name from the metric definition and the // dimension value from the metric value. @@ -180,11 +147,75 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint { // Dimensions from metric definition and metric value are // not guaranteed to be in the same order, so we need to // find by name the right value for each dimension. - _, _ = point.Dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) + // _, _ = point.Dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) + _, _ = dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions)) } } - points = append(points, point) + if value.min != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: value.min, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.max != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "max"), + Value: value.max, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.avg != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "avg"), + Value: value.avg, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.total != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "total"), + Value: value.total, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + Timestamp: value.timestamp, + }) + } + + if value.count != nil { + points = append(points, KeyValuePoint{ + Key: fmt.Sprintf("%s.%s", metricName, "count"), + Value: value.count, + Namespace: metric.Namespace, + ResourceId: metric.ResourceId, + ResourceSubId: metric.ResourceSubId, + TimeGrain: metric.TimeGrain, + Dimensions: dimensions, + 
Timestamp: value.timestamp, + }) + } } } diff --git a/x-pack/metricbeat/module/azure/data_test.go b/x-pack/metricbeat/module/azure/data_test.go index 85b781ed64e..1519f78982d 100644 --- a/x-pack/metricbeat/module/azure/data_test.go +++ b/x-pack/metricbeat/module/azure/data_test.go @@ -62,7 +62,37 @@ func TestMapToKeyValuePoints(t *testing.T) { resourceSubId := "test" timeGrain := "PT1M" - t.Run("test aggregation types", func(t *testing.T) { + t.Run("test single aggregation type (single config)", func(t *testing.T) { + + metrics := []Metric{{ + Namespace: namespace, + Names: []string{"test"}, + Aggregations: "min", + Values: []MetricValue{{name: metricName, min: &minValue, timestamp: timestamp}}, + TimeGrain: timeGrain, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + }} + + actual := mapToKeyValuePoints(metrics) + + expected := []KeyValuePoint{ + { + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: &minValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + } + + assert.Equal(t, expected, actual) + }) + + t.Run("test single aggregation types (multiple configs)", func(t *testing.T) { metrics := []Metric{{ Namespace: namespace, @@ -161,4 +191,79 @@ func TestMapToKeyValuePoints(t *testing.T) { assert.Equal(t, expected, actual) }) + + t.Run("test multiple aggregation types (multiple configs)", func(t *testing.T) { + metrics := []Metric{{ + Namespace: namespace, + Names: []string{"test"}, + Aggregations: "Minimum,Maximum,Average,Total,Count", + Values: []MetricValue{ + {name: metricName, min: &minValue, timestamp: timestamp}, + {name: metricName, max: &maxValue, timestamp: timestamp}, + {name: metricName, avg: &avgValue, timestamp: timestamp}, + {name: metricName, total: &totalValue, timestamp: timestamp}, + {name: metricName, count: &countValue, timestamp: timestamp}, + }, + TimeGrain: timeGrain, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + }} + + actual := mapToKeyValuePoints(metrics) + + expected := []KeyValuePoint{ + { + Key: fmt.Sprintf("%s.%s", metricName, "min"), + Value: &minValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "max"), + Value: &maxValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "avg"), + Value: &avgValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "total"), + Value: &totalValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + { + Key: fmt.Sprintf("%s.%s", metricName, "count"), + Value: &countValue, + Namespace: namespace, + TimeGrain: timeGrain, + Timestamp: timestamp, + ResourceId: resourceId, + ResourceSubId: resourceSubId, + Dimensions: map[string]interface{}{}, + }, + } + + assert.Equal(t, expected, actual) + }) } diff --git a/x-pack/metricbeat/module/azure/metric_registry.go b/x-pack/metricbeat/module/azure/metric_registry.go index cdaa9496b5d..c127701c996 100644 
--- a/x-pack/metricbeat/module/azure/metric_registry.go +++ b/x-pack/metricbeat/module/azure/metric_registry.go @@ -5,6 +5,7 @@ package azure import ( + "fmt" "strings" "time" @@ -118,8 +119,14 @@ func (m *MetricRegistry) buildMetricKey(metric Metric) string { keyComponents := []string{ metric.Namespace, metric.ResourceId, + metric.Aggregations, + metric.TimeGrain, + strings.Join(metric.Names, ","), + } + + for _, dim := range metric.Dimensions { + keyComponents = append(keyComponents, fmt.Sprintf("%s=%s", dim.Name, dim.Value)) } - keyComponents = append(keyComponents, metric.Names...) return strings.Join(keyComponents, ",") } diff --git a/x-pack/metricbeat/module/azure/metric_registry_test.go b/x-pack/metricbeat/module/azure/metric_registry_test.go index a0ecdc84b85..63984aa6b59 100644 --- a/x-pack/metricbeat/module/azure/metric_registry_test.go +++ b/x-pack/metricbeat/module/azure/metric_registry_test.go @@ -13,7 +13,7 @@ import ( "github.com/elastic/elastic-agent-libs/logp" ) -func TestNewMetricRegistry(t *testing.T) { +func TestMetricRegistry(t *testing.T) { logger := logp.NewLogger("test azure monitor") t.Run("Collect metrics with a regular 5 minutes period", func(t *testing.T) { @@ -90,4 +90,140 @@ func TestNewMetricRegistry(t *testing.T) { assert.True(t, needsUpdate, "metric should not need update") }) + + t.Run("Metrics with different aggregation types", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "test", + Namespace: "test", + Aggregations: "Maximum", + } + metric2 := Metric{ + ResourceId: "test", + Namespace: "test", + Aggregations: "Minimum", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT5M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT5M", + timestamp: referenceTime, + }) + + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) + + t.Run("Metrics with different dimensions", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT1M", + } + metric2 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-2", Value: "*"}, + }, + TimeGrain: "PT1M", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: referenceTime, + }) 
+ + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) + + t.Run("Metrics with different timegrain", func(t *testing.T) { + metricRegistry := NewMetricRegistry(logger) + + referenceTime := time.Now().UTC() + lastCollectionAt := referenceTime.Add(-time.Minute * 10) + + metric1 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT1M", + } + metric2 := Metric{ + ResourceId: "resource-id-1", + Namespace: "namespace-1", + Names: []string{"metric-name-1"}, + Dimensions: []Dimension{ + {Name: "dimension-1", Value: "*"}, + }, + TimeGrain: "PT5M", + } + + metricCollectionInfo := MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: lastCollectionAt, + } + + // Update metrics collection info for previous collection + metricRegistry.Update(metric1, metricCollectionInfo) + metricRegistry.Update(metric2, metricCollectionInfo) + + // Update metric info for metric1 + metricRegistry.Update(metric1, MetricCollectionInfo{ + timeGrain: "PT1M", + timestamp: referenceTime, + }) + + // Check if metrics need update + metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1) + metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2) + + assert.False(t, metric1NeedsUpdate, "metric should not need update") + assert.True(t, metric2NeedsUpdate, "metric should need update") + }) } diff --git a/x-pack/metricbeat/module/azure/mock_service.go b/x-pack/metricbeat/module/azure/mock_service.go index 9626952fa6d..293adc7c9a7 100644 --- a/x-pack/metricbeat/module/azure/mock_service.go +++ b/x-pack/metricbeat/module/azure/mock_service.go @@ -43,7 +43,7 @@ func (client *MockService) GetMetricNamespaces(resourceId string) (armmonitor.Me // GetMetricValues is a mock function for the azure service func (client *MockService) GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error) { - args := client.Called(resourceId, namespace) + args := client.Called(resourceId, namespace, timegrain, timespan, metricNames, aggregations, filter) return args.Get(0).([]armmonitor.Metric), args.String(1), args.Error(2) } diff --git a/x-pack/metricbeat/module/azure/service_interface.go b/x-pack/metricbeat/module/azure/service_interface.go index cb524c7f6ea..75ae48d3d6e 100644 --- a/x-pack/metricbeat/module/azure/service_interface.go +++ b/x-pack/metricbeat/module/azure/service_interface.go @@ -15,5 +15,16 @@ type Service interface { GetResourceDefinitions(id []string, group []string, rType string, query string) ([]*armresources.GenericResourceExpanded, error) GetMetricDefinitionsWithRetry(resourceId string, namespace string) (armmonitor.MetricDefinitionCollection, error) GetMetricNamespaces(resourceId string) (armmonitor.MetricNamespaceCollection, error) - GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error) + // GetMetricValues returns the metric values for the given resource ID, namespace, timegrain, timespan, metricNames, aggregations and filter. 
+ // + // If the timegrain is empty, the default timegrain for the metric is used and returned. + GetMetricValues( + resourceId string, // resourceId is the ID of the resource to query (e.g. "/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/{resourceProviderNamespace}/{resourceType}/{resourceName}") + namespace string, // namespace is the metric namespace to query (e.g. "Microsoft.Compute/virtualMachines") + timegrain string, // timegrain is the timegrain to use for the metric query (e.g. "PT1M"); if empty, returns the default timegrain for the metric. + timespan string, // timespan is the time interval to query (e.g. 2024-04-29T14:03:00Z/2024-04-29T14:04:00Z) + metricNames []string, // metricNames is the list of metric names to query (e.g. ["ServiceApiLatency", "Availability"]) + aggregations string, // aggregations is the comma-separated list of aggregations to use for the metric query (e.g. "Average,Maximum,Minimum") + filter string, // filter is the filter to query for dimensions (e.g. "ActivityType eq '*' AND ActivityName eq '*' AND StatusCode eq '*' AND StatusCodeClass eq '*'") + ) ([]armmonitor.Metric, string, error) } From 6bb2a82b684922419561fd3a935ddd0573fcb762 Mon Sep 17 00:00:00 2001 From: Yi Song <166383463+goodfirm@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:23:31 +0800 Subject: [PATCH 10/30] chore: fix function names in comment (#38800) Signed-off-by: goodfirm Co-authored-by: Pierre HILBERT --- dev-tools/mage/kubernetes/kuberemote.go | 2 +- filebeat/input/filestream/environment_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-tools/mage/kubernetes/kuberemote.go b/dev-tools/mage/kubernetes/kuberemote.go index 8e9d9897d44..e3062f00d1a 100644 --- a/dev-tools/mage/kubernetes/kuberemote.go +++ b/dev-tools/mage/kubernetes/kuberemote.go @@ -250,7 +250,7 @@ func (r *KubeRemote) waitForPod(wait time.Duration, condition watchtools.Conditi return nil, err } -// portFoward runs the port forwarding so SSH rsync can be ran into the pod. +// portForward runs the port forwarding so SSH rsync can be ran into the pod. func (r *KubeRemote) portForward(ports []string, stopChannel, readyChannel chan struct{}, stdout, stderr io.Writer) (*portforward.PortForwarder, error) { roundTripper, upgrader, err := spdy.RoundTripperFor(r.cfg) if err != nil { diff --git a/filebeat/input/filestream/environment_test.go b/filebeat/input/filestream/environment_test.go index 7c3c8ccd4d3..88163258938 100644 --- a/filebeat/input/filestream/environment_test.go +++ b/filebeat/input/filestream/environment_test.go @@ -448,7 +448,7 @@ func (e *inputTestingEnvironment) waitUntilHarvesterIsDone() { } } -// requireEventReceived requires that the list of messages has made it into the output. +// requireEventsReceived requires that the list of messages has made it into the output. 
func (e *inputTestingEnvironment) requireEventsReceived(events []string) { foundEvents := make([]bool, len(events)) checkedEventCount := 0 From cfffc1ce552565fa1ae7c8d13abd04fca3d4fa7e Mon Sep 17 00:00:00 2001 From: Pavel Zorin Date: Tue, 30 Apr 2024 10:27:44 +0200 Subject: [PATCH 11/30] DRA: Disable summary reports for dry runs (#39240) --- .buildkite/scripts/dra.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/dra.sh b/.buildkite/scripts/dra.sh index ec9d523bf3c..4b6a94ffa2c 100755 --- a/.buildkite/scripts/dra.sh +++ b/.buildkite/scripts/dra.sh @@ -70,11 +70,13 @@ docker run --rm \ --artifact-set "main" \ ${DRY_RUN} | tee rm-output.txt -# extract the summary URL from a release manager output line like: -# Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html -SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}') -rm rm-output.txt +if [[ "$DRY_RUN" != "--dry-run" ]]; then + # extract the summary URL from a release manager output line like: + # Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html + SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}') + rm rm-output.txt -# and make it easily clickable as a Builkite annotation -printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success + # and make it easily clickable as a Builkite annotation + printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success +fi From d275f2768c0be2b8c2d53c9649a3f263d8b18d64 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 11:36:42 +0300 Subject: [PATCH 12/30] Ensure ordered DRA artifacts (#39270) As things are now we allow parallel builds on the packaging pipeline, which could result in out of order artifacts (depending on which one takes longer to finish). This commit implements two "queues" (snapshot/staging) to ensure ordered builds of DRA artifacts. 
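For illustration, the gate pattern used here amounts to a pair of lightweight steps that share a `concurrency_group` with `concurrency: 1`, with the packaging work fenced between them. A rough sketch (labels and keys are illustrative, not the exact steps added by this commit):

```yaml
steps:
  # Entering the gate: only one build per concurrency group may proceed at a time.
  - label: "Start of concurrency gate dra-snapshot"
    command: echo "--> start of gate"
    concurrency_group: "dra-gate-snapshot"
    concurrency: 1
    key: start-gate-snapshot

  - wait

  # ... packaging and DRA publish steps run here, one build at a time per group ...

  # Leaving the gate releases the slot to the next queued build.
  - label: "End of concurrency gate dra-snapshot"
    command: echo "end of gate <--"
    concurrency_group: "dra-gate-snapshot"
    concurrency: 1
```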
Relates https://github.com/elastic/ingest-dev/issues/3095 --- .buildkite/packaging.pipeline.yml | 41 ++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index a7fdabb2268..1dd2aaf60b0 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -12,11 +12,32 @@ env: PLATFORMS_ARM: "linux/arm64" steps: + # we use concurrency gates (https://buildkite.com/blog/concurrency-gates) + # to implement two FIFO queues for DRA-snapshot and DRA-staging + # this prevents parallel builds and possibility of publishing out of order DRA artifacts if the first job takes longer than the second + + - name: Start of concurrency group for DRA Snapshot + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + command: echo "--> Start of concurrency gate dra-snapshot" + concurrency_group: "dra-gate-snapshot" + concurrency: 1 + key: start-gate-snapshot + + - name: Start of concurrency group for DRA Staging + if: build.branch =~ /^\d+\.\d+$$/ + command: echo "--> Start of concurrency gate dra-staging" + concurrency_group: "dra-gate-staging" + concurrency: 1 + key: start-gate-staging + + - wait + - group: Beats dashboards key: dashboards steps: - label: Snapshot dashboards if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + depends_on: start-gate-snapshot key: dashboards-snapshot # TODO: container with go and make agents: @@ -34,6 +55,7 @@ steps: - label: Staging dashboards if: build.branch =~ /^\d+\.\d+$$/ + depends_on: start-gate-staging key: dashboards-staging # TODO: container with go and make agents: @@ -52,6 +74,7 @@ steps: - group: Packaging snapshot if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" key: packaging-snapshot + depends_on: start-gate-snapshot steps: - label: "SNAPSHOT: {{matrix}}" env: @@ -123,8 +146,8 @@ steps: - build/distributions/**/* - group: Packaging Staging - key: packaging-staging + depends_on: start-gate-staging ## Only for release if: build.branch =~ /^\d+\.\d+$$/ steps: @@ -207,6 +230,7 @@ steps: env: DRA_WORKFLOW: snapshot depends_on: + - start-gate-snapshot - packaging-snapshot - dashboards-snapshot command: | @@ -225,6 +249,7 @@ steps: env: DRA_WORKFLOW: staging depends_on: + - start-gate-staging - packaging-staging - dashboards-staging command: | @@ -235,3 +260,17 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + + - wait + + - command: echo "End of concurrency gate dra-snapshot <--" + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + concurrency_group: "dra-gate-snapshot" + concurrency: 1 + key: end-gate-snapshot + + - command: echo "End of concurrency gate dra-staging <--" + if: build.branch =~ /^\d+\.\d+$$/ + concurrency_group: "dra-gate-staging" + concurrency: 1 + key: end-gate-staging From b7e3fa27ce564e6d0b8ae9b12c17fcb860ef4cc8 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 12:11:23 +0300 Subject: [PATCH 13/30] Fix auto triggered packaging builds (#39291) PR#39263 introduced a bug causing on packaging DRA builds to be triggered. 
This commit fixes the issue and also allowed triggered builds for `8.14` --- catalog-info.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index 0e5a5d864d3..59a89559a79 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1040,16 +1040,19 @@ spec: spec: repository: elastic/beats pipeline_file: ".buildkite/packaging.pipeline.yml" - branch_configuration: "main" + branch_configuration: "main 8.14" # TODO enable after packaging backports for release branches # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false provider_settings: - build_branches: false + build_branches: true build_pull_request_forks: false build_pull_requests: false build_tags: false + filter_condition: >- + build.branch =~ /^[0-9]+\.[0-9]+$$/ || build.branch == "main" + filter_enabled: true trigger_mode: code env: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' From d3eaed50b3156664a701f36eda74f2f5e1b3f3a4 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 12:19:13 +0300 Subject: [PATCH 14/30] Add timeout for DRA builds (#39293) --- catalog-info.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/catalog-info.yaml b/catalog-info.yaml index 59a89559a79..420d9c1c16a 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1045,6 +1045,7 @@ spec: # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false + maximum_timeout_in_minutes: 60 provider_settings: build_branches: true build_pull_request_forks: false From 85c9d146ebc454fc18819aed430334ce1b78f7ce Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 14:41:35 +0300 Subject: [PATCH 15/30] Fix missing docker staging DRA artifacts (#39297) The DRA staging release is failing because the Buildkite step isn't capturing the right artifacts. This commit fixes the issue by adjusting the artifact_paths to match the other steps. --- .buildkite/packaging.pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index 1dd2aaf60b0..36cbed29fcf 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -192,7 +192,7 @@ steps: imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" artifact_paths: - - build/distributions/** + - build/distributions/**/* matrix: - auditbeat - filebeat From 40c68cf2e16d2c4fe9db903822430736b73d88b8 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 14:53:42 +0300 Subject: [PATCH 16/30] Branch specific concurrency gates (#39298) PR #39293 introduced one concurrency queue per staging/snapshot but this slows down unnecessarily concurrent DRA builds for main and other release branches. This commit makes the concurrency gates (additionally) specific per branch. 
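Concretely, the gate's `concurrency_group` gains a branch suffix, so builds on `main` queue behind `dra-gate-snapshot-main` while, say, `8.14` builds queue independently behind `dra-gate-snapshot-8.14`. A minimal sketch of the relevant step attributes:

```yaml
concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH"  # expands per branch, e.g. dra-gate-snapshot-main
concurrency: 1
```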
--- .buildkite/packaging.pipeline.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index 36cbed29fcf..c01428100ec 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -19,14 +19,14 @@ steps: - name: Start of concurrency group for DRA Snapshot if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" command: echo "--> Start of concurrency gate dra-snapshot" - concurrency_group: "dra-gate-snapshot" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" concurrency: 1 key: start-gate-snapshot - name: Start of concurrency group for DRA Staging if: build.branch =~ /^\d+\.\d+$$/ command: echo "--> Start of concurrency gate dra-staging" - concurrency_group: "dra-gate-staging" + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" concurrency: 1 key: start-gate-staging @@ -265,12 +265,12 @@ steps: - command: echo "End of concurrency gate dra-snapshot <--" if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" - concurrency_group: "dra-gate-snapshot" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" concurrency: 1 key: end-gate-snapshot - command: echo "End of concurrency gate dra-staging <--" if: build.branch =~ /^\d+\.\d+$$/ - concurrency_group: "dra-gate-staging" + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" concurrency: 1 key: end-gate-staging From f6bad74ef6edec0a3d0221d875651aaed75f95d6 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Tue, 30 Apr 2024 14:58:10 +0300 Subject: [PATCH 17/30] Fix the annotating for snapshot and staging (#39299) * Fix the annotating for snapshot and staging This commit adds the `--append` flag to the buildkite-annotate so that when it is called by the snapshot and staging steps, it will not overwrite the other annotation. Signed-off-by: Alexandros Sapranidis --- .buildkite/scripts/dra.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/dra.sh b/.buildkite/scripts/dra.sh index 4b6a94ffa2c..5ce6e5884b9 100755 --- a/.buildkite/scripts/dra.sh +++ b/.buildkite/scripts/dra.sh @@ -78,5 +78,5 @@ if [[ "$DRY_RUN" != "--dry-run" ]]; then rm rm-output.txt # and make it easily clickable as a Builkite annotation - printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success + printf "**${DRA_WORKFLOW} summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success --append fi From 2fa1123b8f1fb2eef6a96b23ccb7d460cbb6163b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 12:39:19 -0400 Subject: [PATCH 18/30] Cleanup: organizing code in awss3/input.go (#38958) Cleanups in `x-pack/filebeat/input/awss3/input.go`. - Split up the two main configuration cases, SQS queues versus bare S3 buckets, into two explicit helper functions (`s3Input.runQueueReader` and `s3Input.runS3Poller`) instead of handling them inline in `s3Input.Run`. - Simplify region-detection logic in `getRegionFromQueueURL` (`regionMismatchError` is no longer needed) - Rename `createS3Lister` to `createS3Poller` (since it creates an `s3Poller`) This is only a cleanup / reorganization, it does not change any behavior. 
--- x-pack/filebeat/input/awss3/input.go | 155 +++++++++++----------- x-pack/filebeat/input/awss3/input_test.go | 4 +- 2 files changed, 76 insertions(+), 83 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index bb4a5c15bda..51e8c9808ed 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -102,72 +102,85 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { ctx := v2.GoContextFromCanceler(inputContext.Cancelation) if in.config.QueueURL != "" { - regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint, in.config.RegionName) - if err != nil && in.config.RegionName == "" { - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } - var warn regionMismatchError - if errors.As(err, &warn) { - // Warn of mismatch, but go ahead with configured region name. - inputContext.Logger.Warnf("%v: using %q", err, regionName) - } - in.awsConfig.Region = regionName + return in.runQueueReader(ctx, inputContext, pipeline) + } - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() + if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { + return in.runS3Poller(ctx, inputContext, pipeline) + } - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) + return nil +} - if err := receiver.Receive(ctx); err != nil { - return err - } +func (in *s3Input) runQueueReader( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) } - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() + in.awsConfig.Region = urlRegion - // Connect to the registry and create our states lookup - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - defer persistentStore.Close() + // Create SQS receiver and S3 notification processor. 
+ receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() - states, err := newStates(inputContext, persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Lister(inputContext, ctx, client, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() + return receiver.Receive(ctx) +} - if err := poller.Poll(ctx); err != nil { - return err - } +func (in *s3Input) runS3Poller( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) } + defer client.Close() - return nil + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + + // Create S3 receiver and S3 notification processor. 
+ poller, err := in.createS3Poller(inputContext, ctx, client, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) + } + defer poller.metrics.Close() + + return poller.Poll(ctx) } func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { @@ -212,8 +225,11 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s return nil, err } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) return sqsReader, nil @@ -227,7 +243,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -310,7 +326,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (region string, err error) { +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue @@ -323,11 +339,7 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg // check for sqs queue url if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - region = queueHostSplit[1] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplit[1], nil } } @@ -335,30 +347,13 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - region = queueHostSplitVPC[2] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplitVPC[2], nil } } - - if defaultRegion != "" { - return defaultRegion, nil - } } return "", errBadQueueURL } -type 
regionMismatchError struct { - queueURLRegion string - defaultRegion string -} - -func (e regionMismatchError) Error() string { - return fmt.Sprintf("configured region disagrees with queue_url region: %q != %q", e.queueURLRegion, e.defaultRegion) -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index abc9f5c9a6a..0a3053f7f1b 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -54,7 +54,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { name string queueURL string endpoint string - deflt string want string wantErr error }{ @@ -77,7 +76,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { { name: "vpce_endpoint", queueURL: "https://vpce-test.sqs.us-east-2.vpce.amazonaws.com/12345678912/sqs-queue", - deflt: "", want: "us-east-2", }, { @@ -90,7 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint, test.deflt) + got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) if !sameError(err, test.wantErr) { t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) } From e2c652c2d38fa2a7d4b130ce2860cb0c0c98b87d Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Tue, 30 Apr 2024 20:10:34 +0300 Subject: [PATCH 19/30] Disable 8.14 DRA on Jenkins (#39322) This commit is complementing PR #39321 and is needed to disable the execution of 8.14 DRA packaging on Jenkins. --- .ci/jobs/packaging.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/jobs/packaging.yml b/.ci/jobs/packaging.yml index 6d4b136a557..50cec32edd8 100644 --- a/.ci/jobs/packaging.yml +++ b/.ci/jobs/packaging.yml @@ -14,7 +14,7 @@ discover-pr-forks-trust: 'permission' discover-pr-origin: 'merge-current' discover-tags: true - head-filter-regex: '(7\.1[6789]|8\.\d+|PR-.*|v\d+\.\d+\.\d+)' + head-filter-regex: '(7\.1[6789]|8\.13|PR-.*|v8\.13\.\d+)' disable-pr-notifications: true notification-context: 'beats-packaging' repo: 'beats' @@ -28,11 +28,11 @@ ignore-tags-older-than: -1 ignore-tags-newer-than: 30 - named-branches: - - regex-name: - regex: '7\.1[6789]' + - exact-name: + name: '8.13' case-sensitive: true - regex-name: - regex: '8\.\d+' + regex: '7\.1[6789]' case-sensitive: true - change-request: ignore-target-only-changes: true From 8c48989a8498f29e8e4dfdcf9d8f6f8bba6fc285 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 30 Apr 2024 13:12:30 -0400 Subject: [PATCH 20/30] Include metricbeat modules directory into agentbeat build. (#39278) --- metricbeat/scripts/mage/package.go | 14 +++++++------- x-pack/agentbeat/magefile.go | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/metricbeat/scripts/mage/package.go b/metricbeat/scripts/mage/package.go index e206881dd3c..43e12652f4a 100644 --- a/metricbeat/scripts/mage/package.go +++ b/metricbeat/scripts/mage/package.go @@ -40,7 +40,7 @@ const ( // not supported. You must declare a dependency on either // PrepareModulePackagingOSS or PrepareModulePackagingXPack. 
func CustomizePackaging() { - mg.Deps(customizeLightModulesPackaging) + mg.Deps(CustomizeLightModulesPackaging) var ( modulesDTarget = "modules.d" @@ -104,7 +104,7 @@ func CustomizePackaging() { // PrepareModulePackagingOSS generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingOSS() error { - err := prepareLightModulesPackaging("module") + err := PrepareLightModulesPackaging("module") if err != nil { return err } @@ -116,7 +116,7 @@ func PrepareModulePackagingOSS() error { // PrepareModulePackagingXPack generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingXPack() error { - err := prepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) + err := PrepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) if err != nil { return err } @@ -201,8 +201,8 @@ func GenerateDirModulesD() error { return nil } -// customizeLightModulesPackaging customizes packaging to add light modules -func customizeLightModulesPackaging() error { +// CustomizeLightModulesPackaging customizes packaging to add light modules +func CustomizeLightModulesPackaging() error { var ( moduleTarget = "module" module = devtools.PackageFile{ @@ -225,8 +225,8 @@ func customizeLightModulesPackaging() error { return nil } -// prepareLightModulesPackaging generates light modules -func prepareLightModulesPackaging(paths ...string) error { +// PrepareLightModulesPackaging generates light modules +func PrepareLightModulesPackaging(paths ...string) error { err := devtools.Clean([]string{dirModulesGenerated}) if err != nil { return err diff --git a/x-pack/agentbeat/magefile.go b/x-pack/agentbeat/magefile.go index 874c79bf7a3..c7e6c561830 100644 --- a/x-pack/agentbeat/magefile.go +++ b/x-pack/agentbeat/magefile.go @@ -20,6 +20,7 @@ import ( devtools "github.com/elastic/beats/v7/dev-tools/mage" "github.com/elastic/beats/v7/dev-tools/mage/target/build" + metricbeat "github.com/elastic/beats/v7/metricbeat/scripts/mage" packetbeat "github.com/elastic/beats/v7/packetbeat/scripts/mage" osquerybeat "github.com/elastic/beats/v7/x-pack/osquerybeat/scripts/mage" @@ -112,11 +113,19 @@ func CrossBuildDeps() error { return callForBeat("crossBuildExt", "osquerybeat") } +// PrepareLightModules prepares the module packaging. +func PrepareLightModules() error { + return metricbeat.PrepareLightModulesPackaging( + filepath.Join("..", "metricbeat", "module"), // x-pack/metricbeat + filepath.Join("..", "..", "metricbeat", "module"), // metricbeat (oss) + ) +} + // Package packages the Beat for distribution. // Use SNAPSHOT=true to build snapshots. // Use PLATFORMS to control the target platforms. // Use VERSION_QUALIFIER to control the version qualifier. -func Package() { +func Package() error { start := time.Now() defer func() { fmt.Println("package ran for", time.Since(start)) }() @@ -126,7 +135,14 @@ func Package() { // Add osquery distro binaries, required for the osquerybeat subcommand. osquerybeat.CustomizePackaging() - mg.SerialDeps(Update, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + // Add metricbeat lightweight modules. + if err := metricbeat.CustomizeLightModulesPackaging(); err != nil { + return err + } + + mg.SerialDeps(Update, PrepareLightModules, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + + return nil } // TestPackages tests the generated packages (i.e. file modes, owners, groups). 
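
[Editor's note] The patch above exports `CustomizeLightModulesPackaging` and `PrepareLightModulesPackaging` from metricbeat's mage scripts so that other Beats' magefiles can stage light modules during packaging, as agentbeat now does. The sketch below is a hedged illustration only, not part of the change: it shows how another hypothetical x-pack Beat's magefile might reuse the two exported helpers. The relative module paths and the simplified Package target are assumptions for the example; a real magefile would also wire in the usual Update/CrossBuild/Package dependencies.

//go:build mage

package main

import (
	"path/filepath"

	"github.com/magefile/mage/mg"

	// Exported helpers introduced by the patch above.
	metricbeat "github.com/elastic/beats/v7/metricbeat/scripts/mage"
)

// PrepareLightModules stages metricbeat light modules into the package
// staging directory. The relative paths are hypothetical and depend on
// where the Beat lives in the repository.
func PrepareLightModules() error {
	return metricbeat.PrepareLightModulesPackaging(
		filepath.Join("..", "metricbeat", "module"),       // x-pack modules (assumed location)
		filepath.Join("..", "..", "metricbeat", "module"), // OSS modules (assumed location)
	)
}

// Package registers the light modules with the packaging spec and then
// runs the staging step, mirroring the agentbeat change above.
func Package() error {
	if err := metricbeat.CustomizeLightModulesPackaging(); err != nil {
		return err
	}
	mg.SerialDeps(PrepareLightModules)
	return nil
}

Exporting the previously unexported helpers (note the capitalization change in the patch) is what makes this cross-Beat reuse possible without duplicating the module-staging logic.
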
From 562e48efea1a93eca9087ca03781dfba60cca883 Mon Sep 17 00:00:00 2001 From: Alex K <8418476+fearful-symmetry@users.noreply.github.com> Date: Tue, 30 Apr 2024 12:10:18 -0700 Subject: [PATCH 21/30] Add queue percentage to libbeat metrics (#39205) * add queue full percentage metric * newline * add div by zero check * change name * linter * fix gauge settings * linter... * change name * set percentage when we set queue max * change name * round numbers --- libbeat/monitoring/report/log/log.go | 66 ++++++++++++------------ libbeat/publisher/pipeline/monitoring.go | 29 +++++++++-- 2 files changed, 59 insertions(+), 36 deletions(-) diff --git a/libbeat/monitoring/report/log/log.go b/libbeat/monitoring/report/log/log.go index 886e207593a..e11e8228cf7 100644 --- a/libbeat/monitoring/report/log/log.go +++ b/libbeat/monitoring/report/log/log.go @@ -37,34 +37,36 @@ import ( // TODO: Replace this with a proper solution that uses the metric type from // where it is defined. See: https://github.com/elastic/beats/issues/5433 var gauges = map[string]bool{ - "libbeat.output.events.active": true, - "libbeat.pipeline.events.active": true, - "libbeat.pipeline.clients": true, - "libbeat.config.module.running": true, - "registrar.states.current": true, - "filebeat.events.active": true, - "filebeat.harvester.running": true, - "filebeat.harvester.open_files": true, - "beat.memstats.memory_total": true, - "beat.memstats.memory_alloc": true, - "beat.memstats.rss": true, - "beat.memstats.gc_next": true, - "beat.info.uptime.ms": true, - "beat.cgroup.memory.mem.usage.bytes": true, - "beat.cpu.user.ticks": true, - "beat.cpu.system.ticks": true, - "beat.cpu.total.value": true, - "beat.cpu.total.ticks": true, - "beat.handles.open": true, - "beat.handles.limit.hard": true, - "beat.handles.limit.soft": true, - "beat.runtime.goroutines": true, - "system.load.1": true, - "system.load.5": true, - "system.load.15": true, - "system.load.norm.1": true, - "system.load.norm.5": true, - "system.load.norm.15": true, + "libbeat.output.events.active": true, + "libbeat.pipeline.events.active": true, + "libbeat.pipeline.clients": true, + "libbeat.pipeline.queue.max_events": true, + "libbeat.pipeline.queue.filled.pct.events": true, + "libbeat.config.module.running": true, + "registrar.states.current": true, + "filebeat.events.active": true, + "filebeat.harvester.running": true, + "filebeat.harvester.open_files": true, + "beat.memstats.memory_total": true, + "beat.memstats.memory_alloc": true, + "beat.memstats.rss": true, + "beat.memstats.gc_next": true, + "beat.info.uptime.ms": true, + "beat.cgroup.memory.mem.usage.bytes": true, + "beat.cpu.user.ticks": true, + "beat.cpu.system.ticks": true, + "beat.cpu.total.value": true, + "beat.cpu.total.ticks": true, + "beat.handles.open": true, + "beat.handles.limit.hard": true, + "beat.handles.limit.soft": true, + "beat.runtime.goroutines": true, + "system.load.1": true, + "system.load.5": true, + "system.load.15": true, + "system.load.norm.1": true, + "system.load.norm.5": true, + "system.load.norm.15": true, } // isGauge returns true when the given metric key name represents a gauge value. @@ -249,16 +251,16 @@ func toKeyValuePairs(snaps map[string]monitoring.FlatSnapshot) []interface{} { for name, snap := range snaps { data := make(mapstr.M, snapshotLen(snap)) for k, v := range snap.Bools { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. 
+ data.Put(k, v) } for k, v := range snap.Floats { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Ints { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Strings { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } if len(data) > 0 { args = append(args, logp.Reflect(name, data)) diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index 69a21c2c71c..cda329e0963 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -17,7 +17,11 @@ package pipeline -import "github.com/elastic/elastic-agent-libs/monitoring" +import ( + "math" + + "github.com/elastic/elastic-agent-libs/monitoring" +) type observer interface { pipelineObserver @@ -67,8 +71,9 @@ type metricsObserverVars struct { activeEvents *monitoring.Uint // queue metrics - queueACKed *monitoring.Uint - queueMaxEvents *monitoring.Uint + queueACKed *monitoring.Uint + queueMaxEvents *monitoring.Uint + percentQueueFull *monitoring.Float } func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { @@ -92,7 +97,8 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { queueACKed: monitoring.NewUint(reg, "queue.acked"), queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), - activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge + activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge + percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"), }, } } @@ -121,12 +127,24 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() } func (o *metricsObserver) newEvent() { o.vars.events.Inc() o.vars.activeEvents.Inc() + o.setPercentageFull() +} + +// setPercentageFull is used interally to set the `queue.full` metric +func (o *metricsObserver) setPercentageFull() { + maxEvt := o.vars.queueMaxEvents.Get() + if maxEvt != 0 { + pct := float64(o.vars.activeEvents.Get()) / float64(maxEvt) + pctRound := math.Round(pct/0.0005) * 0.0005 + o.vars.percentQueueFull.Set(pctRound) + } } // (client) event is filtered out (on purpose or failed) func (o *metricsObserver) filteredEvent() { o.vars.filtered.Inc() o.vars.activeEvents.Dec() + o.setPercentageFull() } // (client) managed to push an event into the publisher pipeline @@ -138,6 +156,7 @@ func (o *metricsObserver) publishedEvent() { func (o *metricsObserver) failedPublishEvent() { o.vars.failed.Inc() o.vars.activeEvents.Dec() + o.setPercentageFull() } // @@ -148,11 +167,13 @@ func (o *metricsObserver) failedPublishEvent() { func (o *metricsObserver) queueACKed(n int) { o.vars.queueACKed.Add(uint64(n)) o.vars.activeEvents.Sub(uint64(n)) + o.setPercentageFull() } // (queue) maximum queue event capacity func (o *metricsObserver) queueMaxEvents(n int) { o.vars.queueMaxEvents.Set(uint64(n)) + o.setPercentageFull() } // From 5c684a81beb0f24f30339e442306c41fccb74f58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:37:03 -0400 Subject: [PATCH 22/30] build(deps): bump github.com/elastic/elastic-agent-autodiscover from 0.6.8 to 0.6.14 (#39178) * build(deps): bump github.com/elastic/elastic-agent-autodiscover Bumps 
[github.com/elastic/elastic-agent-autodiscover](https://github.com/elastic/elastic-agent-autodiscover) from 0.6.8 to 0.6.14. - [Release notes](https://github.com/elastic/elastic-agent-autodiscover/releases) - [Changelog](https://github.com/elastic/elastic-agent-autodiscover/blob/main/CHANGELOG.md) - [Commits](https://github.com/elastic/elastic-agent-autodiscover/compare/v0.6.8...v0.6.14) --- updated-dependencies: - dependency-name: github.com/elastic/elastic-agent-autodiscover dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] * Update NOTICE.txt --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] --- NOTICE.txt | 8 ++++---- go.mod | 4 ++-- go.sum | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index f060baf4098..951b7e7785c 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -12525,11 +12525,11 @@ various licenses: -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-autodiscover -Version: v0.6.13 +Version: v0.6.14 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.13/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.14/LICENSE: Apache License Version 2.0, January 2004 @@ -25433,11 +25433,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- Dependency : golang.org/x/net -Version: v0.21.0 +Version: v0.23.0 Licence type (autodetected): BSD-3-Clause -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.23.0/LICENSE: Copyright (c) 2009 The Go Authors. All rights reserved. 
diff --git a/go.mod b/go.mod index 0805e9200c8..ad13afabd8d 100644 --- a/go.mod +++ b/go.mod @@ -154,7 +154,7 @@ require ( golang.org/x/crypto v0.21.0 golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 golang.org/x/mod v0.14.0 - golang.org/x/net v0.21.0 + golang.org/x/net v0.23.0 golang.org/x/oauth2 v0.10.0 golang.org/x/sync v0.5.0 golang.org/x/sys v0.18.0 @@ -203,7 +203,7 @@ require ( github.com/awslabs/kinesis-aggregation/go/v2 v2.0.0-20220623125934-28468a6701b5 github.com/elastic/bayeux v1.0.5 github.com/elastic/ebpfevents v0.6.0 - github.com/elastic/elastic-agent-autodiscover v0.6.13 + github.com/elastic/elastic-agent-autodiscover v0.6.14 github.com/elastic/elastic-agent-libs v0.7.5 github.com/elastic/elastic-agent-shipper-client v0.5.1-0.20230228231646-f04347b666f3 github.com/elastic/elastic-agent-system-metrics v0.9.2 diff --git a/go.sum b/go.sum index 57711b7a9fe..5c45bdee748 100644 --- a/go.sum +++ b/go.sum @@ -551,8 +551,8 @@ github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3 h1:lnDkqiRFKm0rxdljqr github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3/go.mod h1:aPqzac6AYkipvp4hufTyMj5PDIphF3+At8zr7r51xjY= github.com/elastic/ebpfevents v0.6.0 h1:BrL3m7JFK7U6h2jkbk3xAWWs//IZnugCHEDds5u2v68= github.com/elastic/ebpfevents v0.6.0/go.mod h1:ESG9gw7N+n5yCCMgdg1IIJENKWSmX7+X0Fi9GUs9nvU= -github.com/elastic/elastic-agent-autodiscover v0.6.13 h1:zBeTxV+o2efEKntY+o6iMMNJ1AVjDXUqY3o6uzIkKaw= -github.com/elastic/elastic-agent-autodiscover v0.6.13/go.mod h1:7P6YVKxuBT0qE/VxuA87obwZUAEU0O44mCN3r4/6x8w= +github.com/elastic/elastic-agent-autodiscover v0.6.14 h1:0zJYNyv9GKTOiNqCHqEVboP+WioV73ia17Et+UlFbz8= +github.com/elastic/elastic-agent-autodiscover v0.6.14/go.mod h1:39/fHHlnyTK6oUNZfAhxJwBTVahO9tNasEIjzsxGMu8= github.com/elastic/elastic-agent-client/v7 v7.8.1 h1:J9wZc/0mUvSEok0X5iR5+n60Jgb+AWooKddb3XgPWqM= github.com/elastic/elastic-agent-client/v7 v7.8.1/go.mod h1:axl1nkdqc84YRFkeJGD9jExKNPUrOrzf3DFo2m653nY= github.com/elastic/elastic-agent-libs v0.7.5 h1:4UMqB3BREvhwecYTs/L23oQp1hs/XUkcunPlmTZn5yg= @@ -1960,8 +1960,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190130055435-99b60b757ec1/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= From 33ba5f0d9a4493b78d337a22d53d32db390aaeea Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 17:07:51 -0400 Subject: [PATCH 23/30] add change log for S3 fix (#39320) --- CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index f57b7100077..58ce7ac0f65 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -142,6 +142,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Updated Websocket 
input title to align with existing inputs {pull}39006[39006] - Restore netflow input on Windows {pull}39024[39024] - Upgrade azure-event-hubs-go and azure-storage-blob-go dependencies. {pull}38861[38861] +- Fix concurrency/error handling bugs in the AWS S3 input that could drop data and prevent ingestion of large buckets. {pull}39131[39131] *Heartbeat* From 50e173aebb6ef064adb3f1a97dcef52b998af55d Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Tue, 30 Apr 2024 19:04:19 -0400 Subject: [PATCH 24/30] chore: Update snapshot.yml (#39319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index b531cf78a51..bf5fdbc9545 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-bb66fa2a-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-bb66fa2a-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-81021969-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-bb66fa2a-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 246a8bc019676043f19510d57d86853fb5ebbe57 Mon Sep 17 00:00:00 2001 From: Andrew Kroh Date: Wed, 1 May 2024 08:43:14 -0400 Subject: [PATCH 25/30] .github/dependabot.yml - add elastic/ebpfevents (#38695) Add github.com/elastic/ebpfevents to the list of dependencies that are watched. Co-authored-by: Dimitrios Liappis --- .github/dependabot.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7fcaca8ac9e..bbd4255fd87 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -34,6 +34,7 @@ updates: - dependency-name: github.com/elastic/go-perf - dependency-name: github.com/elastic/go-seccomp-bpf - dependency-name: github.com/elastic/toutoumomoma + - dependency-name: github.com/elastic/ebpfevents ignore: # Skip github.com/elastic/mito because it requires documentation updates. - dependency-name: github.com/elastic/mito From 5011cccdc64c8ecd7ebca6dd10574af4a525fa98 Mon Sep 17 00:00:00 2001 From: Alexandros Sapranidis Date: Wed, 1 May 2024 17:38:35 +0300 Subject: [PATCH 26/30] Allow everyone in Elastic to build Beats (#39335) This commit allows everyone under the Elastic org to be able to trigger builds in Buildkite. 
Signed-off-by: Alexandros Sapranidis --- catalog-info.yaml | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index 420d9c1c16a..f81d3a6df1a 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -61,7 +61,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -108,7 +108,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -155,7 +155,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -202,7 +202,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -249,7 +249,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -296,7 +296,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -343,7 +343,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -390,7 +390,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -436,7 +436,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -483,7 +483,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -530,7 +530,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -577,7 +577,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: 
$schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -624,7 +624,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -671,7 +671,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -706,7 +706,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json apiVersion: backstage.io/v1alpha1 @@ -788,7 +788,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -835,7 +835,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -882,7 +882,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -929,7 +929,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -976,7 +976,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1015,7 +1015,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1065,7 +1065,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1101,7 +1101,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1147,4 +1147,4 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ From 726f6e9bdec715f958ba47500e77feb5655b0a48 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Wed, 1 May 2024 18:04:21 +0300 Subject: [PATCH 27/30] More resilient DRA packaging (#39332) Occasionally packaging 
steps from the DRA pipeline may get stuck[^1]. This causes a breach of the global pipeline timeout (currently 1hr) and cancels the job. This commit increases the global timeout to 90min, adds one retry per step and limits the runtime per step to 40min (so that a single stuck step doesn't exhaust the entire global timeout). Finally, we shush slack notifications if the retry recovered the step. In a future PR we will consider also adding a daily DRA build to cover for cases where the retries didn't help and there were no subsequent commits to trigger a new build. [^1]: https://buildkite.com/elastic/beats-packaging-pipeline/builds/114 --- .buildkite/packaging.pipeline.yml | 32 +++++++++++++++++++++++++++++++ catalog-info.yaml | 3 ++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index c01428100ec..5fd559f458d 100644 --- a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -44,6 +44,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -62,6 +66,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -86,6 +94,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -116,6 +128,10 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -142,6 +158,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* @@ -161,6 +181,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -191,6 +215,10 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -217,6 +245,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* diff --git a/catalog-info.yaml b/catalog-info.yaml index f81d3a6df1a..34d9e397ca3 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -1045,7 +1045,7 @@ spec: # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false - maximum_timeout_in_minutes: 60 + maximum_timeout_in_minutes: 90 provider_settings: build_branches: true build_pull_request_forks: false @@ -1059,6 +1059,7 @@ spec: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' + SLACK_NOTIFICATIONS_SKIP_FOR_RETRIES: 'true' teams: ingest-fp: access_level: MANAGE_BUILD_AND_READ From 
5a0293ec9f6222e8fbaddc49f5f56e32d1c09096 Mon Sep 17 00:00:00 2001 From: apmmachine <58790750+apmmachine@users.noreply.github.com> Date: Wed, 1 May 2024 16:01:57 -0400 Subject: [PATCH 28/30] chore: Update snapshot.yml (#39342) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Made with ❤️️ by updatecli Co-authored-by: apmmachine --- testing/environments/snapshot.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index bf5fdbc9545..30002f9a255 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-aa640648-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-aa640648-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-bb66fa2a-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-aa640648-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" From 02ea29d8cf4078ce30c0acb507ffada149101a9b Mon Sep 17 00:00:00 2001 From: Olga Naydyonock Date: Wed, 1 May 2024 23:04:45 +0300 Subject: [PATCH 29/30] Enabling retries for Beats flaky tests (#39174) * added retries for auditbeat flaky tests * added retries for filebeat flaky tests * added retries * test exitcode * checged exit status code for retries * set larger timeout for pytestOpts * restored timeout for pytests --- .buildkite/auditbeat/auditbeat-pipeline.yml | 33 ++++++++++++++++ .buildkite/filebeat/filebeat-pipeline.yml | 33 ++++++++++++++++ .buildkite/heartbeat/heartbeat-pipeline.yml | 36 +++++++++++++++++ .buildkite/libbeat/pipeline.libbeat.yml | 18 +++++++++ .buildkite/metricbeat/pipeline.yml | 33 ++++++++++++++++ .buildkite/packetbeat/pipeline.packetbeat.yml | 30 ++++++++++++++ .buildkite/winlogbeat/pipeline.winlogbeat.yml | 18 +++++++++ .../x-pack/pipeline.xpack.auditbeat.yml | 30 ++++++++++++++ .../x-pack/pipeline.xpack.dockerlogbeat.yml | 6 +++ .buildkite/x-pack/pipeline.xpack.filebeat.yml | 33 ++++++++++++++++ .../x-pack/pipeline.xpack.heartbeat.yml | 29 +++++++++++++- .buildkite/x-pack/pipeline.xpack.libbeat.yml | 27 +++++++++++++ .../x-pack/pipeline.xpack.metricbeat.yml | 30 ++++++++++++++ .../x-pack/pipeline.xpack.osquerybeat.yml | 27 +++++++++++++ .../x-pack/pipeline.xpack.packetbeat.yml | 39 +++++++++++++++++++ .../x-pack/pipeline.xpack.winlogbeat.yml | 18 +++++++++ 16 files changed, 439 insertions(+), 1 deletion(-) diff --git a/.buildkite/auditbeat/auditbeat-pipeline.yml b/.buildkite/auditbeat/auditbeat-pipeline.yml index 801768c271e..ed19c7d9164 100644 --- a/.buildkite/auditbeat/auditbeat-pipeline.yml +++ b/.buildkite/auditbeat/auditbeat-pipeline.yml @@ -32,6 +32,9 @@ steps: command: | cd auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -47,6 +50,9 @@ steps: command: | cd auditbeat mage build 
unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -62,6 +68,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -79,6 +88,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -97,6 +109,9 @@ steps: make -C auditbeat crosscompile env: GOX_FLAGS: "-arch amd64" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -115,6 +130,9 @@ steps: set -euo pipefail cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -133,6 +151,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -147,6 +168,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -164,6 +188,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -180,6 +207,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -196,6 +226,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" diff --git a/.buildkite/filebeat/filebeat-pipeline.yml b/.buildkite/filebeat/filebeat-pipeline.yml index 7eedd9d76fb..053e8dbec41 100644 --- a/.buildkite/filebeat/filebeat-pipeline.yml +++ b/.buildkite/filebeat/filebeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd filebeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -60,6 +66,9 @@ steps: command: | cd filebeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +85,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -94,6 +106,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +133,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -136,6 +154,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -152,6 +173,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -172,6 +196,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -190,6 +217,9 @@ steps: command: | Set-Location -Path 
filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -208,6 +238,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/heartbeat/heartbeat-pipeline.yml b/.buildkite/heartbeat/heartbeat-pipeline.yml index 8091b2eead1..cadbcec1eca 100644 --- a/.buildkite/heartbeat/heartbeat-pipeline.yml +++ b/.buildkite/heartbeat/heartbeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -61,6 +67,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -78,6 +87,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -94,6 +106,9 @@ steps: command: | cd heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -109,6 +124,9 @@ steps: command: | cd heartbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -130,6 +148,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -151,6 +172,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -168,6 +192,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -205,6 +235,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -222,6 +255,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/libbeat/pipeline.libbeat.yml b/.buildkite/libbeat/pipeline.libbeat.yml index 040ad9b1d66..bc77712c330 100644 --- a/.buildkite/libbeat/pipeline.libbeat.yml +++ b/.buildkite/libbeat/pipeline.libbeat.yml @@ -21,6 +21,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -38,6 +41,9 @@ steps: set -euo pipefail cd libbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -55,6 +61,9 @@ steps: set -euo pipefail cd libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -72,6 +81,9 @@ steps: set -euo pipefail cd libbeat make crosscompile + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -89,6 +101,9 @@ steps: set -euo pipefail cd libbeat make 
STRESS_TEST_OPTIONS='-timeout=20m -race -v -parallel 1' GOTEST_OUTPUT_OPTIONS=' | go-junit-report > libbeat-stress-test.xml' stress-tests + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -108,6 +123,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/metricbeat/pipeline.yml b/.buildkite/metricbeat/pipeline.yml index 1fb6bfcc237..d15212d2ef3 100644 --- a/.buildkite/metricbeat/pipeline.yml +++ b/.buildkite/metricbeat/pipeline.yml @@ -32,6 +32,9 @@ steps: - label: ":linux: Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd metricbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +65,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -92,6 +98,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -106,6 +115,9 @@ steps: - label: ":negative_squared_cross_mark: Cross compile" key: "mandatory-cross-compile" command: "make -C metricbeat crosscompile" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -122,6 +134,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -140,6 +155,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -162,6 +180,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -180,6 +201,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -198,6 +222,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -221,6 +248,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -238,6 +268,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/packetbeat/pipeline.packetbeat.yml b/.buildkite/packetbeat/pipeline.packetbeat.yml index c0f5c1e1a73..d510107a89c 100644 --- a/.buildkite/packetbeat/pipeline.packetbeat.yml +++ b/.buildkite/packetbeat/pipeline.packetbeat.yml @@ -28,6 +28,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -43,6 +46,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -58,6 +64,9 @@ steps: 
command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -75,6 +84,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -96,6 +108,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -114,6 +129,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -132,6 +150,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -156,6 +177,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -174,6 +198,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: key: "linux-arm64-unit-tests-extended" command: "cd packetbeat && mage build unitTest" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/winlogbeat/pipeline.winlogbeat.yml b/.buildkite/winlogbeat/pipeline.winlogbeat.yml index c71858b45b0..ff332791349 100644 --- a/.buildkite/winlogbeat/pipeline.winlogbeat.yml +++ b/.buildkite/winlogbeat/pipeline.winlogbeat.yml @@ -24,6 +24,9 @@ steps: - label: ":ubuntu: Winlogbeat Crossccompile" key: "mandatory-cross-compile" command: "make -C winlogbeat crosscompile" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -40,6 +43,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -58,6 +64,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -76,6 +85,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -99,6 +111,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -117,6 +132,9 @@ steps: Set-Location -Path winlogbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" diff --git a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml index 36fcb9bebd9..80c298c725d 100644 --- a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml @@ -36,6 +36,9 @@ steps: echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/auditbeat mage update build test + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -52,6 
+55,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -68,6 +74,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -108,6 +120,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -167,6 +188,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -183,6 +207,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -201,6 +228,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml index 05aee81e4d8..a64f7851913 100644 --- a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml @@ -28,6 +28,9 @@ steps: - label: ":ubuntu: Xpack/Dockerlogbeat Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd x-pack/dockerlogbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -44,6 +47,9 @@ steps: command: "cd x-pack/dockerlogbeat && mage goIntegTest" env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.filebeat.yml b/.buildkite/x-pack/pipeline.xpack.filebeat.yml index 795302bc2d9..b7e71e3c3c0 100644 --- a/.buildkite/x-pack/pipeline.xpack.filebeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.filebeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/filebeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/filebeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: 
"x-pack-filebeat-mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -122,6 +137,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -142,6 +160,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -178,6 +202,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -200,6 +227,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -217,6 +247,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml index 107dfa65f1b..136706e698c 100644 --- a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml @@ -39,6 +39,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -59,6 +62,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +82,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +104,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +130,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -136,6 +151,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -153,6 +171,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build test + retry: + automatic: + - limit: 3 key: "extended-win-2019-unit-tests" agents: provider: "gcp" @@ -166,7 +187,7 @@ steps: notify: - github_commit_status: context: "x-pack/heartbeat: Windows 2019 Unit Tests" - + - group: "x-pack/heartbeat MacOS Extended Tests" key: "x-pack-heartbeat-extended-tests-macos" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*macOS.*/ @@ -179,6 +200,9 @@ steps: installNodeJsDependencies 
cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -197,6 +221,9 @@ steps: installNodeJsDependencies cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.libbeat.yml b/.buildkite/x-pack/pipeline.xpack.libbeat.yml index 14316a3ecd7..6bf456f6d83 100644 --- a/.buildkite/x-pack/pipeline.xpack.libbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.libbeat.yml @@ -26,6 +26,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -42,6 +45,9 @@ steps: command: | cd x-pack/libbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -58,6 +64,9 @@ steps: command: | cd x-pack/libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -74,6 +83,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -92,6 +104,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -132,6 +150,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -150,6 +171,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -172,6 +196,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml index 317b9069c55..4c1c31521f9 100644 --- a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/metricbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build 
unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -162,6 +183,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -185,6 +209,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -202,6 +229,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml index 8c9137cb423..c8ecac79735 100644 --- a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/osquerybeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -46,6 +49,9 @@ steps: command: | cd x-pack/osquerybeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +68,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -80,6 +89,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -102,6 +114,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -120,6 +135,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -138,6 +156,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -175,6 +199,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml index 77fdf2af848..1ab71c30d7d 100644 --- a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml +++ 
b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml @@ -29,6 +29,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -61,6 +67,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -77,6 +86,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +107,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -136,6 +154,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -154,6 +175,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -172,6 +196,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -191,6 +218,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -214,6 +244,9 @@ steps: cd x-pack/packetbeat mage build unitTest if: build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -236,6 +269,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -253,6 +289,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml index c07e537adf0..c6b5a6f59fe 100644 --- a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml @@ -29,6 +29,9 @@ steps: mage build unitTest env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -47,6 +50,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -65,6 +71,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -88,6 +97,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" 
image: "${IMAGE_WIN_10}" @@ -106,6 +118,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -124,6 +139,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" From ffcd1814666645a5d7a644911ecf6e2b7d8db3f5 Mon Sep 17 00:00:00 2001 From: Michael Wolf Date: Wed, 1 May 2024 14:52:27 -0700 Subject: [PATCH 30/30] [Auditbeat][add_session_metadata processor] Fix more potential enrichment failures (#39243) Fix two more cases that could cause unenriched processes in the add_session_metadata processor. It was possible for auditd events to arrive before the ebpf event added processes to the process DB, now the enrichment will wait for the process to be inserted into the DB, if it's not already before enrichment is run on it. Also stop attempting to enrich failed syscall events, and modifying the DB based on these. Changes: With the ebpf backend, when an event is processed wait for a process to be added to the DB before enriching, if it's not already in the DB before the event is received. Do not enrich failed syscall auditd events. Since failed syscalls don't actually cause a process to be created, they should not be enriched, or inserted to the process Remove scrapeAncestors from DB. The intention of this was to fill in missed processes, but now processes should not be missed with epbf, and ineffective with procfs, as the process will most likely already be ended. This was causing DB inconsistancies when run on failed syscall events, and I haven't ever seen any cases where it's helpful now. 
--- CHANGELOG.next.asciidoc | 3 +- .../sessionmd/add_session_metadata.go | 19 ++++- .../processors/sessionmd/processdb/db.go | 39 ++------- .../provider/ebpf_provider/ebpf_provider.go | 80 ++++++++++++++++++- .../procfs_provider/procfs_provider.go | 21 ++--- .../procfs_provider/procfs_provider_test.go | 10 +-- .../processors/sessionmd/provider/provider.go | 2 +- 7 files changed, 114 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 58ce7ac0f65..68eb43677ea 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -94,8 +94,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Auditbeat* - Set field types to correctly match ECS in sessionmd processor {issue}38955[38955] {pull}38994[38994] -- Keep process info on exited processes, to avoid failing to enrich events in sessionmd processor {pull}39173[39173] - +- Fix failing to enrich process events in sessionmd processor {issue}38955[38955] {pull}39173[39173] {pull}39243[39243] - Prevent scenario of losing children-related file events in a directory for recursive fsnotify backend of auditbeat file integrity module {pull}39133[39133] diff --git a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go index ff9fa54e556..766e9623b9e 100644 --- a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go +++ b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go @@ -96,13 +96,24 @@ func New(cfg *cfg.C) (beat.Processor, error) { } func (p *addSessionMetadata) Run(ev *beat.Event) (*beat.Event, error) { - _, err := ev.GetValue(p.config.PIDField) + pi, err := ev.GetValue(p.config.PIDField) if err != nil { // Do not attempt to enrich events without PID; it's not a supported event return ev, nil //nolint:nilerr // Running on events without PID is expected } - err = p.provider.UpdateDB(ev) + // Do not enrich failed syscalls, as there was no actual process change related to it + v, err := ev.GetValue("auditd.result") + if err == nil && v == "fail" { + return ev, nil + } + + pid, err := pidToUInt32(pi) + if err != nil { + return ev, nil //nolint:nilerr // Running on events with a different PID type is not a processor error + } + + err = p.provider.UpdateDB(ev, pid) if err != nil { return ev, err } @@ -136,7 +147,9 @@ func (p *addSessionMetadata) enrich(ev *beat.Event) (*beat.Event, error) { fullProcess, err := p.db.GetProcess(pid) if err != nil { - return nil, fmt.Errorf("pid %v not found in db: %w", pid, err) + e := fmt.Errorf("pid %v not found in db: %w", pid, err) + p.logger.Errorf("%v", e) + return nil, e } processMap := fullProcess.ToMap() diff --git a/x-pack/auditbeat/processors/sessionmd/processdb/db.go b/x-pack/auditbeat/processors/sessionmd/processdb/db.go index 2c7c228e2c1..b8c624abe00 100644 --- a/x-pack/auditbeat/processors/sessionmd/processdb/db.go +++ b/x-pack/auditbeat/processors/sessionmd/processdb/db.go @@ -238,7 +238,6 @@ func (db *DB) InsertFork(fork types.ProcessForkEvent) { pid := fork.ChildPIDs.Tgid ppid := fork.ParentPIDs.Tgid - db.scrapeAncestors(db.processes[pid]) if entry, ok := db.processes[ppid]; ok { entry.PIDs = pidInfoFromProto(fork.ChildPIDs) @@ -282,7 +281,6 @@ func (db *DB) InsertExec(exec types.ProcessExecEvent) { } db.processes[exec.PIDs.Tgid] = proc - db.scrapeAncestors(proc) entryLeaderPID := db.evaluateEntryLeader(proc) if entryLeaderPID != nil { db.entryLeaderRelationships[exec.PIDs.Tgid] = *entryLeaderPID @@ -568,6 +566,14 @@ func 
setSameAsProcess(process *types.Process) { } } +func (db *DB) HasProcess(pid uint32) bool { + db.mutex.RLock() + defer db.mutex.RUnlock() + + _, ok := db.processes[pid] + return ok +} + func (db *DB) GetProcess(pid uint32) (types.Process, error) { db.mutex.RLock() defer db.mutex.RUnlock() @@ -585,8 +591,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillParent(&ret, parent) break } - db.logger.Debugf("failed to find %d in DB (parent of %d), attempting to scrape", process.PIDs.Ppid, pid) - db.scrapeAncestors(process) } } @@ -596,8 +600,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillGroupLeader(&ret, groupLeader) break } - db.logger.Debugf("failed to find %d in DB (group leader of %d), attempting to scrape", process.PIDs.Pgid, pid) - db.scrapeAncestors(process) } } @@ -607,8 +609,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillSessionLeader(&ret, sessionLeader) break } - db.logger.Debugf("failed to find %d in DB (session leader of %d), attempting to scrape", process.PIDs.Sid, pid) - db.scrapeAncestors(process) } } @@ -712,29 +712,6 @@ func getTTYType(major uint16, minor uint16) TTYType { return TTYUnknown } -func (db *DB) scrapeAncestors(proc Process) { - for _, pid := range []uint32{proc.PIDs.Pgid, proc.PIDs.Ppid, proc.PIDs.Sid} { - if _, exists := db.processes[pid]; pid == 0 || exists { - continue - } - procInfo, err := db.procfs.GetProcess(pid) - if err != nil { - db.logger.Debugf("couldn't get %v from procfs: %w", pid, err) - continue - } - p := Process{ - PIDs: pidInfoFromProto(procInfo.PIDs), - Creds: credInfoFromProto(procInfo.Creds), - CTTY: ttyDevFromProto(procInfo.CTTY), - Argv: procInfo.Argv, - Cwd: procInfo.Cwd, - Env: procInfo.Env, - Filename: procInfo.Filename, - } - db.insertProcess(p) - } -} - func (db *DB) Close() { close(db.stopChan) } diff --git a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go index 2b9b540e037..f1b8bae0b67 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go @@ -9,6 +9,7 @@ package ebpf_provider import ( "context" "fmt" + "time" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/ebpf" @@ -151,7 +152,80 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB) (pr return &p, nil } -func (s prvdr) UpdateDB(ev *beat.Event) error { - // no-op for ebpf, DB is updated from pushed ebpf events - return nil +const ( + maxWaitLimit = 200 * time.Millisecond // Maximum time UpdateDB will wait for process + combinedWaitLimit = 2 * time.Second // Multiple UpdateDB calls will wait up to this amount within resetDuration + backoffDuration = 10 * time.Second // UpdateDB will stop waiting for processes for this time + resetDuration = 5 * time.Second // After this amount of times with no backoffs, the combinedWait will be reset +) + +var ( + combinedWait = 0 * time.Millisecond + inBackoff = false + backoffStart = time.Now() + since = time.Now() + backoffSkipped = 0 +) + +// With ebpf, process events are pushed to the DB by the above goroutine, so this doesn't actually update the DB. +// It does to try sync the processor and ebpf events, so that the process is in the process db before continuing. +// +// It's possible that the event to enrich arrives before the process is inserted into the DB. 
In that case, this +// will block continuing the enrichment until the process is seen (or the timeout is reached). +// +// If for some reason a lot of time has been spent waiting for missing processes, this also has a backoff timer during +// which it will continue without waiting for missing events to arrive, so the processor doesn't become overly backed-up +// waiting for these processes, at the cost of possibly not enriching some processes. +func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error { + if s.db.HasProcess(pid) { + return nil + } + + now := time.Now() + if inBackoff { + if now.Sub(backoffStart) > backoffDuration { + s.logger.Warnf("ended backoff, skipped %d processes", backoffSkipped) + inBackoff = false + combinedWait = 0 * time.Millisecond + } else { + backoffSkipped += 1 + return nil + } + } else { + if combinedWait > combinedWaitLimit { + s.logger.Warn("starting backoff") + inBackoff = true + backoffStart = now + backoffSkipped = 0 + return nil + } + // maintain a moving window of time for the delays we track + if now.Sub(since) > resetDuration { + since = now + combinedWait = 0 * time.Millisecond + } + } + + start := now + nextWait := 5 * time.Millisecond + for { + waited := time.Since(start) + if s.db.HasProcess(pid) { + s.logger.Debugf("got process that was missing after %v", waited) + combinedWait = combinedWait + waited + return nil + } + if waited >= maxWaitLimit { + e := fmt.Errorf("process %v was not seen after %v", pid, waited) + s.logger.Warnf("%w", e) + combinedWait = combinedWait + waited + return e + } + time.Sleep(nextWait) + if nextWait*2+waited > maxWaitLimit { + nextWait = maxWaitLimit - waited + } else { + nextWait = nextWait * 2 + } + } } diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go index 2f99dd72b1f..6525b860b6d 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go @@ -41,16 +41,7 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB, rea } // UpdateDB will update the process DB with process info from procfs or the event itself -func (s prvdr) UpdateDB(ev *beat.Event) error { - pi, err := ev.Fields.GetValue(s.pidField) - if err != nil { - return fmt.Errorf("event not supported, no pid") - } - pid, ok := pi.(int) - if !ok { - return fmt.Errorf("pid field not int") - } - +func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error { syscall, err := ev.GetValue(syscallField) if err != nil { return fmt.Errorf("event not supported, no syscall data") @@ -59,7 +50,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { switch syscall { case "execveat", "execve": pe := types.ProcessExecEvent{} - proc_info, err := s.reader.GetProcess(uint32(pid)) + proc_info, err := s.reader.GetProcess(pid) if err == nil { pe.PIDs = proc_info.PIDs pe.Creds = proc_info.Creds @@ -72,7 +63,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { s.logger.Warnf("couldn't get process info from proc for pid %v: %w", pid, err) // If process info couldn't be taken from procfs, populate with as much info as // possible from the event - pe.PIDs.Tgid = uint32(pid) + pe.PIDs.Tgid = pid var intr interface{} var i int var ok bool @@ -106,7 +97,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { case "exit_group": pe := types.ProcessExitEvent{ PIDs: types.PIDInfo{ - Tgid: uint32(pid), + Tgid: pid, }, } 
s.db.InsertExit(pe) @@ -122,8 +113,8 @@ func (s prvdr) UpdateDB(ev *beat.Event) error { if result == "success" { setsid_ev := types.ProcessSetsidEvent{ PIDs: types.PIDInfo{ - Tgid: uint32(pid), - Sid: uint32(pid), + Tgid: pid, + Sid: pid, }, } s.db.InsertSetsid(setsid_ev) diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go index 6fd333c4711..c438efcfe1a 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go @@ -124,7 +124,7 @@ func TestExecveEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -234,7 +234,7 @@ func TestExecveatEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -317,7 +317,7 @@ func TestSetSidEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -399,7 +399,7 @@ func TestSetSidEventFailed(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -470,7 +470,7 @@ func TestSetSidSessionLeaderNotScraped(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) diff --git a/x-pack/auditbeat/processors/sessionmd/provider/provider.go b/x-pack/auditbeat/processors/sessionmd/provider/provider.go index e3fa1547806..6452eb9e2bf 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/provider.go @@ -11,5 +11,5 @@ import ( ) type Provider interface { - UpdateDB(*beat.Event) error + UpdateDB(*beat.Event, uint32) error }