From adb0fb52af79032204edea64f1c86dd2edcaf8e0 Mon Sep 17 00:00:00 2001 From: VihasMakwana <121151420+VihasMakwana@users.noreply.github.com> Date: Wed, 5 Jun 2024 16:09:02 +0530 Subject: [PATCH 01/21] [libbeat] fix a small linting issue (#39726) The linter complains of an `errcheck`. --- libbeat/monitoring/report/report.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libbeat/monitoring/report/report.go b/libbeat/monitoring/report/report.go index eab8886af4f..98ce1b5435c 100644 --- a/libbeat/monitoring/report/report.go +++ b/libbeat/monitoring/report/report.go @@ -128,7 +128,7 @@ func collectSubObject(cfg *conf.C) *conf.C { for _, field := range cfg.GetFields() { if obj, err := cfg.Child(field, -1); err == nil { // on error field is no object, but primitive value -> ignore - out.SetChild(field, -1, obj) + out.SetChild(field, -1, obj) //nolint:errcheck // this error is safe to ignore continue } } From 592ea5bb128d40885db65e5aafa789d3d75c66e6 Mon Sep 17 00:00:00 2001 From: Elastic Machine Date: Thu, 6 Jun 2024 00:47:38 +1000 Subject: [PATCH 02/21] [Release] Update docs for the 8.15.0 release (#39034) * docs: update docs * Make check-ci --------- Co-authored-by: Pierre HILBERT --- deploy/kubernetes/auditbeat-kubernetes.yaml | 2 +- deploy/kubernetes/filebeat-kubernetes.yaml | 2 +- deploy/kubernetes/heartbeat-kubernetes.yaml | 2 +- deploy/kubernetes/metricbeat-kubernetes.yaml | 2 +- libbeat/docs/version.asciidoc | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deploy/kubernetes/auditbeat-kubernetes.yaml b/deploy/kubernetes/auditbeat-kubernetes.yaml index eb668c11254..d25c7245066 100644 --- a/deploy/kubernetes/auditbeat-kubernetes.yaml +++ b/deploy/kubernetes/auditbeat-kubernetes.yaml @@ -209,7 +209,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: auditbeat - image: docker.elastic.co/beats/auditbeat:8.14.0 + image: docker.elastic.co/beats/auditbeat:8.15.0 args: [ "-c", "/etc/auditbeat.yml", "-e", diff --git 
a/deploy/kubernetes/filebeat-kubernetes.yaml b/deploy/kubernetes/filebeat-kubernetes.yaml index 40855fcc9e2..8539501d292 100644 --- a/deploy/kubernetes/filebeat-kubernetes.yaml +++ b/deploy/kubernetes/filebeat-kubernetes.yaml @@ -183,7 +183,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: filebeat - image: docker.elastic.co/beats/filebeat:8.14.0 + image: docker.elastic.co/beats/filebeat:8.15.0 args: [ "-c", "/etc/filebeat.yml", "-e", diff --git a/deploy/kubernetes/heartbeat-kubernetes.yaml b/deploy/kubernetes/heartbeat-kubernetes.yaml index cec015a62fc..0c22501e707 100644 --- a/deploy/kubernetes/heartbeat-kubernetes.yaml +++ b/deploy/kubernetes/heartbeat-kubernetes.yaml @@ -171,7 +171,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: heartbeat - image: docker.elastic.co/beats/heartbeat:8.14.0 + image: docker.elastic.co/beats/heartbeat:8.15.0 args: [ "-c", "/etc/heartbeat.yml", "-e", diff --git a/deploy/kubernetes/metricbeat-kubernetes.yaml b/deploy/kubernetes/metricbeat-kubernetes.yaml index 9b9822323e6..e319d4b884c 100644 --- a/deploy/kubernetes/metricbeat-kubernetes.yaml +++ b/deploy/kubernetes/metricbeat-kubernetes.yaml @@ -291,7 +291,7 @@ spec: dnsPolicy: ClusterFirstWithHostNet containers: - name: metricbeat - image: docker.elastic.co/beats/metricbeat:8.14.0 + image: docker.elastic.co/beats/metricbeat:8.15.0 args: [ "-c", "/etc/metricbeat.yml", "-e", diff --git a/libbeat/docs/version.asciidoc b/libbeat/docs/version.asciidoc index ae9efcf9d01..cb37384bb30 100644 --- a/libbeat/docs/version.asciidoc +++ b/libbeat/docs/version.asciidoc @@ -1,4 +1,4 @@ -:stack-version: 8.14.0 +:stack-version: 8.15.0 :doc-branch: main :go-version: 1.21.10 :release-state: unreleased From 90f9e8f6e48e76a83331f64f6c8c633ae6b31661 Mon Sep 17 00:00:00 2001 From: Lee E Hinman <57081003+leehinman@users.noreply.github.com> Date: Wed, 5 Jun 2024 13:29:26 -0500 Subject: [PATCH 03/21] add benchmark input to agentbeat (#39789) * add benchmark input to agentbeat 
--- x-pack/agentbeat/agentbeat.spec.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/x-pack/agentbeat/agentbeat.spec.yml b/x-pack/agentbeat/agentbeat.spec.yml index b05975d2f89..8f153a02eba 100644 --- a/x-pack/agentbeat/agentbeat.spec.yml +++ b/x-pack/agentbeat/agentbeat.spec.yml @@ -100,6 +100,11 @@ inputs: platforms: *platforms outputs: *outputs command: *filebeat_command + - name: benchmark + description: "Benchmark Input" + platforms: *platforms + outputs: *outputs + command: *filebeat_command - name: cel description: "Common Expression Language Input" platforms: *platforms From 727777eab4c217bd2ad031a7ee3f13556bdca212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20Alvarez=20Pi=C3=B1eiro?= <95703246+emilioalvap@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:24:04 +0200 Subject: [PATCH 04/21] [Heartbeat] Move import to cmd for agentbeat (#39818) * [Heartbeat] Move import to cmd for agentbeat Fix browser plugin import to be bundled inside agentbeat, which is now being skipped and it prevents browser monitors from working. --- CHANGELOG.next.asciidoc | 1 + x-pack/heartbeat/{ => cmd}/import.go | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) rename x-pack/heartbeat/{ => cmd}/import.go (72%) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index a45a61e3e41..746e1e9f13d 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -161,6 +161,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Adjust State loader to only retry when response code status is 5xx {pull}37981[37981] - Reset prctl dumpable flag after cap drop. {pull}38269[38269] - Redact synthexec cmd output. {pull}39535[39535] +- Fix import of browser plugin for agentbeat. 
{pull}39818[39818] *Heartbeat* diff --git a/x-pack/heartbeat/import.go b/x-pack/heartbeat/cmd/import.go similarity index 72% rename from x-pack/heartbeat/import.go rename to x-pack/heartbeat/cmd/import.go index d3d4eeea99d..d90b1648e3e 100644 --- a/x-pack/heartbeat/import.go +++ b/x-pack/heartbeat/cmd/import.go @@ -4,8 +4,10 @@ //go:build linux || darwin -package main +package cmd +// Agentbeat imports cmd directly and skips main, import all required plugins +// here to have them bundled together import ( _ "github.com/elastic/beats/v7/x-pack/heartbeat/monitors/browser" ) From 37db5990da1bb87c317e9a8cbacd1a98f5d7d222 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 6 Jun 2024 14:18:24 -0400 Subject: [PATCH 05/21] Clean up / document metrics monitor fields (#39413) Document the semantics of many metrics monitoring variables, and rename some metrics APIs to more clearly indicate their function. As a side effect, fix several metrics reporting / publishing bugs in the Elasticsearch output, including https://github.com/elastic/beats/issues/39146. Add the `output.events.dead_letter` metric to distinguish events that were ingested to the dead letter index after a fatal error (previously these events were just reported as "acked"). This could have been a shorter fix, but it was hard to properly test since the metrics were changed from two separate functions with a lot of special cases. I ended up reorganizing the Elasticsearch `Publish` helpers to make the logic more clear. The new layout makes it much easier to test the error handling and metrics reporting. The bugs fixed by this refactor are: - When a batch was split, the events in it were not reported to the observer via `RetryableErrors`. This caused `activeEvents` to increase permanently even after the events were handled. - When a previously-failed event was ingested to the dead letter index as a raw string, it was reported to the observer as `Acked` (success). 
The new logic creates a new `dead_letter` metric specifically for this case. - When a previously-failed event encountered a fatal error ingesting to the dead letter index: * It was reported to the observer as both a permanent error and a retryable error, giving incorrect event counts. It's now reported as just a permanent error. * It was added to the retry list anyway, which would block all ingestion if the error really was fatal since the queue could never advance past that event. It's now dropped, the same as with a permanent error in the main index. - If the Elasticsearch bulk index response was invalid, the associated events were dropped and reported as acknowledged --- .../filestream/internal/task/group_test.go | 4 +- libbeat/esleg/eslegclient/bulkapi.go | 13 +- .../monitoring/report/elasticsearch/client.go | 2 +- libbeat/outputs/console/console.go | 4 +- libbeat/outputs/discard/discard.go | 2 +- libbeat/outputs/elasticsearch/client.go | 373 +++++++++------- .../elasticsearch/client_integration_test.go | 5 +- libbeat/outputs/elasticsearch/client_test.go | 398 +++++++++++------- .../outputs/elasticsearch/event_encoder.go | 15 + libbeat/outputs/fileout/file.go | 4 +- libbeat/outputs/kafka/client.go | 12 +- libbeat/outputs/logstash/async.go | 6 +- libbeat/outputs/logstash/sync.go | 4 +- libbeat/outputs/metrics.go | 120 +++--- libbeat/outputs/observer.go | 40 +- libbeat/outputs/redis/client.go | 12 +- libbeat/publisher/event.go | 2 + libbeat/publisher/pipeline/monitoring.go | 76 +++- libbeat/publisher/pipeline/stress/out.go | 4 +- libbeat/tests/integration/ca_pinning_test.go | 2 +- libbeat/tests/integration/template_test.go | 4 +- libbeat/tests/system/test_ilm.py | 6 +- 22 files changed, 677 insertions(+), 431 deletions(-) diff --git a/filebeat/input/filestream/internal/task/group_test.go b/filebeat/input/filestream/internal/task/group_test.go index 5ce15d455e3..30b9858a1de 100644 --- a/filebeat/input/filestream/internal/task/group_test.go +++ 
b/filebeat/input/filestream/internal/task/group_test.go @@ -195,8 +195,8 @@ func TestGroup_Go(t *testing.T) { }) t.Run("without limit, all goroutines run", func(t *testing.T) { - // 100 <= limit <= 100000 - limit := rand.Int63n(100000-100) + 100 + // 100 <= limit <= 10000 + limit := rand.Int63n(10000-100) + 100 t.Logf("running %d goroutines", limit) g := NewGroup(uint64(limit), time.Second, noopLogger{}, "") diff --git a/libbeat/esleg/eslegclient/bulkapi.go b/libbeat/esleg/eslegclient/bulkapi.go index 56a36ccf145..8512426ed9e 100644 --- a/libbeat/esleg/eslegclient/bulkapi.go +++ b/libbeat/esleg/eslegclient/bulkapi.go @@ -23,7 +23,6 @@ import ( "encoding/json" "errors" "io" - "io/ioutil" "net/http" "strings" @@ -60,8 +59,8 @@ type bulkRequest struct { requ *http.Request } -// BulkResult contains the result of a bulk API request. -type BulkResult json.RawMessage +// BulkResponse contains the result of a bulk API request. +type BulkResponse json.RawMessage // Bulk performs many index/delete operations in a single API call. 
// Implements: http://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html @@ -69,7 +68,7 @@ func (conn *Connection) Bulk( ctx context.Context, index, docType string, params map[string]string, body []interface{}, -) (int, BulkResult, error) { +) (int, BulkResponse, error) { if len(body) == 0 { return 0, nil, nil } @@ -142,7 +141,7 @@ func (r *bulkRequest) reset(body BodyEncoder) { rc, ok := bdy.(io.ReadCloser) if !ok && body != nil { - rc = ioutil.NopCloser(bdy) + rc = io.NopCloser(bdy) } switch v := bdy.(type) { @@ -160,9 +159,9 @@ func (r *bulkRequest) reset(body BodyEncoder) { body.AddHeader(&r.requ.Header) } -func (conn *Connection) sendBulkRequest(requ *bulkRequest) (int, BulkResult, error) { +func (conn *Connection) sendBulkRequest(requ *bulkRequest) (int, BulkResponse, error) { status, resp, err := conn.execHTTPRequest(requ.requ) - return status, BulkResult(resp), err + return status, BulkResponse(resp), err } func bulkEncode(log *logp.Logger, out BulkWriter, body []interface{}) error { diff --git a/libbeat/monitoring/report/elasticsearch/client.go b/libbeat/monitoring/report/elasticsearch/client.go index fbc4fcef772..56f56ac8e1e 100644 --- a/libbeat/monitoring/report/elasticsearch/client.go +++ b/libbeat/monitoring/report/elasticsearch/client.go @@ -224,7 +224,7 @@ func getMonitoringIndexName() string { return fmt.Sprintf(".monitoring-beats-%v-%s", version, date) } -func logBulkFailures(log *logp.Logger, result eslegclient.BulkResult, events []report.Event) { +func logBulkFailures(log *logp.Logger, result eslegclient.BulkResponse, events []report.Event) { var response struct { Items []map[string]map[string]interface{} `json:"items"` } diff --git a/libbeat/outputs/console/console.go b/libbeat/outputs/console/console.go index f723bf818c9..867316fe3a9 100644 --- a/libbeat/outputs/console/console.go +++ b/libbeat/outputs/console/console.go @@ -111,8 +111,8 @@ func (c *console) Publish(_ context.Context, batch publisher.Batch) error { 
c.writer.Flush() batch.ACK() - st.Dropped(dropped) - st.Acked(len(events) - dropped) + st.PermanentErrors(dropped) + st.AckedEvents(len(events) - dropped) return nil } diff --git a/libbeat/outputs/discard/discard.go b/libbeat/outputs/discard/discard.go index c9a51b0f33d..bfd1a1c1add 100644 --- a/libbeat/outputs/discard/discard.go +++ b/libbeat/outputs/discard/discard.go @@ -70,7 +70,7 @@ func (out *discardOutput) Publish(_ context.Context, batch publisher.Batch) erro st := out.observer events := batch.Events() st.NewBatch(len(events)) - st.Acked(len(events)) + st.AckedEvents(len(events)) return nil } diff --git a/libbeat/outputs/elasticsearch/client.go b/libbeat/outputs/elasticsearch/client.go index 0892ce40173..e05c4e0b261 100644 --- a/libbeat/outputs/elasticsearch/client.go +++ b/libbeat/outputs/elasticsearch/client.go @@ -26,6 +26,7 @@ import ( "time" "go.elastic.co/apm/v2" + "gotest.tools/gotestsum/log" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/beat/events" @@ -66,7 +67,11 @@ type clientSettings struct { connection eslegclient.ConnectionSettings indexSelector outputs.IndexSelector pipelineSelector *outil.Selector - observer outputs.Observer + + // The metrics observer from the clientSettings, or a no-op placeholder if + // none is provided. This variable is always non-nil for a client created + // via NewClient. + observer outputs.Observer // If deadLetterIndex is set, events with bulk-ingest errors will be // forwarded to this index. Otherwise, they will be dropped. @@ -76,15 +81,40 @@ type clientSettings struct { type bulkResultStats struct { acked int // number of events ACKed by Elasticsearch duplicates int // number of events failed with `create` due to ID already being indexed - fails int // number of failed events (can be retried) - nonIndexable int // number of failed events (not indexable) + fails int // number of events with retryable failures. + nonIndexable int // number of events with permanent failures. 
+ deadLetter int // number of failed events ingested to the dead letter index. tooMany int // number of events receiving HTTP 429 Too Many Requests } +type bulkResult struct { + // A connection-level error if the request couldn't be sent or the response + // couldn't be read. This error is returned from (*Client).Publish to signal + // to the pipeline that this output worker needs to be reconnected before the + // next Publish call. + connErr error + + // The array of events sent via bulk request. This excludes any events that + // had encoding errors while assembling the request. + events []publisher.Event + + // The http status returned by the bulk request. + status int + + // The API response from Elasticsearch. + response eslegclient.BulkResponse +} + const ( defaultEventType = "doc" ) +// Flags passed with the Bulk API request: we filter the response to include +// only the fields we need for checking request/item state. +var bulkRequestParams = map[string]string{ + "filter_path": "errors,items.*.error,items.*.status", +} + // NewClient instantiates a new client. func NewClient( s clientSettings, @@ -125,11 +155,17 @@ func NewClient( return nil } + // Make sure there's a non-nil observer + observer := s.observer + if observer == nil { + observer = outputs.NewNilObserver() + } + client := &Client{ conn: *conn, indexSelector: s.indexSelector, pipelineSelector: pipeline, - observer: s.observer, + observer: observer, deadLetterIndex: s.deadLetterIndex, log: logp.NewLogger("elasticsearch"), @@ -178,119 +214,104 @@ func (client *Client) Clone() *Client { } func (client *Client) Publish(ctx context.Context, batch publisher.Batch) error { - events := batch.Events() - rest, err := client.publishEvents(ctx, events) - - switch { - case errors.Is(err, errPayloadTooLarge): - if batch.SplitRetry() { - // Report that we split a batch - client.observer.Split() - } else { - // If the batch could not be split, there is no option left but - // to drop it and log the error state.
- batch.Drop() - client.observer.Dropped(len(events)) - err := apm.CaptureError(ctx, fmt.Errorf("failed to perform bulk index operation: %w", err)) - err.Send() - client.log.Error(err) - } - // Returning an error from Publish forces a client close / reconnect, - // so don't pass this error through since it doesn't indicate anything - // wrong with the connection. - return nil - case len(rest) == 0: - batch.ACK() - default: - batch.RetryEvents(rest) - } - return err -} - -// PublishEvents sends all events to elasticsearch. On error a slice with all -// events not published or confirmed to be processed by elasticsearch will be -// returned. The input slice backing memory will be reused by return the value. -func (client *Client) publishEvents(ctx context.Context, data []publisher.Event) ([]publisher.Event, error) { span, ctx := apm.StartSpan(ctx, "publishEvents", "output") defer span.End() + span.Context.SetLabel("events_original", len(batch.Events())) + client.observer.NewBatch(len(batch.Events())) + + // Create and send the bulk request. + bulkResult := client.doBulkRequest(ctx, batch) + span.Context.SetLabel("events_encoded", len(bulkResult.events)) + if bulkResult.connErr != nil { + // If there was a connection-level error there is no per-item response, + // handle it and return. + return client.handleBulkResultError(ctx, batch, bulkResult) + } + span.Context.SetLabel("events_published", len(bulkResult.events)) - st := client.observer + // At this point we have an Elasticsearch response for our request, + // check and report the per-item results. 
+ eventsToRetry, stats := client.bulkCollectPublishFails(bulkResult) + stats.reportToObserver(client.observer) - if st != nil { - st.NewBatch(len(data)) + if len(eventsToRetry) > 0 { + span.Context.SetLabel("events_failed", len(eventsToRetry)) + batch.RetryEvents(eventsToRetry) + } else { + batch.ACK() } + return nil +} - if len(data) == 0 { - return nil, nil - } +// Encode a batch's events into a bulk publish request, send the request to +// Elasticsearch, and return the resulting metadata. +// Reports the network request latency to the client's metrics observer. +// The events list in the result will be shorter than the original batch if +// some events couldn't be encoded. In this case, the removed events will +// be reported to the Client's metrics observer via PermanentErrors. +func (client *Client) doBulkRequest( + ctx context.Context, + batch publisher.Batch, +) bulkResult { + var result bulkResult + + rawEvents := batch.Events() // encode events into bulk request buffer, dropping failed elements from // events slice - origCount := len(data) - span.Context.SetLabel("events_original", origCount) - data, bulkItems := client.bulkEncodePublishRequest(client.conn.GetVersion(), data) - newCount := len(data) - span.Context.SetLabel("events_encoded", newCount) - if st != nil && origCount > newCount { - st.Dropped(origCount - newCount) - } - if newCount == 0 { - return nil, nil - } - - begin := time.Now() - params := map[string]string{"filter_path": "errors,items.*.error,items.*.status"} - status, result, sendErr := client.conn.Bulk(ctx, "", "", params, bulkItems) - timeSinceSend := time.Since(begin) - - if sendErr != nil { - if status == http.StatusRequestEntityTooLarge { - // This error must be handled by splitting the batch, propagate it - // back to Publish instead of reporting it directly - return data, errPayloadTooLarge + resultEvents, bulkItems := client.bulkEncodePublishRequest(client.conn.GetVersion(), rawEvents) + result.events = resultEvents + 
client.observer.PermanentErrors(len(rawEvents) - len(resultEvents)) + + // If we encoded any events, send the network request. + if len(result.events) > 0 { + begin := time.Now() + result.status, result.response, result.connErr = + client.conn.Bulk(ctx, "", "", bulkRequestParams, bulkItems) + if result.connErr == nil { + duration := time.Since(begin) + client.observer.ReportLatency(duration) + client.log.Debugf( + "doBulkRequest: %d events have been sent to elasticsearch in %v.", + len(result.events), duration) } - err := apm.CaptureError(ctx, fmt.Errorf("failed to perform any bulk index operations: %w", sendErr)) - err.Send() - client.log.Error(err) - return data, sendErr - } - pubCount := len(data) - span.Context.SetLabel("events_published", pubCount) - - client.log.Debugf("PublishEvents: %d events have been published to elasticsearch in %v.", - pubCount, - timeSinceSend) - - // check response for transient errors - var failedEvents []publisher.Event - var stats bulkResultStats - if status != 200 { - failedEvents = data - stats.fails = len(failedEvents) - } else { - failedEvents, stats = client.bulkCollectPublishFails(result, data) } - failed := len(failedEvents) - span.Context.SetLabel("events_failed", failed) - if st := client.observer; st != nil { - dropped := stats.nonIndexable - duplicates := stats.duplicates - acked := len(data) - failed - dropped - duplicates - - st.Acked(acked) - st.Failed(failed) - st.Dropped(dropped) - st.Duplicate(duplicates) - st.ErrTooMany(stats.tooMany) - st.ReportLatency(timeSinceSend) + return result +} +func (client *Client) handleBulkResultError( + ctx context.Context, batch publisher.Batch, bulkResult bulkResult, +) error { + if bulkResult.status == http.StatusRequestEntityTooLarge { + if batch.SplitRetry() { + // Report that we split a batch + client.observer.BatchSplit() + client.observer.RetryableErrors(len(bulkResult.events)) + } else { + // If the batch could not be split, there is no option left but + // to drop it and 
log the error state. + batch.Drop() + client.observer.PermanentErrors(len(bulkResult.events)) + client.log.Error(errPayloadTooLarge) + } + // Don't propagate a too-large error since it doesn't indicate a problem + // with the connection. + return nil } + err := apm.CaptureError(ctx, fmt.Errorf("failed to perform any bulk index operations: %w", bulkResult.connErr)) + err.Send() + client.log.Error(err) - if failed > 0 { - return failedEvents, eslegclient.ErrTempBulkFailure + if len(bulkResult.events) > 0 { + // At least some events failed, retry them + batch.RetryEvents(bulkResult.events) + } else { + // All events were sent successfully + batch.ACK() } - return nil, nil + client.observer.RetryableErrors(len(bulkResult.events)) + return bulkResult.connErr } // bulkEncodePublishRequest encodes all bulk requests and returns slice of events @@ -380,80 +401,108 @@ func getPipeline(event *beat.Event, defaultSelector *outil.Selector) (string, er // to be tried again due to error code returned for that items. If indexing an // event failed due to some error in the event itself (e.g. does not respect mapping), // the event will be dropped. -func (client *Client) bulkCollectPublishFails(result eslegclient.BulkResult, data []publisher.Event) ([]publisher.Event, bulkResultStats) { - reader := newJSONReader(result) +// Each of the events will be reported in the returned stats as exactly one of +// acked, duplicates, fails, nonIndexable, or deadLetter. 
+func (client *Client) bulkCollectPublishFails(bulkResult bulkResult) ([]publisher.Event, bulkResultStats) { + events := bulkResult.events + + if len(bulkResult.events) == 0 { + // No events to process + return nil, bulkResultStats{} + } + if bulkResult.status != 200 { + return events, bulkResultStats{fails: len(events)} + } + reader := newJSONReader(bulkResult.response) if err := bulkReadToItems(reader); err != nil { client.log.Errorf("failed to parse bulk response: %v", err.Error()) - return nil, bulkResultStats{} + return events, bulkResultStats{fails: len(events)} } - count := len(data) - failed := data[:0] + count := len(events) + eventsToRetry := events[:0] stats := bulkResultStats{} for i := 0; i < count; i++ { - status, msg, err := bulkReadItemStatus(client.log, reader) + itemStatus, itemMessage, err := bulkReadItemStatus(client.log, reader) if err != nil { - client.log.Error(err) - return nil, bulkResultStats{} + // The response json is invalid, mark the remaining events for retry. + stats.fails += count - i + eventsToRetry = append(eventsToRetry, events[i:]...) + break } - if status < 300 { - stats.acked++ - continue // ok value + if client.applyItemStatus(events[i], itemStatus, itemMessage, &stats) { + eventsToRetry = append(eventsToRetry, events[i]) + log.Debugf("Bulk item insert failed (i=%v, status=%v): %s", i, itemStatus, itemMessage) } + } - if status == 409 { - // 409 is used to indicate an event with same ID already exists if - // `create` op_type is used. - stats.duplicates++ - continue // ok - } + return eventsToRetry, stats +} - if status < 500 { - if status == http.StatusTooManyRequests { - stats.tooMany++ - } else { - // hard failure, apply policy action - encodedEvent := data[i].EncodedEvent.(*encodedEvent) - if encodedEvent.deadLetter { - stats.nonIndexable++ - client.log.Errorf("Can't deliver to dead letter index event (status=%v). 
Look at the event log to view the event and cause.", status) - client.log.Errorw(fmt.Sprintf("Can't deliver to dead letter index event %#v (status=%v): %s", data[i], status, msg), logp.TypeKey, logp.EventType) - // poison pill - this will clog the pipeline if the underlying failure is non transient. - } else if client.deadLetterIndex != "" { - client.log.Warnf("Cannot index event (status=%v), trying dead letter index. Look at the event log to view the event and cause.", status) - client.log.Warnw(fmt.Sprintf("Cannot index event %#v (status=%v): %s, trying dead letter index", data[i], status, msg), logp.TypeKey, logp.EventType) - client.setDeadLetter(encodedEvent, status, string(msg)) - - } else { // drop - stats.nonIndexable++ - client.log.Warnf("Cannot index event (status=%v): dropping event! Look at the event log to view the event and cause.", status) - client.log.Warnw(fmt.Sprintf("Cannot index event %#v (status=%v): %s, dropping event!", data[i], status, msg), logp.TypeKey, logp.EventType) - continue - } - } +// applyItemStatus processes the ingestion status of one event from a bulk request. +// Returns true if the item should be retried. +// In the provided bulkResultStats, applyItemStatus increments exactly one of: +// acked, duplicates, deadLetter, fails, nonIndexable. 
+func (client *Client) applyItemStatus( + event publisher.Event, + itemStatus int, + itemMessage []byte, + stats *bulkResultStats, +) bool { + encodedEvent := event.EncodedEvent.(*encodedEvent) + if itemStatus < 300 { + if encodedEvent.deadLetter { + // This was ingested into the dead letter index, not the original target + stats.deadLetter++ + } else { + stats.acked++ } + return false // no retry needed + } - client.log.Debugf("Bulk item insert failed (i=%v, status=%v): %s", i, status, msg) - stats.fails++ - failed = append(failed, data[i]) + if itemStatus == 409 { + // 409 is used to indicate there is already an event with the same ID, or + // with identical Time Series Data Stream dimensions when TSDS is active. + stats.duplicates++ + return false // no retry needed } - return failed, stats -} + if itemStatus == http.StatusTooManyRequests { + stats.fails++ + stats.tooMany++ + return true + } -func (client *Client) setDeadLetter( - encodedEvent *encodedEvent, errType int, errMsg string, -) { - encodedEvent.deadLetter = true - encodedEvent.index = client.deadLetterIndex - deadLetterReencoding := mapstr.M{ - "@timestamp": encodedEvent.timestamp, - "message": string(encodedEvent.encoding), - "error.type": errType, - "error.message": errMsg, + if itemStatus < 500 { + // hard failure, apply policy action + if encodedEvent.deadLetter { + // Fatal error while sending an already-failed event to the dead letter + // index, drop. + client.log.Errorf("Can't deliver to dead letter index event (status=%v). Look at the event log to view the event and cause.", itemStatus) + client.log.Errorw(fmt.Sprintf("Can't deliver to dead letter index event %#v (status=%v): %s", event, itemStatus, itemMessage), logp.TypeKey, logp.EventType) + stats.nonIndexable++ + return false + } + if client.deadLetterIndex == "" { + // Fatal error and no dead letter index, drop. + client.log.Warnf("Cannot index event (status=%v): dropping event! 
Look at the event log to view the event and cause.", itemStatus) + client.log.Warnw(fmt.Sprintf("Cannot index event %#v (status=%v): %s, dropping event!", event, itemStatus, itemMessage), logp.TypeKey, logp.EventType) + stats.nonIndexable++ + return false + } + // Send this failure to the dead letter index and "retry". + // We count this as a "retryable failure", and then if the dead letter + // ingestion succeeds it is counted in the "deadLetter" counter + // rather than the "acked" counter. + client.log.Warnf("Cannot index event (status=%v), trying dead letter index. Look at the event log to view the event and cause.", itemStatus) + client.log.Warnw(fmt.Sprintf("Cannot index event %#v (status=%v): %s, trying dead letter index", event, itemStatus, itemMessage), logp.TypeKey, logp.EventType) + encodedEvent.setDeadLetter(client.deadLetterIndex, itemStatus, string(itemMessage)) } - encodedEvent.encoding = []byte(deadLetterReencoding.String()) + + // Everything else gets retried. + stats.fails++ + return true } func (client *Client) Connect() error { @@ -471,3 +520,13 @@ func (client *Client) String() string { func (client *Client) Test(d testing.Driver) { client.conn.Test(d) } + +func (stats bulkResultStats) reportToObserver(ob outputs.Observer) { + ob.AckedEvents(stats.acked) + ob.RetryableErrors(stats.fails) + ob.PermanentErrors(stats.nonIndexable) + ob.DuplicateEvents(stats.duplicates) + ob.DeadLetterEvents(stats.deadLetter) + + ob.ErrTooMany(stats.tooMany) +} diff --git a/libbeat/outputs/elasticsearch/client_integration_test.go b/libbeat/outputs/elasticsearch/client_integration_test.go index 56567931ee4..765fd3eec5a 100644 --- a/libbeat/outputs/elasticsearch/client_integration_test.go +++ b/libbeat/outputs/elasticsearch/client_integration_test.go @@ -238,10 +238,7 @@ func TestClientBulkPublishEventsWithDeadletterIndex(t *testing.T) { "testfield": "foo0", }, })) - err = output.Publish(context.Background(), batch) - if err == nil { - t.Fatal("Expecting mapping 
conflict") - } + _ = output.Publish(context.Background(), batch) _, _, err = client.conn.Refresh(deadletterIndex) if err == nil { t.Fatal("expecting index to not exist yet") diff --git a/libbeat/outputs/elasticsearch/client_test.go b/libbeat/outputs/elasticsearch/client_test.go index 429b600d11a..5124c0defe9 100644 --- a/libbeat/outputs/elasticsearch/client_test.go +++ b/libbeat/outputs/elasticsearch/client_test.go @@ -49,6 +49,7 @@ import ( c "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/mapstr" + "github.com/elastic/elastic-agent-libs/monitoring" libversion "github.com/elastic/elastic-agent-libs/version" ) @@ -83,17 +84,19 @@ func (bm *batchMock) RetryEvents(events []publisher.Event) { } func TestPublish(t *testing.T) { - makePublishTestClient := func(t *testing.T, url string) *Client { + + makePublishTestClient := func(t *testing.T, url string) (*Client, *monitoring.Registry) { + reg := monitoring.NewRegistry() client, err := NewClient( clientSettings{ - observer: outputs.NewNilObserver(), + observer: outputs.NewStats(reg), connection: eslegclient.ConnectionSettings{URL: url}, indexSelector: testIndexSelector{}, }, nil, ) require.NoError(t, err) - return client + return client, reg } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) @@ -109,7 +112,7 @@ func TestPublish(t *testing.T) { _, _ = w.Write([]byte("Request failed to get to the server (status code: 413)")) // actual response from ES })) defer esMock.Close() - client := makePublishTestClient(t, esMock.URL) + client, reg := makePublishTestClient(t, esMock.URL) // Try publishing a batch that can be split batch := encodeBatch(client, &batchMock{ @@ -120,6 +123,8 @@ func TestPublish(t *testing.T) { assert.NoError(t, err, "Publish should split the batch without error") assert.True(t, batch.didSplit, "batch should be split") + assertRegistryUint(t, reg, "events.failed", 1, "Splitting a batch should report 
the event as failed/retried") + assertRegistryUint(t, reg, "events.dropped", 0, "Splitting a batch should not report any dropped events") // Try publishing a batch that cannot be split batch = encodeBatch(client, &batchMock{ @@ -131,6 +136,9 @@ func TestPublish(t *testing.T) { assert.NoError(t, err, "Publish should drop the batch without error") assert.False(t, batch.didSplit, "batch should not be split") assert.True(t, batch.drop, "unsplittable batch should be dropped") + assertRegistryUint(t, reg, "events.failed", 1, "Failed batch split should not report any more retryable failures") + assertRegistryUint(t, reg, "events.dropped", 1, "Failed batch split should report a dropped event") + }) t.Run("retries the batch if bad HTTP status", func(t *testing.T) { @@ -138,7 +146,7 @@ func TestPublish(t *testing.T) { w.WriteHeader(http.StatusInternalServerError) })) defer esMock.Close() - client := makePublishTestClient(t, esMock.URL) + client, reg := makePublishTestClient(t, esMock.URL) batch := encodeBatch(client, &batchMock{ events: []publisher.Event{event1, event2}, @@ -149,40 +157,44 @@ func TestPublish(t *testing.T) { assert.Error(t, err) assert.False(t, batch.ack, "should not be acknowledged") assert.Len(t, batch.retryEvents, 2, "all events should be retried") + assertRegistryUint(t, reg, "events.failed", 2, "HTTP failure should report failed events") }) t.Run("live batches, still too big after split", func(t *testing.T) { - // Test a live (non-mocked) batch where both events by themselves are - // rejected by the server as too large after the initial split. + // Test a live (non-mocked) batch where all three events by themselves are + // rejected by the server as too large after the initial batch splits. 
esMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusRequestEntityTooLarge) _, _ = w.Write([]byte("Request failed to get to the server (status code: 413)")) // actual response from ES })) defer esMock.Close() - client := makePublishTestClient(t, esMock.URL) + client, reg := makePublishTestClient(t, esMock.URL) // Because our tests don't use a live eventConsumer routine, // everything will happen synchronously and it's safe to track // test results directly without atomics/mutexes. done := false retryCount := 0 + var retryBatches []publisher.Batch batch := encodeBatch(client, pipeline.NewBatchForTesting( []publisher.Event{event1, event2, event3}, func(b publisher.Batch) { // The retry function sends the batch back through Publish. // In a live pipeline it would instead be sent to eventConsumer - // first and then back to Publish when an output worker was - // available. + // and then back to Publish when an output worker was available. retryCount++ - // We shouldn't need to re-encode the events since that was done - // before the initial Publish call - err := client.Publish(ctx, b) - assert.NoError(t, err, "Publish should return without error") + retryBatches = append(retryBatches, b) }, func() { done = true }, )) - err := client.Publish(ctx, batch) - assert.NoError(t, err, "Publish should return without error") + retryBatches = []publisher.Batch{batch} + // Loop until all pending retries are complete, the same as a pipeline caller would. + for len(retryBatches) > 0 { + batch := retryBatches[0] + retryBatches = retryBatches[1:] + err := client.Publish(ctx, batch) + assert.NoError(t, err, "Publish should return without error") + } // For three events there should be four retries in total: // {[event1], [event2, event3]}, then {[event2], [event3]}. @@ -190,6 +202,15 @@ func TestPublish(t *testing.T) { // events, all 3 will fail and be dropped. 
assert.Equal(t, 4, retryCount, "3-event batch should produce 4 total retries") assert.True(t, done, "batch should be marked as done") + // Metrics should report: + // 8 total events (3 + 1 + 2 + 1 + 1 from the batches described above) + // 3 dropped events (each event is dropped once) + // 5 failed events (8 - 3, for each event's attempted publish calls before being dropped) + // 0 active events (because Publish is complete) + assertRegistryUint(t, reg, "events.total", 8, "Publish is called on 8 events total") + assertRegistryUint(t, reg, "events.dropped", 3, "All 3 events should be dropped") + assertRegistryUint(t, reg, "events.failed", 5, "Split batches should retry 5 events before dropping them") + assertRegistryUint(t, reg, "events.active", 0, "Active events should be zero when Publish returns") }) t.Run("live batches, one event too big after split", func(t *testing.T) { @@ -206,32 +227,36 @@ func TestPublish(t *testing.T) { } else { // Report success with no events dropped w.WriteHeader(200) - _, _ = io.WriteString(w, "{\"items\": []}") + _, _ = io.WriteString(w, "{\"items\": [{\"index\":{\"status\":200}},{\"index\":{\"status\":200}},{\"index\":{\"status\":200}}]}") } })) defer esMock.Close() - client := makePublishTestClient(t, esMock.URL) + client, reg := makePublishTestClient(t, esMock.URL) // Because our tests don't use a live eventConsumer routine, // everything will happen synchronously and it's safe to track // test results directly without atomics/mutexes. done := false retryCount := 0 + var retryBatches []publisher.Batch batch := encodeBatch(client, pipeline.NewBatchForTesting( []publisher.Event{event1, event2, event3}, func(b publisher.Batch) { // The retry function sends the batch back through Publish. // In a live pipeline it would instead be sent to eventConsumer - // first and then back to Publish when an output worker was - // available. + // and then back to Publish when an output worker was available. 
retryCount++ - err := client.Publish(ctx, b) - assert.NoError(t, err, "Publish should return without error") + retryBatches = append(retryBatches, b) }, func() { done = true }, )) - err := client.Publish(ctx, batch) - assert.NoError(t, err, "Publish should return without error") + retryBatches = []publisher.Batch{batch} + for len(retryBatches) > 0 { + batch := retryBatches[0] + retryBatches = retryBatches[1:] + err := client.Publish(ctx, batch) + assert.NoError(t, err, "Publish should return without error") + } // There should be two retries: {[event1], [event2, event3]}. // The first split batch should fail and be dropped since it contains @@ -240,9 +265,26 @@ func TestPublish(t *testing.T) { // (one with failure, one with success). assert.Equal(t, 2, retryCount, "splitting with one large event should produce two retries") assert.True(t, done, "batch should be marked as done") + // The metrics should show: + // 6 total events (3 + 1 + 2) + // 1 dropped event (because only one event is uningestable) + // 2 acked events (because the other two ultimately succeed) + // 3 failed events (because all events fail and are retried on the first call) + // 0 active events (because Publish is finished) + assertRegistryUint(t, reg, "events.total", 6, "Publish is called on 6 events total") + assertRegistryUint(t, reg, "events.dropped", 1, "One event should be dropped") + assertRegistryUint(t, reg, "events.failed", 3, "Split batches should retry 3 events before dropping them") + assertRegistryUint(t, reg, "events.active", 0, "Active events should be zero when Publish returns") }) } +func assertRegistryUint(t *testing.T, reg *monitoring.Registry, key string, expected uint64, message string) { + t.Helper() + value := reg.Get(key).(*monitoring.Uint) + assert.NotNilf(t, value, "expected registry entry for key '%v'", key) + assert.Equal(t, expected, value.Get(), message) +} + func TestCollectPublishFailsNone(t *testing.T) { client, err := NewClient( clientSettings{ @@ -262,7 +304,11 
@@ func TestCollectPublishFailsNone(t *testing.T) { events[i] = publisher.Event{Content: beat.Event{Fields: event}} } - res, _ := client.bulkCollectPublishFails(response, events) + res, _ := client.bulkCollectPublishFails(bulkResult{ + events: encodeEvents(client, events), + status: 200, + response: response, + }) assert.Equal(t, 0, len(res)) } @@ -283,11 +329,16 @@ func TestCollectPublishFailMiddle(t *testing.T) { ]} `) - event := publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 1}}} - eventFail := publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 2}}} - events := []publisher.Event{event, eventFail, event} + event1 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 1}}}) + event2 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 2}}}) + eventFail := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 3}}}) + events := []publisher.Event{event1, eventFail, event2} - res, stats := client.bulkCollectPublishFails(response, events) + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) assert.Equal(t, 1, len(res)) if len(res) == 1 { assert.Equal(t, eventFail, res[0]) @@ -295,58 +346,143 @@ func TestCollectPublishFailMiddle(t *testing.T) { assert.Equal(t, bulkResultStats{acked: 2, fails: 1, tooMany: 1}, stats) } -func TestCollectPublishFailDeadLetterQueue(t *testing.T) { +func TestCollectPublishFailDeadLetterSuccess(t *testing.T) { + const deadLetterIndex = "test_index" client, err := NewClient( clientSettings{ observer: outputs.NewNilObserver(), - deadLetterIndex: "test_index", + deadLetterIndex: deadLetterIndex, }, nil, ) assert.NoError(t, err) - parseError := `{ - "root_cause" : [ - { - "type" : "mapper_parsing_exception", - "reason" : "failed to parse field [bar] of type [long] in document with id '1'. 
Preview of field's value: 'bar1'" - } - ], - "type" : "mapper_parsing_exception", - "reason" : "failed to parse field [bar] of type [long] in document with id '1'. Preview of field's value: 'bar1'", - "caused_by" : { - "type" : "illegal_argument_exception", - "reason" : "For input string: \"bar1\"" - } - }` - response := []byte(` - { "items": [ - {"create": {"status": 200}}, - {"create": { - "error" : ` + parseError + `, - "status" : 400 - } - }, - {"create": {"status": 200}} - ]} - `) + const errorMessage = "test error message" + // Return a successful response + response := []byte(`{"items": [{"create": {"status": 200}}]}`) - event := publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 1}}} - event2 := publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 2}}} - eventFail := publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": "bar1"}}} - events := encodeEvents(client, []publisher.Event{event, eventFail, event2}) + event1 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 1}}}) + event1.EncodedEvent.(*encodedEvent).setDeadLetter(deadLetterIndex, 123, errorMessage) + events := []publisher.Event{event1} - res, stats := client.bulkCollectPublishFails(response, events) - assert.Equal(t, 1, len(res)) - if len(res) == 1 { - expected := encodeEvent(client, eventFail) - encodedEvent := expected.EncodedEvent.(*encodedEvent) - // Mark the encoded event with the expected error - client.setDeadLetter(encodedEvent, 400, parseError) + // The event should be successful after being set to dead letter, so it + // should be reported in the metrics as deadLetter + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) + assert.Equal(t, bulkResultStats{acked: 0, deadLetter: 1}, stats) + assert.Equal(t, 0, len(res)) +} + +func TestCollectPublishFailFatalErrorNotRetried(t *testing.T) { + // Test that a fatal error sending to the dead letter index is reported as + // a 
dropped event, and is not retried forever + const deadLetterIndex = "test_index" + client, err := NewClient( + clientSettings{ + observer: outputs.NewNilObserver(), + deadLetterIndex: deadLetterIndex, + }, + nil, + ) + assert.NoError(t, err) + + const errorMessage = "test error message" + // Return a fatal error + response := []byte(`{"items": [{"create": {"status": 499}}]}`) + + event1 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 1}}}) + event1.EncodedEvent.(*encodedEvent).setDeadLetter(deadLetterIndex, 123, errorMessage) + events := []publisher.Event{event1} + + // The event should fail permanently while being sent to the dead letter + // index, so it should be dropped instead of retrying. + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) + assert.Equal(t, bulkResultStats{acked: 0, nonIndexable: 1}, stats) + assert.Equal(t, 0, len(res)) +} + +func TestCollectPublishFailInvalidBulkIndexResponse(t *testing.T) { + client, err := NewClient( + clientSettings{observer: outputs.NewNilObserver()}, + nil, + ) + assert.NoError(t, err) + + // Return a truncated response without valid item data + response := []byte(`{"items": [...`) - assert.Equal(t, expected, res[0]) + event1 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 1}}}) + events := []publisher.Event{event1} + + // The response body cannot be parsed, so no per-item status is available + // and the event cannot be confirmed as ingested + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) + // The event should be returned for retry, and should appear in aggregated + // stats as failed (retryable error) + assert.Equal(t, bulkResultStats{acked: 0, fails: 1}, stats) + assert.Equal(t, 1, len(res)) + if len(res) > 0 { + assert.Equal(t, event1, res[0]) } +} + +func TestCollectPublishFailDeadLetterIndex(t *testing.T) { +
const deadLetterIndex = "test_index" + client, err := NewClient( + clientSettings{ + observer: outputs.NewNilObserver(), + deadLetterIndex: deadLetterIndex, + }, + nil, + ) + assert.NoError(t, err) + + const errorMessage = "test error message" + response := []byte(` +{ + "items": [ + {"create": {"status": 200}}, + { + "create": { + "error" : "` + errorMessage + `", + "status" : 400 + } + }, + {"create": {"status": 200}} + ] +}`) + + event1 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 1}}}) + event2 := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": 2}}}) + eventFail := encodeEvent(client, publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": "bar1"}}}) + events := []publisher.Event{event1, eventFail, event2} + + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) assert.Equal(t, bulkResultStats{acked: 2, fails: 1, nonIndexable: 0}, stats) + assert.Equal(t, 1, len(res)) + if len(res) == 1 { + assert.Equalf(t, eventFail, res[0], "bulkCollectPublishFails should return failed event") + encodedEvent, ok := res[0].EncodedEvent.(*encodedEvent) + require.True(t, ok, "event must be encoded as *encodedEvent") + assert.True(t, encodedEvent.deadLetter, "failed event's dead letter flag should be set") + assert.Equalf(t, deadLetterIndex, encodedEvent.index, "failed event's index should match dead letter index") + assert.Contains(t, string(encodedEvent.encoding), errorMessage, "dead letter event should include associated error message") + } } func TestCollectPublishFailDrop(t *testing.T) { @@ -388,7 +524,11 @@ func TestCollectPublishFailDrop(t *testing.T) { eventFail := publisher.Event{Content: beat.Event{Fields: mapstr.M{"bar": "bar1"}}} events := encodeEvents(client, []publisher.Event{event, eventFail, event}) - res, stats := client.bulkCollectPublishFails(response, events) + res, stats := client.bulkCollectPublishFails(bulkResult{ + 
events: events, + status: 200, + response: response, + }) assert.Equal(t, 0, len(res)) assert.Equal(t, bulkResultStats{acked: 2, fails: 0, nonIndexable: 1}, stats) } @@ -413,7 +553,11 @@ func TestCollectPublishFailAll(t *testing.T) { event := publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 2}}} events := encodeEvents(client, []publisher.Event{event, event, event}) - res, stats := client.bulkCollectPublishFails(response, events) + res, stats := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) assert.Equal(t, 3, len(res)) assert.Equal(t, events, res) assert.Equal(t, stats, bulkResultStats{fails: 3, tooMany: 3}) @@ -462,7 +606,11 @@ func TestCollectPipelinePublishFail(t *testing.T) { event := publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 2}}} events := encodeEvents(client, []publisher.Event{event}) - res, _ := client.bulkCollectPublishFails(response, events) + res, _ := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) assert.Equal(t, 1, len(res)) assert.Equal(t, events, res) } @@ -489,7 +637,11 @@ func BenchmarkCollectPublishFailsNone(b *testing.B) { events := encodeEvents(client, []publisher.Event{event, event, event}) for i := 0; i < b.N; i++ { - res, _ := client.bulkCollectPublishFails(response, events) + res, _ := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) if len(res) != 0 { b.Fail() } @@ -518,7 +670,11 @@ func BenchmarkCollectPublishFailMiddle(b *testing.B) { events := encodeEvents(client, []publisher.Event{event, eventFail, event}) for i := 0; i < b.N; i++ { - res, _ := client.bulkCollectPublishFails(response, events) + res, _ := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) if len(res) != 1 { b.Fail() } @@ -546,7 +702,11 @@ func BenchmarkCollectPublishFailAll(b *testing.B) { events := encodeEvents(client, 
[]publisher.Event{event, event, event}) for i := 0; i < b.N; i++ { - res, _ := client.bulkCollectPublishFails(response, events) + res, _ := client.bulkCollectPublishFails(bulkResult{ + events: events, + status: 200, + response: response, + }) if len(res) != 3 { b.Fail() } @@ -791,7 +951,7 @@ func TestClientWithAPIKey(t *testing.T) { assert.Equal(t, "ApiKey aHlva0hHNEJmV2s1dmlLWjE3Mlg6bzQ1SlVreXVTLS15aVNBdXV4bDhVdw==", headers.Get("Authorization")) } -func TestPublishEventsWithBulkFiltering(t *testing.T) { +func TestBulkRequestHasFilterPath(t *testing.T) { makePublishTestClient := func(t *testing.T, url string, configParams map[string]string) *Client { client, err := NewClient( clientSettings{ @@ -813,16 +973,14 @@ func TestPublishEventsWithBulkFiltering(t *testing.T) { event1 := publisher.Event{Content: beat.Event{Fields: mapstr.M{"field": 1}}} + const filterPathKey = "filter_path" + const filterPathValue = "errors,items.*.error,items.*.status" t.Run("Single event with response filtering", func(t *testing.T) { - var expectedFilteringParams = map[string]string{ - "filter_path": "errors,items.*.error,items.*.status", - } - var recParams url.Values - + var reqParams url.Values esMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) if strings.ContainsAny("_bulk", r.URL.Path) { - recParams = r.URL.Query() + reqParams = r.URL.Query() response := []byte(`{"took":85,"errors":false,"items":[{"index":{"status":200}}]}`) _, _ = w.Write(response) } @@ -834,26 +992,23 @@ func TestPublishEventsWithBulkFiltering(t *testing.T) { defer esMock.Close() client := makePublishTestClient(t, esMock.URL, nil) - // Try publishing a batch that can be split - events := encodeEvents(client, []publisher.Event{event1}) - evt, err := client.publishEvents(ctx, events) - require.NoError(t, err) - require.Equal(t, len(recParams), len(expectedFilteringParams)) - require.Nil(t, evt) + batch := encodeBatch(client, &batchMock{events: 
[]publisher.Event{event1}}) + result := client.doBulkRequest(ctx, batch) + require.NoError(t, result.connErr) + // Only param should be the standard filter path + require.Equal(t, len(reqParams), 1, "Only bulk request param should be standard filter path") + require.Equal(t, filterPathValue, reqParams.Get(filterPathKey), "Bulk request should include standard filter path") }) t.Run("Single event with response filtering and preconfigured client params", func(t *testing.T) { var configParams = map[string]string{ "hardcoded": "yes", } - var expectedFilteringParams = map[string]string{ - "filter_path": "errors,items.*.error,items.*.status", - } - var recParams url.Values + var reqParams url.Values esMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) if strings.ContainsAny("_bulk", r.URL.Path) { - recParams = r.URL.Query() + reqParams = r.URL.Query() response := []byte(`{"took":85,"errors":false,"items":[{"index":{"status":200}}]}`) _, _ = w.Write(response) } @@ -865,69 +1020,22 @@ func TestPublishEventsWithBulkFiltering(t *testing.T) { defer esMock.Close() client := makePublishTestClient(t, esMock.URL, configParams) - // Try publishing a batch that can be split - events := encodeEvents(client, []publisher.Event{event1}) - evt, err := client.publishEvents(ctx, events) - require.NoError(t, err) - require.Equal(t, len(recParams), len(expectedFilteringParams)+len(configParams)) - require.Nil(t, evt) - }) - t.Run("Single event without response filtering", func(t *testing.T) { - var recParams url.Values - - esMock := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if strings.ContainsAny("_bulk", r.URL.Path) { - recParams = r.URL.Query() - response := []byte(`{ - "took":85, - "errors":false, - "items":[ - { - "index":{ - "_index":"test", - "_id":"1", - "_version":1, - "result":"created", - "_shards":{"total":2,"successful":1,"failed":0}, - "_seq_no":0, - "_primary_term":1, 
- "status":201 - } - } - ]}`) - _, _ = w.Write(response) - } - if strings.Contains("/", r.URL.Path) { - response := []byte(`{}`) - _, _ = w.Write(response) - } - w.WriteHeader(http.StatusOK) - - })) - defer esMock.Close() - client := makePublishTestClient(t, esMock.URL, nil) - - // Try publishing a batch that can be split - events := encodeEvents(client, []publisher.Event{event1}) - _, err := client.publishEvents(ctx, events) - require.NoError(t, err) - require.Equal(t, len(recParams), 1) + batch := encodeBatch(client, &batchMock{events: []publisher.Event{event1}}) + result := client.doBulkRequest(ctx, batch) + require.NoError(t, result.connErr) + require.Equal(t, len(reqParams), 2, "Bulk request should include configured parameter and standard filter path") + require.Equal(t, filterPathValue, reqParams.Get(filterPathKey), "Bulk request should include standard filter path") }) } func TestSetDeadLetter(t *testing.T) { dead_letter_index := "dead_index" - client := &Client{ - deadLetterIndex: dead_letter_index, - indexSelector: testIndexSelector{}, - } - e := &encodedEvent{ index: "original_index", } errType := 123 errStr := "test error string" - client.setDeadLetter(e, errType, errStr) + e.setDeadLetter(dead_letter_index, errType, errStr) assert.True(t, e.deadLetter, "setDeadLetter should set the event's deadLetter flag") assert.Equal(t, dead_letter_index, e.index, "setDeadLetter should overwrite the event's original index") diff --git a/libbeat/outputs/elasticsearch/event_encoder.go b/libbeat/outputs/elasticsearch/event_encoder.go index 0441695d53c..7d345e2bc5c 100644 --- a/libbeat/outputs/elasticsearch/event_encoder.go +++ b/libbeat/outputs/elasticsearch/event_encoder.go @@ -29,6 +29,7 @@ import ( "github.com/elastic/beats/v7/libbeat/outputs/outil" "github.com/elastic/beats/v7/libbeat/publisher" "github.com/elastic/beats/v7/libbeat/publisher/queue" + "github.com/elastic/elastic-agent-libs/mapstr" ) type eventEncoder struct { @@ -136,3 +137,17 @@ func (pe 
*eventEncoder) encodeRawEvent(e *beat.Event) *encodedEvent { encoding: bytes, } } + +func (e *encodedEvent) setDeadLetter( + deadLetterIndex string, errType int, errMsg string, +) { + e.deadLetter = true + e.index = deadLetterIndex + deadLetterReencoding := mapstr.M{ + "@timestamp": e.timestamp, + "message": string(e.encoding), + "error.type": errType, + "error.message": errMsg, + } + e.encoding = []byte(deadLetterReencoding.String()) +} diff --git a/libbeat/outputs/fileout/file.go b/libbeat/outputs/fileout/file.go index 87b50f62c1a..f650ff3f964 100644 --- a/libbeat/outputs/fileout/file.go +++ b/libbeat/outputs/fileout/file.go @@ -159,9 +159,9 @@ func (out *fileOutput) Publish(_ context.Context, batch publisher.Batch) error { st.ReportLatency(took) } - st.Dropped(dropped) + st.PermanentErrors(dropped) - st.Acked(len(events) - dropped) + st.AckedEvents(len(events) - dropped) return nil } diff --git a/libbeat/outputs/kafka/client.go b/libbeat/outputs/kafka/client.go index afeb02a5534..1780f1392b3 100644 --- a/libbeat/outputs/kafka/client.go +++ b/libbeat/outputs/kafka/client.go @@ -171,7 +171,7 @@ func (c *client) Publish(_ context.Context, batch publisher.Batch) error { if err != nil { c.log.Errorf("Dropping event: %+v", err) ref.done() - c.observer.Dropped(1) + c.observer.PermanentErrors(1) continue } @@ -360,13 +360,13 @@ func (r *msgRef) fail(msg *message, err error) { switch { case errors.Is(err, sarama.ErrInvalidMessage): r.client.log.Errorf("Kafka (topic=%v): dropping invalid message", msg.topic) - r.client.observer.Dropped(1) + r.client.observer.PermanentErrors(1) case errors.Is(err, sarama.ErrMessageSizeTooLarge) || errors.Is(err, sarama.ErrInvalidMessageSize): r.client.log.Errorf("Kafka (topic=%v): dropping too large message of size %v.", msg.topic, len(msg.key)+len(msg.value)) - r.client.observer.Dropped(1) + r.client.observer.PermanentErrors(1) case errors.Is(err, breaker.ErrBreakerOpen): // Add this message to the failed list, but don't overwrite r.err 
since @@ -399,15 +399,15 @@ func (r *msgRef) dec() { success := r.total - failed r.batch.RetryEvents(r.failed) - stats.Failed(failed) + stats.RetryableErrors(failed) if success > 0 { - stats.Acked(success) + stats.AckedEvents(success) } r.client.log.Debugf("Kafka publish failed with: %+v", err) } else { r.batch.ACK() - stats.Acked(r.total) + stats.AckedEvents(r.total) } } diff --git a/libbeat/outputs/logstash/async.go b/libbeat/outputs/logstash/async.go index 1458ee9d382..b1e20a0e774 100644 --- a/libbeat/outputs/logstash/async.go +++ b/libbeat/outputs/logstash/async.go @@ -238,7 +238,7 @@ func (r *msgRef) callback(seq uint32, err error) { } func (r *msgRef) done(n uint32) { - r.client.observer.Acked(int(n)) + r.client.observer.AckedEvents(int(n)) r.slice = r.slice[n:] if r.win != nil { r.win.tryGrowWindow(r.batchSize) @@ -255,7 +255,7 @@ func (r *msgRef) fail(n uint32, err error) { r.win.shrinkWindow() } - r.client.observer.Acked(int(n)) + r.client.observer.AckedEvents(int(n)) r.dec() } @@ -267,7 +267,7 @@ func (r *msgRef) dec() { } if L := len(r.slice); L > 0 { - r.client.observer.Failed(L) + r.client.observer.RetryableErrors(L) } err := r.err diff --git a/libbeat/outputs/logstash/sync.go b/libbeat/outputs/logstash/sync.go index 2a49324c46f..d24ab1ebb97 100644 --- a/libbeat/outputs/logstash/sync.go +++ b/libbeat/outputs/logstash/sync.go @@ -149,7 +149,7 @@ func (c *syncClient) Publish(_ context.Context, batch publisher.Batch) error { n, len(events), c.Host()) events = events[n:] - st.Acked(n) + st.AckedEvents(n) if err != nil { // return batch to pipeline before reporting/counting error batch.RetryEvents(events) @@ -162,7 +162,7 @@ func (c *syncClient) Publish(_ context.Context, batch publisher.Batch) error { c.log.Errorf("Failed to publish events caused by: %+v", err) rest := len(events) - st.Failed(rest) + st.RetryableErrors(rest) return err } diff --git a/libbeat/outputs/metrics.go b/libbeat/outputs/metrics.go index 5502c4e4ae0..7e47c6e7ab9 100644 --- 
a/libbeat/outputs/metrics.go +++ b/libbeat/outputs/metrics.go @@ -32,18 +32,41 @@ type Stats struct { // // Output event stats // - batches *monitoring.Uint // total number of batches processed by output - events *monitoring.Uint // total number of events processed by output - acked *monitoring.Uint // total number of events ACKed by output - failed *monitoring.Uint // total number of events failed in output - active *monitoring.Uint // events sent and waiting for ACK/fail from output - duplicates *monitoring.Uint // events sent and waiting for ACK/fail from output - dropped *monitoring.Uint // total number of invalid events dropped by the output - tooMany *monitoring.Uint // total number of too many requests replies from output + // Number of calls to the output's Publish function + eventsBatches *monitoring.Uint + + // Number of events sent to the output's Publish function. + eventsTotal *monitoring.Uint + + // Number of events accepted by the output's receiver. + eventsACKed *monitoring.Uint + + // Number of failed events ingested to the dead letter index + eventsDeadLetter *monitoring.Uint + + // Number of events that reported a retryable error from the output's + // receiver. + eventsFailed *monitoring.Uint + + // Number of events that were dropped due to a non-retryable error. + eventsDropped *monitoring.Uint + + // Number of events rejected by the output's receiver for being duplicates. + eventsDuplicates *monitoring.Uint + + // (Gauge) Number of events that have been sent to the output's Publish + // call but have not yet been ACKed or failed. + eventsActive *monitoring.Uint // (gauge) events sent and waiting for ACK/fail from output + + // Number of events that failed due to a "429 too many requests" error. + // These events are also included in eventsFailed.
+ eventsTooMany *monitoring.Uint // Output batch stats - split *monitoring.Uint // total number of batches split for being too large + + // Number of times a batch was split for being too large + batchesSplit *monitoring.Uint // // Output network connection stats @@ -62,16 +85,17 @@ type Stats struct { // The registry must not be null. func NewStats(reg *monitoring.Registry) *Stats { obj := &Stats{ - batches: monitoring.NewUint(reg, "events.batches"), - events: monitoring.NewUint(reg, "events.total"), - acked: monitoring.NewUint(reg, "events.acked"), - failed: monitoring.NewUint(reg, "events.failed"), - dropped: monitoring.NewUint(reg, "events.dropped"), - duplicates: monitoring.NewUint(reg, "events.duplicates"), - active: monitoring.NewUint(reg, "events.active"), - tooMany: monitoring.NewUint(reg, "events.toomany"), - - split: monitoring.NewUint(reg, "batches.split"), + eventsBatches: monitoring.NewUint(reg, "events.batches"), + eventsTotal: monitoring.NewUint(reg, "events.total"), + eventsACKed: monitoring.NewUint(reg, "events.acked"), + eventsDeadLetter: monitoring.NewUint(reg, "events.dead_letter"), + eventsFailed: monitoring.NewUint(reg, "events.failed"), + eventsDropped: monitoring.NewUint(reg, "events.dropped"), + eventsDuplicates: monitoring.NewUint(reg, "events.duplicates"), + eventsActive: monitoring.NewUint(reg, "events.active"), + eventsTooMany: monitoring.NewUint(reg, "events.toomany"), + + batchesSplit: monitoring.NewUint(reg, "batches.split"), writeBytes: monitoring.NewUint(reg, "write.bytes"), writeErrors: monitoring.NewUint(reg, "write.errors"), @@ -88,9 +112,9 @@ func NewStats(reg *monitoring.Registry) *Stats { // NewBatch updates active batch and event metrics. 
func (s *Stats) NewBatch(n int) { if s != nil { - s.batches.Inc() - s.events.Add(uint64(n)) - s.active.Add(uint64(n)) + s.eventsBatches.Inc() + s.eventsTotal.Add(uint64(n)) + s.eventsActive.Add(uint64(n)) } } @@ -98,59 +122,59 @@ func (s *Stats) ReportLatency(time time.Duration) { s.sendLatencyMillis.Update(time.Milliseconds()) } -// Acked updates active and acked event metrics. -func (s *Stats) Acked(n int) { +// AckedEvents updates active and acked event metrics. +func (s *Stats) AckedEvents(n int) { if s != nil { - s.acked.Add(uint64(n)) - s.active.Sub(uint64(n)) + s.eventsACKed.Add(uint64(n)) + s.eventsActive.Sub(uint64(n)) } } -// Failed updates active and failed event metrics. -func (s *Stats) Failed(n int) { +func (s *Stats) DeadLetterEvents(n int) { if s != nil { - s.failed.Add(uint64(n)) - s.active.Sub(uint64(n)) + s.eventsDeadLetter.Add(uint64(n)) + s.eventsActive.Sub(uint64(n)) } } -// Duplicate updates the active and duplicate event metrics. -func (s *Stats) Duplicate(n int) { +// RetryableErrors updates active and failed event metrics. +func (s *Stats) RetryableErrors(n int) { if s != nil { - s.duplicates.Add(uint64(n)) - s.active.Sub(uint64(n)) + s.eventsFailed.Add(uint64(n)) + s.eventsActive.Sub(uint64(n)) } } -// Dropped updates total number of event drops as reported by the output. -// Outputs will only report dropped events on fatal errors which lead to the -// event not being publishable. For example encoding errors or total event size -// being bigger then maximum supported event size. -func (s *Stats) Dropped(n int) { - // number of dropped events (e.g. encoding failures) +// DuplicateEvents updates the active and duplicate event metrics. +func (s *Stats) DuplicateEvents(n int) { if s != nil { - s.active.Sub(uint64(n)) - s.dropped.Add(uint64(n)) + s.eventsDuplicates.Add(uint64(n)) + s.eventsActive.Sub(uint64(n)) } } -// Cancelled updates the active event metrics. 
-func (s *Stats) Cancelled(n int) { +// PermanentErrors updates total number of event drops as reported by the output. +// Outputs will only report dropped events on fatal errors which lead to the +// event not being publishable. For example encoding errors or total event size +// being bigger then maximum supported event size. +func (s *Stats) PermanentErrors(n int) { + // number of dropped events (e.g. encoding failures) if s != nil { - s.active.Sub(uint64(n)) + s.eventsActive.Sub(uint64(n)) + s.eventsDropped.Add(uint64(n)) } } -func (s *Stats) Split() { +func (s *Stats) BatchSplit() { if s != nil { - s.split.Inc() + s.batchesSplit.Inc() } } // ErrTooMany updates the number of Too Many Requests responses reported by the output. func (s *Stats) ErrTooMany(n int) { if s != nil { - s.tooMany.Add(uint64(n)) + s.eventsTooMany.Add(uint64(n)) } } diff --git a/libbeat/outputs/observer.go b/libbeat/outputs/observer.go index 3a330e4a43a..28d40f90dbd 100644 --- a/libbeat/outputs/observer.go +++ b/libbeat/outputs/observer.go @@ -22,19 +22,23 @@ import "time" // Observer provides an interface used by outputs to report common events on // documents/events being published and I/O workload. type Observer interface { - NewBatch(int) // report new batch being processed with number of events + NewBatch(int) // report new batch being processed with number of events + + RetryableErrors(int) // report number of events with retryable errors + PermanentErrors(int) // report number of events dropped due to permanent errors + DuplicateEvents(int) // report number of events detected as duplicates (e.g. 
on resends) + DeadLetterEvents(int) // report number of failed events ingested to dead letter index + AckedEvents(int) // report number of acked events + ErrTooMany(int) // report too many requests response + + BatchSplit() // report a batch was split for being too large to ingest + + WriteError(error) // report an I/O error on write + WriteBytes(int) // report number of bytes being written + ReadError(error) // report an I/O error on read + ReadBytes(int) // report number of bytes being read + ReportLatency(time.Duration) // report the duration a send to the output takes - Acked(int) // report number of acked events - Failed(int) // report number of failed events - Dropped(int) // report number of dropped events - Duplicate(int) // report number of events detected as duplicates (e.g. on resends) - Cancelled(int) // report number of cancelled events - Split() // report a batch was split for being too large to ingest - WriteError(error) // report an I/O error on write - WriteBytes(int) // report number of bytes being written - ReadError(error) // report an I/O error on read - ReadBytes(int) // report number of bytes being read - ErrTooMany(int) // report too many requests response } type emptyObserver struct{} @@ -48,12 +52,12 @@ func NewNilObserver() Observer { func (*emptyObserver) NewBatch(int) {} func (*emptyObserver) ReportLatency(_ time.Duration) {} -func (*emptyObserver) Acked(int) {} -func (*emptyObserver) Duplicate(int) {} -func (*emptyObserver) Failed(int) {} -func (*emptyObserver) Dropped(int) {} -func (*emptyObserver) Cancelled(int) {} -func (*emptyObserver) Split() {} +func (*emptyObserver) AckedEvents(int) {} +func (*emptyObserver) DeadLetterEvents(int) {} +func (*emptyObserver) DuplicateEvents(int) {} +func (*emptyObserver) RetryableErrors(int) {} +func (*emptyObserver) PermanentErrors(int) {} +func (*emptyObserver) BatchSplit() {} func (*emptyObserver) WriteError(error) {} func (*emptyObserver) WriteBytes(int) {} func (*emptyObserver) 
ReadError(error) {} diff --git a/libbeat/outputs/redis/client.go b/libbeat/outputs/redis/client.go index 1fcd46e6f64..9f5c9812dd1 100644 --- a/libbeat/outputs/redis/client.go +++ b/libbeat/outputs/redis/client.go @@ -148,7 +148,7 @@ func (c *client) Publish(_ context.Context, batch publisher.Batch) error { c.observer.NewBatch(len(events)) rest, err := c.publish(c.key, events) if rest != nil { - c.observer.Failed(len(rest)) + c.observer.RetryableErrors(len(rest)) batch.RetryEvents(rest) return err } @@ -229,7 +229,7 @@ func (c *client) publishEventsBulk(conn redis.Conn, command string) publishFn { args[0] = dest okEvents, args := serializeEvents(c.log, args, 1, data, c.index, c.codec) - c.observer.Dropped(len(data) - len(okEvents)) + c.observer.PermanentErrors(len(data) - len(okEvents)) if (len(args) - 1) == 0 { return nil, nil } @@ -245,7 +245,7 @@ func (c *client) publishEventsBulk(conn redis.Conn, command string) publishFn { } - c.observer.Acked(len(okEvents)) + c.observer.AckedEvents(len(okEvents)) return nil, nil } } @@ -255,7 +255,7 @@ func (c *client) publishEventsPipeline(conn redis.Conn, command string) publishF var okEvents []publisher.Event serialized := make([]interface{}, 0, len(data)) okEvents, serialized = serializeEvents(c.log, serialized, 0, data, c.index, c.codec) - c.observer.Dropped(len(data) - len(okEvents)) + c.observer.PermanentErrors(len(data) - len(okEvents)) if len(serialized) == 0 { return nil, nil } @@ -276,7 +276,7 @@ func (c *client) publishEventsPipeline(conn redis.Conn, command string) publishF return okEvents, err } } - c.observer.Dropped(dropped) + c.observer.PermanentErrors(dropped) if err := conn.Flush(); err != nil { return data, err @@ -302,7 +302,7 @@ func (c *client) publishEventsPipeline(conn redis.Conn, command string) publishF } } - c.observer.Acked(len(okEvents) - len(failed)) + c.observer.AckedEvents(len(okEvents) - len(failed)) return failed, lastErr } } diff --git a/libbeat/publisher/event.go 
b/libbeat/publisher/event.go index efd5220740e..b80b12d6793 100644 --- a/libbeat/publisher/event.go +++ b/libbeat/publisher/event.go @@ -27,6 +27,8 @@ import ( // errors), one of the signal methods must be called. In normal operation // every batch will eventually receive an ACK() or a Drop(). type Batch interface { + // The output that receives a batch owns the entries in its Events array, + // and changes to them will persist between retries. Events() []Event // All events have been acknowledged by the output. diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index cda329e0963..0bc63a739f9 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -32,21 +32,32 @@ type observer interface { } type pipelineObserver interface { + // A new client connected to the pipeline via (*Pipeline).ConnectWith. clientConnected() + // An open pipeline client received a Close() call. clientClosed() } type clientObserver interface { + // The client received a Publish call newEvent() + // An event was filtered by processors before being published filteredEvent() + // An event was published to the queue publishedEvent() + // An event was rejected by the queue failedPublishEvent() } type outputObserver interface { + // Events encountered too many errors and were permanently dropped. eventsDropped(int) + // Events were sent back to an output worker after an earlier failure. eventsRetry(int) + // The queue received acknowledgment for events from the output workers. + // (This may include events already reported via eventsDropped.) queueACKed(n int) + // Report the maximum event count supported by the queue. 
queueMaxEvents(n int) } @@ -65,10 +76,10 @@ type metricsObserverVars struct { // clients metrics clients *monitoring.Uint - // events publish/dropped stats - events, filtered, published, failed *monitoring.Uint - dropped, retry *monitoring.Uint // (retryer) drop/retry counters - activeEvents *monitoring.Uint + // eventsTotal publish/dropped stats + eventsTotal, eventsFiltered, eventsPublished, eventsFailed *monitoring.Uint + eventsDropped, eventsRetry *monitoring.Uint // (retryer) drop/retry counters + activeEvents *monitoring.Uint // queue metrics queueACKed *monitoring.Uint @@ -85,19 +96,46 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { return &metricsObserver{ metrics: metrics, vars: metricsObserverVars{ - clients: monitoring.NewUint(reg, "clients"), // Gauge + // (Gauge) clients measures the number of open pipeline clients. + clients: monitoring.NewUint(reg, "clients"), - events: monitoring.NewUint(reg, "events.total"), - filtered: monitoring.NewUint(reg, "events.filtered"), - published: monitoring.NewUint(reg, "events.published"), - failed: monitoring.NewUint(reg, "events.failed"), - dropped: monitoring.NewUint(reg, "events.dropped"), - retry: monitoring.NewUint(reg, "events.retry"), + // events.total counts all created events. + eventsTotal: monitoring.NewUint(reg, "events.total"), - queueACKed: monitoring.NewUint(reg, "queue.acked"), + // (Gauge) events.active measures events that have been created, but have + // not yet been failed, filtered, or acked/dropped. + activeEvents: monitoring.NewUint(reg, "events.active"), + + // events.filtered counts events that were filtered by processors before + // being sent to the queue. + eventsFiltered: monitoring.NewUint(reg, "events.filtered"), + + // events.failed counts events that were rejected by the queue, or that + // were sent via an already-closed pipeline client. 
+ eventsFailed: monitoring.NewUint(reg, "events.failed"), + + // events.published counts events that were accepted by the queue. + eventsPublished: monitoring.NewUint(reg, "events.published"), + + // events.retry counts events that an output worker sent back to be + // retried. + eventsRetry: monitoring.NewUint(reg, "events.retry"), + + // events.dropped counts events that were dropped because errors from + // the output workers exceeded the configured maximum retry count. + eventsDropped: monitoring.NewUint(reg, "events.dropped"), + + // (Gauge) queue.max_events measures the maximum number of events the + // queue will accept, or 0 if there is none. queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), - activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge + // queue.acked counts events that have been acknowledged by the output + // workers. This includes events that were dropped for fatal errors, + // which are also reported in events.dropped. + queueACKed: monitoring.NewUint(reg, "queue.acked"), + + // (Gauge) queue.filled.pct.events measures the fraction (from 0 to 1) + // of the queue's event capacity that is currently filled. 
percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"), }, } @@ -125,7 +163,7 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() } // (client) client is trying to publish a new event func (o *metricsObserver) newEvent() { - o.vars.events.Inc() + o.vars.eventsTotal.Inc() o.vars.activeEvents.Inc() o.setPercentageFull() } @@ -142,19 +180,19 @@ func (o *metricsObserver) setPercentageFull() { // (client) event is filtered out (on purpose or failed) func (o *metricsObserver) filteredEvent() { - o.vars.filtered.Inc() + o.vars.eventsFiltered.Inc() o.vars.activeEvents.Dec() o.setPercentageFull() } // (client) managed to push an event into the publisher pipeline func (o *metricsObserver) publishedEvent() { - o.vars.published.Inc() + o.vars.eventsPublished.Inc() } // (client) client closing down or DropIfFull is set func (o *metricsObserver) failedPublishEvent() { - o.vars.failed.Inc() + o.vars.eventsFailed.Inc() o.vars.activeEvents.Dec() o.setPercentageFull() } @@ -182,12 +220,12 @@ func (o *metricsObserver) queueMaxEvents(n int) { // (retryer) number of events dropped by retryer func (o *metricsObserver) eventsDropped(n int) { - o.vars.dropped.Add(uint64(n)) + o.vars.eventsDropped.Add(uint64(n)) } // (retryer) number of events pushed to the output worker queue func (o *metricsObserver) eventsRetry(n int) { - o.vars.retry.Add(uint64(n)) + o.vars.eventsRetry.Add(uint64(n)) } type emptyObserver struct{} diff --git a/libbeat/publisher/pipeline/stress/out.go b/libbeat/publisher/pipeline/stress/out.go index 03ea06d3be8..b0e4c3d4e39 100644 --- a/libbeat/publisher/pipeline/stress/out.go +++ b/libbeat/publisher/pipeline/stress/out.go @@ -93,7 +93,7 @@ func (t *testOutput) Publish(_ context.Context, batch publisher.Batch) error { if config.Fail.EveryBatch == t.batchCount { t.batchCount = 0 - t.observer.Failed(n) + t.observer.RetryableErrors(n) batch.Retry() return nil } @@ -104,7 +104,7 @@ func (t *testOutput) Publish(_ context.Context, batch 
publisher.Batch) error { // ack complete batch batch.ACK() - t.observer.Acked(n) + t.observer.AckedEvents(n) return nil } diff --git a/libbeat/tests/integration/ca_pinning_test.go b/libbeat/tests/integration/ca_pinning_test.go index 51e098885ea..98c3db6729d 100644 --- a/libbeat/tests/integration/ca_pinning_test.go +++ b/libbeat/tests/integration/ca_pinning_test.go @@ -54,7 +54,7 @@ output.elasticsearch: mockbeat.WriteConfigFile(fmt.Sprintf(cfg, esURL.String(), caPath)) mockbeat.Start() mockbeat.WaitForLogs("mockbeat start running.", 60*time.Second) - mockbeat.WaitForLogs("PublishEvents: 1 events have been published", 60*time.Second) + mockbeat.WaitForLogs("doBulkRequest: 1 events have been sent", 60*time.Second) } func TestCAPinningBadSHA(t *testing.T) { diff --git a/libbeat/tests/integration/template_test.go b/libbeat/tests/integration/template_test.go index aec46e448b9..3dc30cbf430 100644 --- a/libbeat/tests/integration/template_test.go +++ b/libbeat/tests/integration/template_test.go @@ -224,7 +224,7 @@ logging: mockbeat.WaitForLogs("mockbeat start running.", 60*time.Second) mockbeat.WaitForLogs("Template with name \\\"mockbeat-9.9.9\\\" loaded.", 20*time.Second) require.Eventually(t, func() bool { - return mockbeat.LogMatch("PublishEvents: [[:digit:]]+ events have been published") + return mockbeat.LogMatch("doBulkRequest: [[:digit:]]+ events have been sent") }, 20*time.Second, 100*time.Millisecond, "looking for PublishEvents") status, body, err := HttpDo(t, http.MethodGet, indexURL) @@ -296,7 +296,7 @@ logging: mockbeat.Start() mockbeat.WaitForLogs("mockbeat start running.", 60*time.Second) require.Eventually(t, func() bool { - return mockbeat.LogMatch("PublishEvents: [[:digit:]]+ events have been published") + return mockbeat.LogMatch("doBulkRequest: [[:digit:]]+ events have been sent") }, 20*time.Second, 100*time.Millisecond, "looking for PublishEvents") u := fmt.Sprintf("%s/_index_template/%s", esUrl.String(), datastream) diff --git 
a/libbeat/tests/system/test_ilm.py b/libbeat/tests/system/test_ilm.py index 630ab551593..0313097d938 100644 --- a/libbeat/tests/system/test_ilm.py +++ b/libbeat/tests/system/test_ilm.py @@ -52,7 +52,7 @@ def test_ilm_default(self): proc = self.start_beat() self.wait_until(lambda: self.log_contains("mockbeat start running.")) self.wait_until(lambda: self.log_contains(MSG_ILM_POLICY_LOADED)) - self.wait_until(lambda: self.log_contains("PublishEvents: 1 events have been published")) + self.wait_until(lambda: self.log_contains("doBulkRequest: 1 events have been sent")) proc.check_kill_and_wait() self.idxmgmt.assert_data_stream_created(self.data_stream) @@ -69,7 +69,7 @@ def test_ilm_disabled(self): self.render_config(ilm={"enabled": False}) proc = self.start_beat() self.wait_until(lambda: self.log_contains("mockbeat start running.")) - self.wait_until(lambda: self.log_contains("PublishEvents: 1 events have been published")) + self.wait_until(lambda: self.log_contains("doBulkRequest: 1 events have been sent")) proc.check_kill_and_wait() self.idxmgmt.assert_index_template_loaded(self.data_stream) @@ -89,7 +89,7 @@ def test_policy_name(self): proc = self.start_beat() self.wait_until(lambda: self.log_contains("mockbeat start running.")) self.wait_until(lambda: self.log_contains(MSG_ILM_POLICY_LOADED)) - self.wait_until(lambda: self.log_contains("PublishEvents: 1 events have been published")) + self.wait_until(lambda: self.log_contains("doBulkRequest: 1 events have been sent")) proc.check_kill_and_wait() self.idxmgmt.assert_index_template_loaded(self.data_stream) From 7f4dc7403de05700eccfb144817e16c8ed0188f3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 12:35:42 -0700 Subject: [PATCH 06/21] build(deps): bump github.com/elastic/elastic-agent-client/v7 from 7.9.0 to 7.11.0 (#39816) * build(deps): bump github.com/elastic/elastic-agent-client/v7 Bumps 
[github.com/elastic/elastic-agent-client/v7](https://github.com/elastic/elastic-agent-client) from 7.9.0 to 7.11.0. - [Release notes](https://github.com/elastic/elastic-agent-client/releases) - [Commits](https://github.com/elastic/elastic-agent-client/compare/v7.9.0...v7.11.0) --- updated-dependencies: - dependency-name: github.com/elastic/elastic-agent-client/v7 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] * Update NOTICE.txt --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] --- NOTICE.txt | 4 ++-- go.mod | 2 +- go.sum | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index ced927671b6..c57628fc730 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -12736,11 +12736,11 @@ Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-a -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-client/v7 -Version: v7.9.0 +Version: v7.11.0 Licence type (autodetected): Elastic -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-client/v7@v7.9.0/LICENSE.txt: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-client/v7@v7.11.0/LICENSE.txt: ELASTIC LICENSE AGREEMENT diff --git a/go.mod b/go.mod index cf7852056ea..f0dcbbfb692 100644 --- a/go.mod +++ b/go.mod @@ -71,7 +71,7 @@ require ( github.com/dustin/go-humanize v1.0.1 github.com/eapache/go-resiliency v1.2.0 github.com/eclipse/paho.mqtt.golang v1.3.5 - github.com/elastic/elastic-agent-client/v7 v7.9.0 + github.com/elastic/elastic-agent-client/v7 v7.11.0 github.com/elastic/go-concert v0.2.0 github.com/elastic/go-libaudit/v2 v2.5.0 github.com/elastic/go-licenser v0.4.1 diff --git a/go.sum 
b/go.sum index 2b12847342d..c56eee10e5e 100644 --- a/go.sum +++ b/go.sum @@ -551,8 +551,8 @@ github.com/elastic/ebpfevents v0.6.0 h1:BrL3m7JFK7U6h2jkbk3xAWWs//IZnugCHEDds5u2 github.com/elastic/ebpfevents v0.6.0/go.mod h1:ESG9gw7N+n5yCCMgdg1IIJENKWSmX7+X0Fi9GUs9nvU= github.com/elastic/elastic-agent-autodiscover v0.7.0 h1:FCrHXh5AZGrPlpAx8kBu/s/guw9d/EXt+GKlFCnrgsc= github.com/elastic/elastic-agent-autodiscover v0.7.0/go.mod h1:zLf0SDdQXisVZxzXPxKXdj3Fa+H4bsu4HHbTEQImDz8= -github.com/elastic/elastic-agent-client/v7 v7.9.0 h1:ryNbISIg4tTRT9KA0MYOa+fxW0CpsF+qxELWWb13rYE= -github.com/elastic/elastic-agent-client/v7 v7.9.0/go.mod h1:/AeiwX9zxG99eUNrLhpApTpwmE71Qwuh4ozObn7a0ss= +github.com/elastic/elastic-agent-client/v7 v7.11.0 h1:YpkFQyE3qPnVai2a2NiKTMpBXXmPcHRV86AtW7LdpA8= +github.com/elastic/elastic-agent-client/v7 v7.11.0/go.mod h1:/AeiwX9zxG99eUNrLhpApTpwmE71Qwuh4ozObn7a0ss= github.com/elastic/elastic-agent-libs v0.9.11 h1:J4aduNJhVeb699FxJIW/dD4BPREILqXgpWD41sCw8Uc= github.com/elastic/elastic-agent-libs v0.9.11/go.mod h1:TLFd0T/e1SHmxnx9pbdm/pqOV9y+VMvHikDyPN4Owkw= github.com/elastic/elastic-agent-system-metrics v0.10.2 h1:AVW+YqgezR0mNOZ80NxPLH3tiYMenNGZ8SC/bIUf4Uc= From 68371a0974990279c9a2d3a19ef7086a0ea66bd6 Mon Sep 17 00:00:00 2001 From: Dan Kortschak Date: Fri, 7 Jun 2024 08:06:24 +0930 Subject: [PATCH 07/21] x-pack/filebeat/input/entityanalytics/provider/okta: add user group membership support (#39815) --- CHANGELOG.next.asciidoc | 1 + .../provider/okta/internal/okta/okta.go | 34 +++++++++++++++++-- .../provider/okta/internal/okta/okta_test.go | 25 ++++++++++++++ .../entityanalytics/provider/okta/okta.go | 16 +++++++-- .../provider/okta/okta_test.go | 25 +++++++++++++- .../provider/okta/statestore.go | 3 +- 6 files changed, 98 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 746e1e9f13d..93d33bee5c4 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -290,6 +290,7 @@ 
https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Make HTTP Endpoint input GA. {issue}38979[38979] {pull}39410[39410] - Update CEL mito extensions to v1.12.2. {pull}39755[39755] - Add support for base64-encoded HMAC headers to HTTP Endpoint. {pull}39655[39655] +- Add user group membership support to Okta entity analytics provider. {issue}39814[39814] {pull}39815[39815] *Auditbeat* diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta.go b/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta.go index aae221e6be9..58495cbcd6c 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta.go @@ -9,6 +9,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -98,6 +99,14 @@ type Provider struct { Name *string `json:"name,omitempty"` } +// Group is an Okta user group. +// +// See https://developer.okta.com/docs/reference/api/users/#request-parameters-8 (no anchor exists on the page for this endpoint) for details. +type Group struct { + ID string `json:"id"` + Profile map[string]any `json:"profile"` +} + // Device is an Okta device's details. // // See https://developer.okta.com/docs/api/openapi/okta-management/management/tag/Device/#tag/Device/operation/listDevices for details @@ -223,6 +232,27 @@ func GetUserDetails(ctx context.Context, cli *http.Client, host, key, user strin return getDetails[User](ctx, cli, u, key, user == "", omit, lim, window) } +// GetUserGroupDetails returns Okta group details using the users API endpoint. host is the +// Okta user domain and key is the API token to use for the query. user must not be empty. +// +// See GetUserDetails for details of the query and rate limit parameters. +// +// See https://developer.okta.com/docs/reference/api/users/#request-parameters-8 (no anchor exists on the page for this endpoint) for details. 
+func GetUserGroupDetails(ctx context.Context, cli *http.Client, host, key, user string, lim *rate.Limiter, window time.Duration) ([]Group, http.Header, error) { + const endpoint = "/api/v1/users" + + if user == "" { + return nil, nil, errors.New("no user specified") + } + + u := &url.URL{ + Scheme: "https", + Host: host, + Path: path.Join(endpoint, user, "groups"), + } + return getDetails[Group](ctx, cli, u, key, true, OmitNone, lim, window) +} + // GetDeviceDetails returns Okta device details using the list devices API endpoint. host is the // Okta user domain and key is the API token to use for the query. If device is not empty, // details for the specific device are returned, otherwise a list of all devices is returned. @@ -242,7 +272,7 @@ func GetDeviceDetails(ctx context.Context, cli *http.Client, host, key, device s return getDetails[Device](ctx, cli, u, key, device == "", OmitNone, lim, window) } -// GetDeviceUsers returns Okta user details for users asscoiated with the provided device identifier +// GetDeviceUsers returns Okta user details for users associated with the provided device identifier // using the list device users API. host is the Okta user domain and key is the API token to use for // the query. If device is empty, a nil User slice and header is returned, without error. // @@ -276,7 +306,7 @@ func GetDeviceUsers(ctx context.Context, cli *http.Client, host, key, device str // entity is an Okta entity analytics entity. 
type entity interface { - User | Device | devUser + User | Group | Device | devUser } type devUser struct { diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta_test.go b/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta_test.go index 63b6dbf6ba2..58816ef0f41 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta_test.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta/okta_test.go @@ -86,6 +86,31 @@ func Test(t *testing.T) { return } + t.Run("my_groups", func(t *testing.T) { + query := make(url.Values) + query.Set("limit", "200") + groups, _, err := GetUserGroupDetails(context.Background(), http.DefaultClient, host, key, me.ID, limiter, window) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(groups) == 0 { + t.Fatalf("unexpected len(groups): got:%d want:1", len(groups)) + } + + if omit&OmitCredentials != 0 && me.Credentials != nil { + t.Errorf("unexpected credentials with %s: %#v", omit, me.Credentials) + } + + if !*logResponses { + return + } + b, err := json.Marshal(groups) + if err != nil { + t.Errorf("failed to marshal groups for logging: %v", err) + } + t.Logf("groups: %s", b) + }) + t.Run("user", func(t *testing.T) { if me.Profile.Login == "" { b, _ := json.Marshal(me) diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go b/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go index d56ae757060..70f95d7396e 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go @@ -385,7 +385,7 @@ func (p *oktaInput) doFetchUsers(ctx context.Context, state *stateStore, fullSyn if fullSync { for _, u := range batch { - state.storeUser(u) + p.addGroup(ctx, u, state) if u.LastUpdated.After(lastUpdated) { lastUpdated = u.LastUpdated } @@ -393,7 +393,8 @@ func (p *oktaInput) doFetchUsers(ctx context.Context, state *stateStore, fullSyn } else { 
users = grow(users, len(batch)) for _, u := range batch { - users = append(users, state.storeUser(u)) + su := p.addGroup(ctx, u, state) + users = append(users, su) if u.LastUpdated.After(lastUpdated) { lastUpdated = u.LastUpdated } @@ -424,6 +425,17 @@ func (p *oktaInput) doFetchUsers(ctx context.Context, state *stateStore, fullSyn return users, nil } +func (p *oktaInput) addGroup(ctx context.Context, u okta.User, state *stateStore) *User { + su := state.storeUser(u) + groups, _, err := okta.GetUserGroupDetails(ctx, p.client, p.cfg.OktaDomain, p.cfg.OktaToken, u.ID, p.lim, p.cfg.LimitWindow) + if err != nil { + p.logger.Warnf("failed to get user group membership for %s: %v", u.ID, err) + return su + } + su.Groups = groups + return su +} + // doFetchDevices handles fetching device and associated user identities from Okta. // If fullSync is true, then any existing deltaLink will be ignored, forcing a full // synchronization from Okta. diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go b/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go index 1286cc24689..da29666712b 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go @@ -18,6 +18,7 @@ import ( "golang.org/x/time/rate" + "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta" "github.com/elastic/elastic-agent-libs/logp" ) @@ -49,11 +50,13 @@ func TestOktaDoFetch(t *testing.T) { window = time.Minute key = "token" users = 
`[{"id":"USERID","status":"STATUS","created":"2023-05-14T13:37:20.000Z","activated":null,"statusChanged":"2023-05-15T01:50:30.000Z","lastLogin":"2023-05-15T01:59:20.000Z","lastUpdated":"2023-05-15T01:50:32.000Z","passwordChanged":"2023-05-15T01:50:32.000Z","type":{"id":"typeid"},"profile":{"firstName":"name","lastName":"surname","mobilePhone":null,"secondEmail":null,"login":"name.surname@example.com","email":"name.surname@example.com"},"credentials":{"password":{"value":"secret"},"emails":[{"value":"name.surname@example.com","status":"VERIFIED","type":"PRIMARY"}],"provider":{"type":"OKTA","name":"OKTA"}},"_links":{"self":{"href":"https://localhost/api/v1/users/USERID"}}}]` + groups = `[{"id":"USERID","profile":{"description":"All users in your organization","name":"Everyone"}}]` devices = `[{"id":"DEVICEID","status":"STATUS","created":"2019-10-02T18:03:07.000Z","lastUpdated":"2019-10-02T18:03:07.000Z","profile":{"displayName":"Example Device name 1","platform":"WINDOWS","serialNumber":"XXDDRFCFRGF3M8MD6D","sid":"S-1-11-111","registered":true,"secureHardwarePresent":false,"diskEncryptionType":"ALL_INTERNAL_VOLUMES"},"resourceType":"UDDevice","resourceDisplayName":{"value":"Example Device name 1","sensitive":false},"resourceAlternateId":null,"resourceId":"DEVICEID","_links":{"activate":{"href":"https://localhost/api/v1/devices/DEVICEID/lifecycle/activate","hints":{"allow":["POST"]}},"self":{"href":"https://localhost/api/v1/devices/DEVICEID","hints":{"allow":["GET","PATCH","PUT"]}},"users":{"href":"https://localhost/api/v1/devices/DEVICEID/users","hints":{"allow":["GET"]}}}}]` ) data := map[string]string{ "users": users, + "groups": groups, "devices": devices, } @@ -63,6 +66,14 @@ func TestOktaDoFetch(t *testing.T) { if err != nil { t.Fatalf("failed to unmarshal user data: %v", err) } + var wantGroups []okta.Group + err = json.Unmarshal([]byte(groups), &wantGroups) + if err != nil { + t.Fatalf("failed to unmarshal user data: %v", err) + } + for i, u := range wantUsers 
{ + wantUsers[i].Groups = append(u.Groups, wantGroups...) + } } var wantDevices []Device if test.wantDevices { @@ -83,6 +94,12 @@ func TestOktaDoFetch(t *testing.T) { w.Header().Add("x-rate-limit-remaining", "49") w.Header().Add("x-rate-limit-reset", fmt.Sprint(time.Now().Add(time.Minute).Unix())) + if strings.HasPrefix(r.URL.Path, "/api/v1/users") && strings.HasSuffix(r.URL.Path, "groups") { + // Give the groups if this is a get user groups request. + userid := strings.TrimSuffix(strings.TrimPrefix(r.URL.Path, "/api/v1/users/"), "/groups") + fmt.Fprintln(w, strings.ReplaceAll(data["groups"], "USERID", userid)) + return + } if strings.HasPrefix(r.URL.Path, "/api/v1/device") && strings.HasSuffix(r.URL.Path, "users") { // Give one user if this is a get device users request. fmt.Fprintln(w, data["users"]) @@ -158,9 +175,15 @@ func TestOktaDoFetch(t *testing.T) { t.Errorf("unexpected number of results: got:%d want:%d", len(got), wantCount(repeats, test.wantUsers)) } for i, g := range got { - if wantID := fmt.Sprintf("userid%d", i+1); g.ID != wantID { + wantID := fmt.Sprintf("userid%d", i+1) + if g.ID != wantID { t.Errorf("unexpected user ID for user %d: got:%s want:%s", i, g.ID, wantID) } + for j, gg := range g.Groups { + if gg.ID != wantID { + t.Errorf("unexpected used ID for user group %d in %d: got:%s want:%s", j, i, gg.ID, wantID) + } + } if g.State != wantStates[g.ID] { t.Errorf("unexpected user state for user %s: got:%s want:%s", g.ID, g.State, wantStates[g.ID]) } diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go b/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go index 8a11376af51..a54fc3b9928 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go @@ -39,7 +39,8 @@ const ( type User struct { okta.User `json:"properties"` - State State `json:"state"` + Groups []okta.Group `json:"groups"` + State State `json:"state"` } type 
Device struct { From 3102b496b9e9f0eae8c7eb685b1217734d40190b Mon Sep 17 00:00:00 2001 From: Michael Wolf Date: Fri, 7 Jun 2024 05:15:53 -0700 Subject: [PATCH 08/21] Fail integration tests if system requirements are not met (#39783) If the system running integration tests doesn't have the requirements, fail the test. If the tests are incorrectly marked as passing, it's less likely that anyone will notice that no tests were actually run, and will cause misleading results, and not catch errors may exist. It's better to fail on systems that don't have the requirements, so such systems can be investigated and fixed. --------- Co-authored-by: Andrew Kroh --- dev-tools/mage/integtest.go | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dev-tools/mage/integtest.go b/dev-tools/mage/integtest.go index 96b98f0b6b8..0b881a21e49 100644 --- a/dev-tools/mage/integtest.go +++ b/dev-tools/mage/integtest.go @@ -257,7 +257,7 @@ func (r *IntegrationRunner) Test(mageTarget string, test func() error) (err erro // Inside the testing environment just run the test. if IsInIntegTestEnv() { err = r.tester.InsideTest(test) - return + return err } // Honor the TEST_ENVIRONMENT value if set. 
@@ -266,25 +266,21 @@ func (r *IntegrationRunner) Test(mageTarget string, test func() error) (err erro enabled, err = strconv.ParseBool(testEnvVar) if err != nil { err = fmt.Errorf("failed to parse TEST_ENVIRONMENT value: %w", err) - return + return err } if !enabled { err = fmt.Errorf("TEST_ENVIRONMENT=%s", testEnvVar) - return + return err } } - // log missing requirements and do nothing err = r.tester.HasRequirements() if err != nil { - // log error; and return (otherwise on machines without requirements it will mark the tests as failed) - fmt.Printf("skipping test run with %s due to missing requirements: %s\n", r.tester.Name(), err) - err = nil - return + return fmt.Errorf("test %s not run due to missing requirements: %w\n", r.tester.Name(), err) } if err = r.steps.Setup(r.env); err != nil { - return + return err } // catch any panics to run teardown From 4c1d3f2292e02db8e31a60cb3461a20ed6758f2a Mon Sep 17 00:00:00 2001 From: Maurizio Branca Date: Fri, 7 Jun 2024 16:33:09 +0200 Subject: [PATCH 09/21] [Docker] Stop returning errors when there are no metric values (#39807) * Return nil instead or an error if no metrics The docker metricset returns an error when there are no memory metric values available. This condition can happen when there are no running containers on Docker. When no containers are running, the metricset returns an error at every collection, creating noise. 
* Add a "docker.memory" logger to write a debug message on no-metrics --- metricbeat/module/docker/memory/memory.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/metricbeat/module/docker/memory/memory.go b/metricbeat/module/docker/memory/memory.go index a4f56138972..140383de833 100644 --- a/metricbeat/module/docker/memory/memory.go +++ b/metricbeat/module/docker/memory/memory.go @@ -22,6 +22,8 @@ package memory import ( "fmt" + "github.com/elastic/elastic-agent-libs/logp" + "github.com/docker/docker/client" "github.com/elastic/beats/v7/metricbeat/mb" @@ -41,16 +43,18 @@ type MetricSet struct { memoryService *MemoryService dockerClient *client.Client dedot bool + logger *logp.Logger } // New creates a new instance of the docker memory MetricSet. func New(base mb.BaseMetricSet) (mb.MetricSet, error) { + logger := logp.NewLogger("docker.memory") config := docker.DefaultConfig() if err := base.Module().UnpackConfig(&config); err != nil { return nil, err } - client, err := docker.NewDockerClient(base.HostData().URI, config) + dockerClient, err := docker.NewDockerClient(base.HostData().URI, config) if err != nil { return nil, err } @@ -58,8 +62,9 @@ func New(base mb.BaseMetricSet) (mb.MetricSet, error) { return &MetricSet{ BaseMetricSet: base, memoryService: &MemoryService{}, - dockerClient: client, + dockerClient: dockerClient, dedot: config.DeDot, + logger: logger, }, nil } @@ -72,7 +77,10 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { memoryStats := m.memoryService.getMemoryStatsList(stats, m.dedot) if len(memoryStats) == 0 { - return fmt.Errorf("No memory stats data available") + // No memory stats available, probably + // because no containers are running. 
+ m.logger.Debug("No memory stats data available") + return nil } eventsMapping(r, memoryStats) From 3b246988469e302220d61b9e8905fd3e15db973a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20Alvarez=20Pi=C3=B1eiro?= <95703246+emilioalvap@users.noreply.github.com> Date: Fri, 7 Jun 2024 20:33:28 +0200 Subject: [PATCH 10/21] [Heartbeat] Unit test plugin import at cmd level (#39830) * [Heartbeat] Unit test plugin imports inside cmd module Add unit test to cover plugin imports at cmd module, to be run from agentbeat. --- heartbeat/cmd/root_test.go | 40 +++++++++++++++++++++++++++++++ x-pack/heartbeat/cmd/root_test.go | 28 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 heartbeat/cmd/root_test.go create mode 100644 x-pack/heartbeat/cmd/root_test.go diff --git a/heartbeat/cmd/root_test.go b/heartbeat/cmd/root_test.go new file mode 100644 index 00000000000..ec40aa0c750 --- /dev/null +++ b/heartbeat/cmd/root_test.go @@ -0,0 +1,40 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package cmd + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/elastic/beats/v7/heartbeat/monitors/plugin" +) + +// Test all required plugins are exported by this module, since it's the +// one imported by agentbeat: https://github.com/elastic/beats/pull/39818 +func TestRootCmdPlugins(t *testing.T) { + t.Parallel() + plugins := []string{"http", "tcp", "icmp"} + for _, p := range plugins { + t.Run(fmt.Sprintf("%s plugin", p), func(t *testing.T) { + _, found := plugin.GlobalPluginsReg.Get(p) + assert.True(t, found) + }) + } +} diff --git a/x-pack/heartbeat/cmd/root_test.go b/x-pack/heartbeat/cmd/root_test.go new file mode 100644 index 00000000000..27e9ec2dd1c --- /dev/null +++ b/x-pack/heartbeat/cmd/root_test.go @@ -0,0 +1,28 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+//go:build linux || synthetics + +package cmd + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/elastic/beats/v7/heartbeat/monitors/plugin" +) + +// Test all required plugins are exported by this module, since it's the +// one imported by agentbeat: https://github.com/elastic/beats/pull/39818 +func TestRootCmdPlugins(t *testing.T) { + t.Parallel() + plugins := []string{"http", "tcp", "icmp", "browser"} + for _, p := range plugins { + t.Run(fmt.Sprintf("%s plugin", p), func(t *testing.T) { + _, found := plugin.GlobalPluginsReg.Get(p) + assert.True(t, found) + }) + } +} From 8211cbcc85254dc2cdfd889fa6f593ab032a39f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:05:48 -0400 Subject: [PATCH 11/21] build(deps): bump github.com/elastic/go-elasticsearch/v8 from 8.13.1 to 8.14.0 (#39826) * build(deps): bump github.com/elastic/go-elasticsearch/v8 Bumps [github.com/elastic/go-elasticsearch/v8](https://github.com/elastic/go-elasticsearch) from 8.13.1 to 8.14.0. - [Release notes](https://github.com/elastic/go-elasticsearch/releases) - [Changelog](https://github.com/elastic/go-elasticsearch/blob/main/CHANGELOG.md) - [Commits](https://github.com/elastic/go-elasticsearch/compare/v8.13.1...v8.14.0) --- updated-dependencies: - dependency-name: github.com/elastic/go-elasticsearch/v8 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] * Update NOTICE.txt --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] --- NOTICE.txt | 28 ++++++++++++++-------------- go.mod | 14 +++++++------- go.sum | 26 ++++++++++++++------------ 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index c57628fc730..45e63fe2ded 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -13602,11 +13602,11 @@ Contents of probable licence file $GOMODCACHE/github.com/elastic/go-concert@v0.2 -------------------------------------------------------------------------------- Dependency : github.com/elastic/go-elasticsearch/v8 -Version: v8.13.1 +Version: v8.14.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/go-elasticsearch/v8@v8.13.1/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/go-elasticsearch/v8@v8.14.0/LICENSE: Apache License Version 2.0, January 2004 @@ -25411,11 +25411,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- Dependency : golang.org/x/sys -Version: v0.19.0 +Version: v0.20.0 Licence type (autodetected): BSD-3-Clause -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/golang.org/x/sys@v0.19.0/LICENSE: +Contents of probable licence file $GOMODCACHE/golang.org/x/sys@v0.20.0/LICENSE: Copyright (c) 2009 The Go Authors. All rights reserved. @@ -38331,11 +38331,11 @@ SOFTWARE. 
-------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-transport-go/v8 -Version: v8.5.0 +Version: v8.6.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-transport-go/v8@v8.5.0/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-transport-go/v8@v8.6.0/LICENSE: Apache License Version 2.0, January 2004 @@ -39290,11 +39290,11 @@ SOFTWARE. -------------------------------------------------------------------------------- Dependency : github.com/go-logr/logr -Version: v1.3.0 +Version: v1.4.1 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/go-logr/logr@v1.3.0/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/go-logr/logr@v1.4.1/LICENSE: Apache License Version 2.0, January 2004 @@ -54244,11 +54244,11 @@ Contents of probable licence file $GOMODCACHE/go.opencensus.io@v0.24.0/LICENSE: -------------------------------------------------------------------------------- Dependency : go.opentelemetry.io/otel -Version: v1.21.0 +Version: v1.24.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel@v1.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel@v1.24.0/LICENSE: Apache License Version 2.0, January 2004 @@ -54455,11 +54455,11 @@ Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel@v1.21.0/L -------------------------------------------------------------------------------- Dependency : go.opentelemetry.io/otel/metric -Version: v1.21.0 +Version: v1.24.0 Licence type (autodetected): Apache-2.0 
-------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel/metric@v1.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel/metric@v1.24.0/LICENSE: Apache License Version 2.0, January 2004 @@ -54877,11 +54877,11 @@ Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel/sdk@v1.21 -------------------------------------------------------------------------------- Dependency : go.opentelemetry.io/otel/trace -Version: v1.21.0 +Version: v1.24.0 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel/trace@v1.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/go.opentelemetry.io/otel/trace@v1.24.0/LICENSE: Apache License Version 2.0, January 2004 diff --git a/go.mod b/go.mod index f0dcbbfb692..0b26cff2cd4 100644 --- a/go.mod +++ b/go.mod @@ -159,7 +159,7 @@ require ( golang.org/x/net v0.24.0 golang.org/x/oauth2 v0.10.0 golang.org/x/sync v0.6.0 - golang.org/x/sys v0.19.0 + golang.org/x/sys v0.20.0 golang.org/x/text v0.14.0 golang.org/x/time v0.5.0 golang.org/x/tools v0.16.1 @@ -208,7 +208,7 @@ require ( github.com/elastic/elastic-agent-autodiscover v0.7.0 github.com/elastic/elastic-agent-libs v0.9.11 github.com/elastic/elastic-agent-system-metrics v0.10.2 - github.com/elastic/go-elasticsearch/v8 v8.13.1 + github.com/elastic/go-elasticsearch/v8 v8.14.0 github.com/elastic/mito v1.12.2 github.com/elastic/tk-btf v0.1.0 github.com/elastic/toutoumomoma v0.0.0-20221026030040-594ef30cb640 @@ -287,7 +287,7 @@ require ( github.com/docker/go-metrics v0.0.1 // indirect github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21 // indirect github.com/eapache/queue v1.1.0 // indirect - github.com/elastic/elastic-transport-go/v8 v8.5.0 // indirect + github.com/elastic/elastic-transport-go/v8 
v8.6.0 // indirect github.com/elastic/go-windows v1.0.1 // indirect github.com/elastic/pkcs8 v1.0.0 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect @@ -296,7 +296,7 @@ require ( github.com/felixge/httpsnoop v1.0.1 // indirect github.com/go-asn1-ber/asn1-ber v1.5.5 // indirect github.com/go-logfmt/logfmt v0.5.1 // indirect - github.com/go-logr/logr v1.3.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect @@ -386,9 +386,9 @@ require ( github.com/zeebo/xxh3 v1.0.2 // indirect go.elastic.co/fastjson v1.1.0 // indirect go.opencensus.io v0.24.0 // indirect - go.opentelemetry.io/otel v1.21.0 // indirect - go.opentelemetry.io/otel/metric v1.21.0 // indirect - go.opentelemetry.io/otel/trace v1.21.0 // indirect + go.opentelemetry.io/otel v1.24.0 // indirect + go.opentelemetry.io/otel/metric v1.24.0 // indirect + go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/term v0.19.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index c56eee10e5e..86c3c94dd78 100644 --- a/go.sum +++ b/go.sum @@ -557,15 +557,15 @@ github.com/elastic/elastic-agent-libs v0.9.11 h1:J4aduNJhVeb699FxJIW/dD4BPREILqX github.com/elastic/elastic-agent-libs v0.9.11/go.mod h1:TLFd0T/e1SHmxnx9pbdm/pqOV9y+VMvHikDyPN4Owkw= github.com/elastic/elastic-agent-system-metrics v0.10.2 h1:AVW+YqgezR0mNOZ80NxPLH3tiYMenNGZ8SC/bIUf4Uc= github.com/elastic/elastic-agent-system-metrics v0.10.2/go.mod h1:0jJ2ARnzTTOEMmcRX9UNqSwbwguEluE/mK2HaM3GViI= -github.com/elastic/elastic-transport-go/v8 v8.5.0 h1:v5membAl7lvQgBTexPRDBO/RdnlQX+FM9fUVDyXxvH0= -github.com/elastic/elastic-transport-go/v8 v8.5.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk= +github.com/elastic/elastic-transport-go/v8 v8.6.0 h1:Y2S/FBjx1LlCv5m6pWAF2kDJAHoSjSRSJCApolgfthA= 
+github.com/elastic/elastic-transport-go/v8 v8.6.0/go.mod h1:YLHer5cj0csTzNFXoNQ8qhtGY1GTvSqPnKWKaqQE3Hk= github.com/elastic/fsevents v0.0.0-20181029231046-e1d381a4d270 h1:cWPqxlPtir4RoQVCpGSRXmLqjEHpJKbR60rxh1nQZY4= github.com/elastic/fsevents v0.0.0-20181029231046-e1d381a4d270/go.mod h1:Msl1pdboCbArMF/nSCDUXgQuWTeoMmE/z8607X+k7ng= github.com/elastic/glog v1.0.1-0.20210831205241-7d8b5c89dfc4/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= github.com/elastic/go-concert v0.2.0 h1:GAQrhRVXprnNjtvTP9pWJ1d4ToEA4cU5ci7TwTa20xg= github.com/elastic/go-concert v0.2.0/go.mod h1:HWjpO3IAEJUxOeaJOWXWEp7imKd27foxz9V5vegC/38= -github.com/elastic/go-elasticsearch/v8 v8.13.1 h1:du5F8IzUUyCkzxyHdrO9AtopcG95I/qwi2WK8Kf1xlg= -github.com/elastic/go-elasticsearch/v8 v8.13.1/go.mod h1:DIn7HopJs4oZC/w0WoJR13uMUxtHeq92eI5bqv5CRfI= +github.com/elastic/go-elasticsearch/v8 v8.14.0 h1:1ywU8WFReLLcxE1WJqii3hTtbPUE2hc38ZK/j4mMFow= +github.com/elastic/go-elasticsearch/v8 v8.14.0/go.mod h1:WRvnlGkSuZyp83M2U8El/LGXpCjYLrvlkSgkAH4O5I4= github.com/elastic/go-libaudit/v2 v2.5.0 h1:5OK919QRnGtcjVBz3n/cs5F42im1mPlVTA9TyIn2K54= github.com/elastic/go-libaudit/v2 v2.5.0/go.mod h1:AjlnhinP+kKQuUJoXLVrqxBM8uyhQmkzoV6jjsCFP4Q= github.com/elastic/go-licenser v0.4.1 h1:1xDURsc8pL5zYT9R29425J3vkHdt4RT5TNEMeRN48x4= @@ -685,8 +685,9 @@ github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7 github.com/go-logr/logr v0.4.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab h1:xveKWz2iaueeTaUgdetzel+U7exyigDYBryyVfV/rZk= @@ -1750,14 +1751,14 @@ go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= -go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= -go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= -go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= +go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= +go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= +go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/sdk v1.21.0 h1:FTt8qirL1EysG6sTQRZ5TokkU8d0ugCj8htOgThZXQ8= go.opentelemetry.io/otel/sdk v1.21.0/go.mod h1:Nna6Yv7PWTdgJHVRD9hIYywQBRx7pbox6nwBnZIxl/E= -go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= -go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= +go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.uber.org/atomic v1.3.2/go.mod 
h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= @@ -2113,8 +2114,9 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= From 334a4c729422bb81dc3c5c46924ad357a7ce9da7 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:20:26 -0400 Subject: [PATCH 12/21] docs: Prepare Changelog for 8.14.0 (#39809) (#39813) * docs: Close changelog for 8.14.0 * Update CHANGELOG.asciidoc * Update CHANGELOG.asciidoc Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> * fix doc build --------- Co-authored-by: Pierre HILBERT Co-authored-by: David Kilfoyle <41695641+kilfoyle@users.noreply.github.com> Co-authored-by: Brandon Morelli (cherry picked from commit afdf5b2b9f3c84391b5445edb8c7ce2af367713a) Co-authored-by: Elastic Machine Co-authored-by: Pierre HILBERT --- CHANGELOG.asciidoc | 125 ++++++++++++++++++++++++++++++++++ 
CHANGELOG.next.asciidoc | 26 ++----- libbeat/docs/release.asciidoc | 1 + 3 files changed, 132 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index d6b9a0f6297..83844526f9d 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -3,6 +3,131 @@ :issue: https://github.com/elastic/beats/issues/ :pull: https://github.com/elastic/beats/pull/ +[[release-notes-8.14.0]] +=== Beats version 8.14.0 +https://github.com/elastic/beats/compare/v8.13.4\...v8.14.0[View commits] + +==== Breaking changes + +*Filebeat* + +- Removed deprecated ZScaler from Beats. Use the https://docs.elastic.co/integrations/zscaler_zia[Zscaler Internet Access] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Tomcat from Beats. Use the https://docs.elastic.co/integrations/apache_tomcat[Apache Tomcat] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Squid from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated SonicWall from Beats. Use the https://docs.elastic.co/integrations/sonicwall[SonicWall Firewall] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Snort from Beats. Use the https://docs.elastic.co/integrations/snort[Snort] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Radware from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Proofpoint from Beats. Use the https://docs.elastic.co/integrations/proofpoint_tap[Proofpoint TAP] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Netscout from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Microsoft DHCP from Beats. 
Use the https://docs.elastic.co/integrations/microsoft_dhcp[Microsoft DHCP] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Juniper Junos from Beats. Use the https://docs.elastic.co/integrations/juniper_srx[Juniper SRX] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Juniper Netscreen from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Infoblox from Beats. Use the https://docs.elastic.co/integrations/infoblox_nios[Infoblox NIOS] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Impreva from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Fortinet Client Endpoint from Beats. Use the https://docs.elastic.co/integrations/fortinet_forticlient[Fortinet FortiClient Logs] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Fortinet Fortimail from Beats. Use the https://docs.elastic.co/integrations/fortinet_fortimail[Fortinet FortiMail] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Fortinet Fortimanager from Beats. Use the https://docs.elastic.co/integrations/fortinet_fortimanager[Fortinet FortiManager Logs] Elastic integration instead. {pull}38037[38037] +- Removed deprecated F5 from Beats. Use the https://docs.elastic.co/integrations/f5_bigip[F5 BIG-IP] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Cylance from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Cisco Meraki from Beats. Use the https://docs.elastic.co/integrations/cisco_meraki[Cisco Meraki] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Cisco Nexus from Beats. 
Use the https://docs.elastic.co/integrations/cisco_nexus[Cisco Nexus] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Bluecoat from Beats. See {filebeat-ref}/migrate-from-deprecated-module.html[Migrate from a deprecated module] for migration options. {pull}38037[38037] +- Removed deprecated Barracuda from Beats. Use the https://docs.elastic.co/integrations/barracuda[Barracuda Web Application Firewall] Elastic integration instead. {pull}38037[38037] +- Removed deprecated Sophos UTM from Beats. Use the https://docs.elastic.co/integrations/sophos[Sophos] Elastic integration instead. {pull}38037[38037] +- Introduce input/netmetrics and refactor netflow input metrics. {pull}38055[38055] +- Update Salesforce module to use new Salesforce input. {pull}37509[37509] + +*Heartbeat* + +- Fix monitor state loader to not wait extra seconds for the last attempt. {pull}39621[39621] + +==== Bugfixes + +*Auditbeat* +- Set field types to correctly match ECS in sessionmd processor. {issue}38955[38955] {pull}38994[38994] +- Fix failing to enrich process events in sessionmd processor. {issue}38955[38955] {pull}39173[39173] {pull}39243[39243] +- Fix seccomp policy of FIM kprobes backend on arm64. {pull}39759[39759] + +*Filebeat* +- Fix handling of endpoint for custom domains and ensure region, default_region, and region parsed from queue_url are applied in the order specified in the documentation for the awss3 input. {pull}39709[39709] +- Prevent HTTPJSON holding response bodies between executions. {issue}35219[35219] {pull}38116[38116] +- Fix the incorrect values generated by the uri_parts processor. {pull}38216[38216] +- Rename `activity_guid` to `activity_id` in ETW input events to suit other Windows inputs. {pull}38530[38530] +- Add missing provider registration and fix published entity for Active Directory entityanalytics provider. {pull}38645[38645] +- Fix handling of un-parsed JSON in O365 module. 
{issue}37800[37800] {pull}38709[38709] +- Fix filestream's registry GC: registry entries are now removed from the in-memory and disk store when they're older than the set TTL. {issue}36761[36761] {pull}38488[38488] +- Fix handling of truncated files in Filestream {issue}38070[38070] {pull}38416[38416] +- Fix panic when more than 32767 pipeline clients are active. {issue}38197[38197] {pull}38556[38556] +- Fix a bug in CloudWatch task allocation that could skip some logs. {issue}38918[38918] {pull}38953[38953] +- Prevent GCP Pub/Sub input blockage by increasing default value of `max_outstanding_messages`. {issue}35029[35029] {pull}38985[38985] +- entity-analytics input: Improve structured logging. {pull}38990[38990] +- Upgrade `azure-event-hubs-go` and `azure-storage-blob-go` dependencies. {pull}38861[38861] +- Fix concurrency/error handling bugs in the AWS S3 input that could drop data and prevent ingestion of large buckets. {pull}39131[39131] +- Fix EntraID query handling. {issue}39419[39419] {pull}39420[39420] +- Expand ID patterns in request trace logger for HTTP Endpoint. {pull}39656[39656] + +*Heartbeat* + +- Redact synthexec cmd output. {pull}39535[39535] + +*Metricbeat* + +- RabbitMQ/queue - Change the mapping type of `rabbitmq.queue.consumers.utilisation.pct` to `scaled_float` from `long` because the values fall within the range of `[0.0, 1.0]`. Previously, conversion to integer resulted in reporting either `0` or `1`. +- Fix timeout caused by the retrival of which indices are hidden. {pull}39165[39165] + +*Winlogbeat* + +- Fix error handling in perfmon metrics. {issue}38140[38140] {pull}39404[39404] + +==== Added + +*Affecting all Beats* + +- Update Go version to 1.21.10. {pull}39467[39467] +- Enable early event encoding in the Elasticsearch output, improving CPU and memory use. {pull}38572[38572] + +*Auditbeat* + +- Add `add_session_metadata` processor, which enables session viewer on Auditbeat data. 
{pull}37640[37640] +- Add procfs backend to the `add_session_metadata` processor. {pull}38799[38799] +- Add `process.entity_id`, `process.group.name` and `process.group.id` in `add_process_metadata` processor. Make FIM module with Kprobes backend to always add an appropriately configured `add_process_metadata` processor to enrich file events. {pull}38776[38776] + +*Filebeat* + +- Add Saved Object name field to Kibana audit logs. {pull}38307[38307] +- Add Salesforce input. {pull}37331[37331] +- Add logging for cache processor file reads and writes. {pull}38052[38052] +- Support VPC endpoint for aws-s3 input SQS queue url. {pull}38189[38189] +- Add support for complex event objects in the HTTP Endpoint input. {issue}37910[37910] {pull}38193[38193] +- Parse more fields from Elasticsearch slowlogs. {pull}38295[38295] +- Update CEL mito extensions to v1.10.0 to add keys/values helper. {pull}38504[38504] +- Add support for Active Directory an entity analytics provider. {pull}37919[37919] +- Add AWS AWSHealth metricset. {pull}38370[38370] +- Add debugging breadcrumb to logs when writing request trace log. {pull}38636[38636] +- Add benchmark input and discard output. {pull}37437[37437] + +*Libbeat* + +- Add support for Linux capabilities in `add_process_metadata`. {pull}38252[38252] + +*Metricbeat* + +- Add support for `shards_stats.total_count` in Elasticsearch Monitoring data. {pull}38891[38891] +- Add SSL support to MySQL module. {pull}37997[37997] +- Add SSL support for Aerospike module. {pull}38126[38126] + +*Winlogbeat* + +- Use fixed size buffer at first pass for event parsing, improving throughput. {issue}39530[39530] {pull}39544[39544] + +==== Deprecated + +*Filebeat* + +- Deprecate `syslog` input in favor of `syslog` processor. {issue}37555[37555] {pull}38277[38277] +- Deprecate `o365audit` input in favor of `CEL` input. 
{issue}37719[37719] {pull}38922[38922] + + [[release-notes-8.13.4]] === Beats version 8.13.4 https://github.com/elastic/beats/compare/v8.13.3\...v8.13.4[View commits] diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 93d33bee5c4..49b0cf9b2a3 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -46,7 +46,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Heartbeat* -- Fix monitor state loader to not wait extra seconds for the last attempt {pull}39621[39621] *Metricbeat* @@ -86,8 +85,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix panic when MaxRetryInterval is specified, but RetryInterval is not {pull}35820[35820] - Support build of projects outside of beats directory {pull}36126[36126] - Support Elastic Agent control protocol chunking support {pull}37343[37343] -- Upgrade elastic-agent-libs to v0.7.5. Removes obsolete "Treating the CommonName field on X.509 certificates as a host name..." deprecation warning for 8.0. {pull}37755[37755] -- aws: Add credential caching for `AssumeRole` session tokens. {issue}37787[37787] - Lower logging level to debug when attempting to configure beats with unknown fields from autodiscovered events/environments {pull}[37816][37816] - Set timeout of 1 minute for FQDN requests {pull}37756[37756] - Fix the paths in the .cmd script added to the path by the Windows MSI to point to the new C:\Program Files installation location. https://github.com/elastic/elastic-stack-installers/pull/238 @@ -98,12 +95,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Rename the field "apache2.module.error" to "apache.module.error" in Apache error visualization. 
{issue}39480[39480] {pull}39481[39481] *Auditbeat* -- Set field types to correctly match ECS in sessionmd processor {issue}38955[38955] {pull}38994[38994] -- Fix failing to enrich process events in sessionmd processor {issue}38955[38955] {pull}39173[39173] {pull}39243[39243] -- Prevent scenario of losing children-related file events in a directory for recursive fsnotify backend of auditbeat file integrity module {pull}39133[39133] -- Allow extra syscalls by auditbeat required in FIM with kprobes back-end {pull}39361[39361] -- Fix losing events in FIM for OS X by allowing always to walk an added directory to monitor {pull}39362[39362] -- Fix seccomp policy of FIM kprobes backend on arm64 {pull}39759[39759] @@ -121,9 +112,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix handling of Juniper SRX structured data when there is no leading junos element. {issue}36270[36270] {pull}36308[36308] - Fix Filebeat Cisco module with missing escape character {issue}36325[36325] {pull}36326[36326] - Added a fix for Crowdstrike pipeline handling process arrays {pull}36496[36496] -- Fix m365_defender cursor value and query building. {pull}37116[37116] -- Fix TCP/UDP metric queue length parsing base. {pull}37714[37714] -- Update github.com/lestrrat-go/jwx dependency. {pull}37799[37799] - [threatintel] MISP pagination fixes {pull}37898[37898] - Fix file handle leak when handling errors in filestream {pull}37973[37973] - Fix a race condition that could crash Filebeat with a "negative WaitGroup counter" error {pull}38094[38094] @@ -187,7 +175,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Winlogbeat* -- Fix error handling in perfmon metrics. 
{issue}38140[38140] {pull}39404[39404] *Elastic Logging Plugin* @@ -227,10 +214,11 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Auditbeat* +*Auditbeat* + + *Filebeat* -- Adding Saved Object name field to Kibana audit logs {pull}38307[38307] -- Update SQL input documentation regarding Oracle DSNs {pull}37590[37590] - add documentation for decode_xml_wineventlog processor field mappings. {pull}32456[32456] - httpjson input: Add request tracing logger. {issue}32402[32402] {pull}32412[32412] - Add cloudflare R2 to provider list in AWS S3 input. {pull}32620[32620] @@ -297,7 +285,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Libbeat* -- Add support for linux capabilities in add_process_metadata. {pull}38252[38252] *Heartbeat* @@ -306,8 +293,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Metricbeat* -- Add support for shards_stats.total_count in Elasticsearch Monitoring data. {pull}38891[38891] -- Add new fields to configure the lease duration, retry and renew when using leader elector with kubernetes autodiscover.{pull}38471[38471] - Add per-thread metrics to system_summary {pull}33614[33614] - Add GCP CloudSQL metadata {pull}33066[33066] - Add GCP Carbon Footprint metricbeat data {pull}34820[34820] @@ -350,8 +335,6 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Filebeat* -- Deprecate `syslog` input in favor of `syslog` processor. {issue}37555[37555] {pull}38277[38277] -- Deprecate `o365audit` input in favor of `CEL` input. {issue}37719[37719] {pull}38922[38922] *Heartbeat* @@ -432,6 +415,9 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] + + + diff --git a/libbeat/docs/release.asciidoc b/libbeat/docs/release.asciidoc index 7c0f20789fd..bad3f87b38d 100644 --- a/libbeat/docs/release.asciidoc +++ b/libbeat/docs/release.asciidoc @@ -8,6 +8,7 @@ This section summarizes the changes in each release. 
Also read <> for more detail about changes that affect upgrade. +* <> * <> * <> * <> From 132a06bec556de7f4261181f13a2059d719e9908 Mon Sep 17 00:00:00 2001 From: Dan Kortschak Date: Sat, 8 Jun 2024 06:13:25 +0930 Subject: [PATCH 13/21] x-pack/filebeat/input/salesforce: bump github.com/golang-jwt/jwt to v5 (#39823) This reduces the dep count since all other uses of the package are at v5. The API that is used here does not change. https://github.com/golang-jwt/jwt/blob/main/MIGRATION_GUIDE.md --- CHANGELOG-developer.next.asciidoc | 1 + NOTICE.txt | 25 +++-------------------- go.mod | 3 +-- go.sum | 1 - x-pack/filebeat/input/salesforce/input.go | 2 +- 5 files changed, 6 insertions(+), 26 deletions(-) diff --git a/CHANGELOG-developer.next.asciidoc b/CHANGELOG-developer.next.asciidoc index cf378879b15..15d65cf131b 100644 --- a/CHANGELOG-developer.next.asciidoc +++ b/CHANGELOG-developer.next.asciidoc @@ -190,6 +190,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only. - Add Active Directory entity collector for Filebeat entity analytics. {pull}37854[37854] - Make logs for empty and small files less noisy when using fingerprint file identity in filestream. {pull}38421[38421] - Improve robustness and error reporting from packetbeat default route testing. {pull}39757[39757] +- Move x-pack/filebeat/input/salesforce jwt import to v5. {pull}39823[39823] ==== Deprecated diff --git a/NOTICE.txt b/NOTICE.txt index 45e63fe2ded..0c501bc8d85 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -17541,12 +17541,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- -Dependency : github.com/golang-jwt/jwt -Version: v3.2.1+incompatible +Dependency : github.com/golang-jwt/jwt/v5 +Version: v5.0.0 Licence type (autodetected): MIT -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/golang-jwt/jwt@v3.2.1+incompatible/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/golang-jwt/jwt/v5@v5.0.0/LICENSE: Copyright (c) 2012 Dave Grijalva Copyright (c) 2021 golang-jwt maintainers @@ -40825,25 +40825,6 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI --------------------------------------------------------------------------------- -Dependency : github.com/golang-jwt/jwt/v5 -Version: v5.0.0 -Licence type (autodetected): MIT --------------------------------------------------------------------------------- - -Contents of probable licence file $GOMODCACHE/github.com/golang-jwt/jwt/v5@v5.0.0/LICENSE: - -Copyright (c) 2012 Dave Grijalva -Copyright (c) 2021 golang-jwt maintainers - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -------------------------------------------------------------------------------- Dependency : github.com/golang-sql/civil Version: v0.0.0-20190719163853-cb61b32ac6fe diff --git a/go.mod b/go.mod index 0b26cff2cd4..9552997c4c3 100644 --- a/go.mod +++ b/go.mod @@ -215,7 +215,7 @@ require ( github.com/foxcpp/go-mockdns v0.0.0-20201212160233-ede2f9158d15 github.com/g8rswimmer/go-sfdc v0.0.0-00010101000000-000000000000 github.com/go-ldap/ldap/v3 v3.4.6 - github.com/golang-jwt/jwt v3.2.1+incompatible + github.com/golang-jwt/jwt/v5 v5.0.0 github.com/google/cel-go v0.19.0 github.com/googleapis/gax-go/v2 v2.12.0 github.com/gorilla/handlers v1.5.1 @@ -307,7 +307,6 @@ require ( github.com/goccy/go-json v0.10.2 // indirect github.com/godror/knownpb v0.1.0 // indirect github.com/golang-jwt/jwt/v4 v4.5.0 // indirect - github.com/golang-jwt/jwt/v5 v5.0.0 // indirect github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe // indirect github.com/golang-sql/sqlexp v0.1.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect diff --git a/go.sum b/go.sum index 86c3c94dd78..3f0ce0bf1fc 100644 --- a/go.sum +++ b/go.sum @@ -866,7 +866,6 @@ github.com/gogo/protobuf v1.2.2-0.20190730201129-28a6bbf47e48/go.mod h1:SlYgWuQ5 github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= 
github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= github.com/golang-jwt/jwt/v4 v4.2.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= diff --git a/x-pack/filebeat/input/salesforce/input.go b/x-pack/filebeat/input/salesforce/input.go index dacb15042b3..f2f8ef15c68 100644 --- a/x-pack/filebeat/input/salesforce/input.go +++ b/x-pack/filebeat/input/salesforce/input.go @@ -20,7 +20,7 @@ import ( "github.com/g8rswimmer/go-sfdc/credentials" "github.com/g8rswimmer/go-sfdc/session" "github.com/g8rswimmer/go-sfdc/soql" - "github.com/golang-jwt/jwt" + "github.com/golang-jwt/jwt/v5" "github.com/hashicorp/go-retryablehttp" "go.uber.org/zap" "golang.org/x/exp/slices" From 35f8d09c79361c4367dada1fa1de0d1ab0704819 Mon Sep 17 00:00:00 2001 From: Dan Kortschak Date: Sat, 8 Jun 2024 06:14:08 +0930 Subject: [PATCH 14/21] packetbeat/route: fix test failure on Windows 2022 (#39822) Windows 2022 appears to have added new text to the output of netsh. The text is interpreted as having three fields. This impact on the behaviour of the field splitting which expects five. - Originally this would cause a panic due to bounds checking. This was fixed in #39757 by erroring on an unexpectedly short line. - The fieldsN helper expects lines to be n fields or longer, but does not terminate on lines with fewer fields. So treat the first blank line found in the table as the end of the table and fix the fieldsN helper. --- CHANGELOG-developer.next.asciidoc | 1 + packetbeat/route/route_test.go | 1 - packetbeat/route/route_windows_test.go | 9 ++++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG-developer.next.asciidoc b/CHANGELOG-developer.next.asciidoc index 15d65cf131b..58f9cea5211 100644 --- a/CHANGELOG-developer.next.asciidoc +++ b/CHANGELOG-developer.next.asciidoc @@ -98,6 +98,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only. - Fix copy arguments for strict aligned architectures. 
{pull}36976[36976] - Fix panic when more than 32767 pipeline clients are active. {issue}38197[38197] {pull}38556[38556] - Skip flakey metrics test on windows in filebeat httpjson input. {issue}39676[39676] {pull}39678[39678] +- Fix flakey test on Windows 2022 in packetbeat/route. {issue}39698[39698] {pull}39822[39822] ==== Added diff --git a/packetbeat/route/route_test.go b/packetbeat/route/route_test.go index 2858d8521da..3e848bb56ab 100644 --- a/packetbeat/route/route_test.go +++ b/packetbeat/route/route_test.go @@ -32,7 +32,6 @@ import ( ) func TestDefault(t *testing.T) { - t.Skip("Flaky test: https://github.com/elastic/beats/issues/39698") for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { wantIface, wantIndex, wantErr := defaultRoute(family) if wantErr != nil && wantErr != ErrNotFound { diff --git a/packetbeat/route/route_windows_test.go b/packetbeat/route/route_windows_test.go index f216dabc338..1b513125f2d 100644 --- a/packetbeat/route/route_windows_test.go +++ b/packetbeat/route/route_windows_test.go @@ -54,6 +54,9 @@ func defaultRoute(af int) (name string, index int, err error) { for inTable := false; sc.Scan(); { f := strings.Fields(sc.Text()) if len(f) == 0 { + if inTable { + break + } continue } if !inTable { @@ -94,6 +97,9 @@ func defaultRoute(af int) (name string, index int, err error) { for inTable := false; sc.Scan(); { f := fieldsN(sc.Text(), 5) if len(f) == 0 { + if inTable { + break + } continue } if !inTable { @@ -152,6 +158,7 @@ func fieldsN(s string, n int) []string { } var f []string for s != "" { + l := len(s) for i, r := range s { if unicode.IsSpace(r) { f = append(f, s[:i]) @@ -165,7 +172,7 @@ func fieldsN(s string, n int) []string { break } } - if len(f) == n-1 { + if len(f) == n-1 || len(s) == l { break } } From a2ab85dac7cc4efc4a52fe0265a022a8a14a3281 Mon Sep 17 00:00:00 2001 From: kruskall <99559985+kruskall@users.noreply.github.com> Date: Mon, 10 Jun 2024 17:20:16 +0200 Subject: [PATCH 15/21] fix: avoid gotestsum import 
in libbeat es output (#39835) gotestsum is being imported causing downstream apps to include it in the dependency tree. Drop the import and use the client logger. --- libbeat/outputs/elasticsearch/client.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libbeat/outputs/elasticsearch/client.go b/libbeat/outputs/elasticsearch/client.go index e05c4e0b261..933d04c789c 100644 --- a/libbeat/outputs/elasticsearch/client.go +++ b/libbeat/outputs/elasticsearch/client.go @@ -26,7 +26,6 @@ import ( "time" "go.elastic.co/apm/v2" - "gotest.tools/gotestsum/log" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/beat/events" @@ -433,7 +432,7 @@ func (client *Client) bulkCollectPublishFails(bulkResult bulkResult) ([]publishe if client.applyItemStatus(events[i], itemStatus, itemMessage, &stats) { eventsToRetry = append(eventsToRetry, events[i]) - log.Debugf("Bulk item insert failed (i=%v, status=%v): %s", i, itemStatus, itemMessage) + client.log.Debugf("Bulk item insert failed (i=%v, status=%v): %s", i, itemStatus, itemMessage) } } From db9406b50514b8896f08e7bbb176f09c07650566 Mon Sep 17 00:00:00 2001 From: kruskall <99559985+kruskall@users.noreply.github.com> Date: Mon, 10 Jun 2024 17:20:31 +0200 Subject: [PATCH 16/21] refactor: replace x/exp/slices with stdlib slices (#39838) Go 1.21 added slices package to stdlib. Replace usage of x/exp/slices with slices and drop x/exp dependency. --- NOTICE.txt | 74 +++++++++++------------ go.mod | 2 +- x-pack/filebeat/input/salesforce/input.go | 2 +- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index 0c501bc8d85..c7bfc6b351b 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -25187,43 +25187,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--------------------------------------------------------------------------------- -Dependency : golang.org/x/exp -Version: v0.0.0-20231127185646-65229373498e -Licence type (autodetected): BSD-3-Clause --------------------------------------------------------------------------------- - -Contents of probable licence file $GOMODCACHE/golang.org/x/exp@v0.0.0-20231127185646-65229373498e/LICENSE: - -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -------------------------------------------------------------------------------- Dependency : golang.org/x/lint Version: v0.0.0-20210508222113-6edffad5e616 @@ -55098,6 +55061,43 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +-------------------------------------------------------------------------------- +Dependency : golang.org/x/exp +Version: v0.0.0-20231127185646-65229373498e +Licence type (autodetected): BSD-3-Clause +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/golang.org/x/exp@v0.0.0-20231127185646-65229373498e/LICENSE: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + -------------------------------------------------------------------------------- Dependency : golang.org/x/term Version: v0.19.0 diff --git a/go.mod b/go.mod index 9552997c4c3..bf3ff2ad312 100644 --- a/go.mod +++ b/go.mod @@ -232,7 +232,6 @@ require ( go.elastic.co/apm/module/apmhttp/v2 v2.6.0 go.elastic.co/apm/v2 v2.6.0 go.mongodb.org/mongo-driver v1.5.1 - golang.org/x/exp v0.0.0-20231127185646-65229373498e golang.org/x/tools/go/vcs v0.1.0-deprecated google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb gopkg.in/natefinch/lumberjack.v2 v2.0.0 @@ -388,6 +387,7 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect + golang.org/x/exp v0.0.0-20231127185646-65229373498e // indirect golang.org/x/term v0.19.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/x-pack/filebeat/input/salesforce/input.go b/x-pack/filebeat/input/salesforce/input.go index f2f8ef15c68..12ed4e652a7 100644 --- a/x-pack/filebeat/input/salesforce/input.go +++ b/x-pack/filebeat/input/salesforce/input.go @@ -14,6 +14,7 @@ import ( "io" "net/http" "os" + "slices" "time" "github.com/g8rswimmer/go-sfdc" @@ -23,7 +24,6 @@ import ( "github.com/golang-jwt/jwt/v5" "github.com/hashicorp/go-retryablehttp" "go.uber.org/zap" - "golang.org/x/exp/slices" v2 
"github.com/elastic/beats/v7/filebeat/input/v2" inputcursor "github.com/elastic/beats/v7/filebeat/input/v2/input-cursor" From c5b0cec5dd9bbb2d51a13007ab9ee8fdf9aa06ff Mon Sep 17 00:00:00 2001 From: Dan Kortschak Date: Tue, 11 Jun 2024 06:28:39 +0930 Subject: [PATCH 17/21] x-pack/filebeat/input/entityanalytics/provider/{azuread,okta}: add request tracing support (#39821) --- CHANGELOG.next.asciidoc | 1 + .../inputs/input-entity-analytics.asciidoc | 24 +++++++++ .../entityanalytics/provider/azuread/azure.go | 5 +- .../provider/azuread/fetcher/graph/.gitignore | 1 + .../provider/azuread/fetcher/graph/graph.go | 52 ++++++++++++++++++- .../azuread/fetcher/graph/graph_test.go | 25 +++++++-- .../entityanalytics/provider/okta/.gitignore | 1 + .../entityanalytics/provider/okta/conf.go | 20 ++++++- .../entityanalytics/provider/okta/okta.go | 52 +++++++++++++++++-- .../provider/okta/okta_test.go | 14 +++++ 10 files changed, 186 insertions(+), 9 deletions(-) create mode 100644 x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/.gitignore create mode 100644 x-pack/filebeat/input/entityanalytics/provider/okta/.gitignore diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 49b0cf9b2a3..e1214aa0e27 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -279,6 +279,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Update CEL mito extensions to v1.12.2. {pull}39755[39755] - Add support for base64-encoded HMAC headers to HTTP Endpoint. {pull}39655[39655] - Add user group membership support to Okta entity analytics provider. {issue}39814[39814] {pull}39815[39815] +- Add request trace support for Okta and EntraID entity analytics providers. 
{pull}39821[39821] *Auditbeat* diff --git a/x-pack/filebeat/docs/inputs/input-entity-analytics.asciidoc b/x-pack/filebeat/docs/inputs/input-entity-analytics.asciidoc index e099bf3d247..41c01576123 100644 --- a/x-pack/filebeat/docs/inputs/input-entity-analytics.asciidoc +++ b/x-pack/filebeat/docs/inputs/input-entity-analytics.asciidoc @@ -509,6 +509,18 @@ This is a list of optional query parameters. The default is `["accountEnabled", "displayName", "operatingSystem", "operatingSystemVersion", "physicalIds", "extensionAttributes", "alternativeSecurityIds"]`. +[float] +==== `tracer.filename` + +It is possible to log HTTP requests and responses to the EntraID API to a local file-system for debugging configurations. +This option is enabled by setting the `tracer.filename` value. Additional options are available to +tune log rotation behavior. + +To differentiate the trace files generated from different input instances, a placeholder `*` can be added to the filename and will be replaced with the input instance id. +For example, `http-request-trace-*.ndjson`. + +Enabling this option compromises security and should only be used for debugging. + [id="provider-okta"] ==== Okta User Identities (`okta`) @@ -797,6 +809,18 @@ The interval in which incremental updates should occur. The interval must be shorter than the full synchronization interval (`sync_interval`). Expressed as a duration string (e.g., 1m, 3h, 24h). Defaults to `15m` (15 minutes). +[float] +==== `tracer.filename` + +It is possible to log HTTP requests and responses to the Okta API to a local file-system for debugging configurations. +This option is enabled by setting the `tracer.filename` value. Additional options are available to +tune log rotation behavior. + +To differentiate the trace files generated from different input instances, a placeholder `*` can be added to the filename and will be replaced with the input instance id. +For example, `http-request-trace-*.ndjson`.
+ +Enabling this option compromises security and should only be used for debugging. + [float] ==== Metrics diff --git a/x-pack/filebeat/input/entityanalytics/provider/azuread/azure.go b/x-pack/filebeat/input/entityanalytics/provider/azuread/azure.go index 30514352eba..d67031753fd 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/azuread/azure.go +++ b/x-pack/filebeat/input/entityanalytics/provider/azuread/azure.go @@ -48,6 +48,8 @@ type azure struct { logger *logp.Logger auth authenticator.Authenticator fetcher fetcher.Fetcher + + ctx v2.Context } // Name returns the name of this provider. @@ -71,6 +73,7 @@ func (p *azure) Test(testCtx v2.TestContext) error { // Run will start data collection on this provider. func (p *azure) Run(inputCtx v2.Context, store *kvstore.Store, client beat.Client) error { p.logger = inputCtx.Logger.With("tenant_id", p.conf.TenantID, "provider", Name) + p.ctx = inputCtx p.auth.SetLogger(p.logger) p.fetcher.SetLogger(p.logger) p.metrics = newMetrics(inputCtx.ID, nil) @@ -575,7 +578,7 @@ func (p *azure) configure(cfg *config.C) (kvstore.Input, error) { if p.auth, err = oauth2.New(cfg, p.Manager.Logger); err != nil { return nil, fmt.Errorf("unable to create authenticator: %w", err) } - if p.fetcher, err = graph.New(cfg, p.Manager.Logger, p.auth); err != nil { + if p.fetcher, err = graph.New(ctxtool.FromCanceller(p.ctx.Cancelation), p.ctx.ID, cfg, p.Manager.Logger, p.auth); err != nil { return nil, fmt.Errorf("unable to create fetcher: %w", err) } diff --git a/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/.gitignore b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/.gitignore new file mode 100644 index 00000000000..13df6a73f0d --- /dev/null +++ b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/.gitignore @@ -0,0 +1 @@ +*.ndjson diff --git a/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph.go 
b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph.go index 558e277d106..a3104ce0d00 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph.go +++ b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph.go @@ -15,13 +15,19 @@ import ( "io" "net/http" "net/url" + "path/filepath" "strings" "github.com/google/uuid" + "go.elastic.co/ecszap" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "gopkg.in/natefinch/lumberjack.v2" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/collections" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/azuread/authenticator" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher" + "github.com/elastic/beats/v7/x-pack/filebeat/input/internal/httplog" "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/mapstr" @@ -104,6 +110,9 @@ type graphConf struct { Select selection `config:"select"` Transport httpcommon.HTTPTransportSettings `config:",inline"` + + // Tracer allows configuration of request trace logging. + Tracer *lumberjack.Logger `config:"tracer"` } type selection struct { @@ -329,16 +338,22 @@ func (f *graph) doRequest(ctx context.Context, method, url string, body io.Reade } // New creates a new instance of the graph fetcher. 
-func New(cfg *config.C, logger *logp.Logger, auth authenticator.Authenticator) (fetcher.Fetcher, error) { +func New(ctx context.Context, id string, cfg *config.C, logger *logp.Logger, auth authenticator.Authenticator) (fetcher.Fetcher, error) { var c graphConf if err := cfg.Unpack(&c); err != nil { return nil, fmt.Errorf("unable to unpack Graph API Fetcher config: %w", err) } + if c.Tracer != nil { + id = sanitizeFileName(id) + c.Tracer.Filename = strings.ReplaceAll(c.Tracer.Filename, "*", id) + } + client, err := c.Transport.Client() if err != nil { return nil, fmt.Errorf("unable to create HTTP client: %w", err) } + client = requestTrace(ctx, client, c, logger) f := graph{ conf: c, @@ -383,6 +398,41 @@ func New(cfg *config.C, logger *logp.Logger, auth authenticator.Authenticator) ( return &f, nil } +// requestTrace decorates cli with an httplog.LoggingRoundTripper if cfg.Tracer +// is non-nil. +func requestTrace(ctx context.Context, cli *http.Client, cfg graphConf, log *logp.Logger) *http.Client { + if cfg.Tracer == nil { + return cli + } + w := zapcore.AddSync(cfg.Tracer) + go func() { + // Close the logger when we are done. + <-ctx.Done() + cfg.Tracer.Close() + }() + core := ecszap.NewCore( + ecszap.NewDefaultEncoderConfig(), + w, + zap.DebugLevel, + ) + traceLogger := zap.New(core) + + const margin = 10e3 // 10kB ought to be enough room for all the remainder of the trace details. + maxSize := cfg.Tracer.MaxSize * 1e6 + cli.Transport = httplog.NewLoggingRoundTripper(cli.Transport, traceLogger, max(0, maxSize-margin), log) + return cli +} + +// sanitizeFileName returns name with ":" and "/" replaced with "_", removing +// repeated instances. The tracer.filename may have ":" when an input +// has cursor config and the macOS Finder will treat this as a path separator, +// causing strange file paths to show up.
+func sanitizeFileName(name string) string { + name = strings.ReplaceAll(name, ":", string(filepath.Separator)) + name = filepath.Clean(name) + return strings.ReplaceAll(name, string(filepath.Separator), "_") +} + func formatQuery(name string, query []string, dflt string) string { q := dflt if len(query) != 0 { diff --git a/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph_test.go b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph_test.go index f439cc91679..f2fc2effe29 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph_test.go +++ b/x-pack/filebeat/input/entityanalytics/provider/azuread/fetcher/graph/graph_test.go @@ -7,6 +7,7 @@ package graph import ( "context" "encoding/json" + "flag" "fmt" "net/http" "net/http/httptest" @@ -19,6 +20,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/uuid" "github.com/stretchr/testify/require" + "gopkg.in/natefinch/lumberjack.v2" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/collections" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/azuread/authenticator/mock" @@ -27,6 +29,8 @@ import ( "github.com/elastic/elastic-agent-libs/logp" ) +var trace = flag.Bool("request_trace", false, "enable request tracing during tests") + var usersResponse1 = apiUserResponse{ Users: []userAPI{ { @@ -313,11 +317,16 @@ func TestGraph_Groups(t *testing.T) { rawConf := graphConf{ APIEndpoint: "http://" + testSrv.addr, } + if *trace { + rawConf.Tracer = &lumberjack.Logger{ + Filename: "test_trace-*.ndjson", + } + } c, err := config.NewConfigFrom(&rawConf) require.NoError(t, err) auth := mock.New(mock.DefaultTokenValue) - f, err := New(c, logp.L(), auth) + f, err := New(context.Background(), t.Name(), c, logp.L(), auth) require.NoError(t, err) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -372,11 +381,16 @@ func TestGraph_Users(t *testing.T) { rawConf := graphConf{ 
APIEndpoint: "http://" + testSrv.addr, } + if *trace { + rawConf.Tracer = &lumberjack.Logger{ + Filename: "test_trace-*.ndjson", + } + } c, err := config.NewConfigFrom(&rawConf) require.NoError(t, err) auth := mock.New(mock.DefaultTokenValue) - f, err := New(c, logp.L(), auth) + f, err := New(context.Background(), t.Name(), c, logp.L(), auth) require.NoError(t, err) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -477,11 +491,16 @@ func TestGraph_Devices(t *testing.T) { APIEndpoint: "http://" + testSrv.addr, Select: test.selection, } + if *trace { + rawConf.Tracer = &lumberjack.Logger{ + Filename: "test_trace-*.ndjson", + } + } c, err := config.NewConfigFrom(&rawConf) require.NoError(t, err) auth := mock.New(mock.DefaultTokenValue) - f, err := New(c, logp.L(), auth) + f, err := New(context.Background(), t.Name(), c, logp.L(), auth) require.NoError(t, err) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/.gitignore b/x-pack/filebeat/input/entityanalytics/provider/okta/.gitignore new file mode 100644 index 00000000000..13df6a73f0d --- /dev/null +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/.gitignore @@ -0,0 +1 @@ +*.ndjson diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/conf.go b/x-pack/filebeat/input/entityanalytics/provider/okta/conf.go index eb0906e78d5..873a6195d47 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/conf.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/conf.go @@ -9,6 +9,8 @@ import ( "strings" "time" + "gopkg.in/natefinch/lumberjack.v2" + "github.com/elastic/elastic-agent-libs/transport/httpcommon" ) @@ -62,6 +64,9 @@ type conf struct { // Request is the configuration for establishing // HTTP requests to the API. Request *requestConfig `config:"request"` + + // Tracer allows configuration of request trace logging. 
+ Tracer *lumberjack.Logger `config:"tracer"` } type requestConfig struct { @@ -163,10 +168,23 @@ func (c *conf) Validate() error { } switch strings.ToLower(c.Dataset) { case "", "all", "users", "devices": - return nil default: return errors.New("dataset must be 'all', 'users', 'devices' or empty") } + + if c.Tracer == nil { + return nil + } + if c.Tracer.Filename == "" { + return errors.New("request tracer must have a filename if used") + } + if c.Tracer.MaxSize == 0 { + // By default Lumberjack caps file sizes at 100MB which + // is excessive for a debugging logger, so default to 1MB + // which is the minimum. + c.Tracer.MaxSize = 1 + } + return nil } func (c *conf) wantUsers() bool { diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go b/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go index 70f95d7396e..0980575df3a 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/okta.go @@ -12,10 +12,14 @@ import ( "io" "net/http" "net/url" + "path/filepath" + "strings" "time" "github.com/hashicorp/go-retryablehttp" + "go.elastic.co/ecszap" "go.uber.org/zap" + "go.uber.org/zap/zapcore" "golang.org/x/time/rate" v2 "github.com/elastic/beats/v7/filebeat/input/v2" @@ -23,6 +27,7 @@ import ( "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/kvstore" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta" + "github.com/elastic/beats/v7/x-pack/filebeat/input/internal/httplog" "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/mapstr" @@ -105,8 +110,13 @@ func (p *oktaInput) Run(inputCtx v2.Context, store *kvstore.Store, client beat.C // Allow a single fetch operation to obtain limits from the API. 
p.lim = rate.NewLimiter(1, 1) + if p.cfg.Tracer != nil { + id := sanitizeFileName(inputCtx.ID) + p.cfg.Tracer.Filename = strings.ReplaceAll(p.cfg.Tracer.Filename, "*", id) + } + var err error - p.client, err = newClient(p.cfg, p.logger) + p.client, err = newClient(ctxtool.FromCanceller(inputCtx.Cancelation), p.cfg, p.logger) if err != nil { return err } @@ -152,12 +162,14 @@ func (p *oktaInput) Run(inputCtx v2.Context, store *kvstore.Store, client beat.C } } -func newClient(cfg conf, log *logp.Logger) (*http.Client, error) { +func newClient(ctx context.Context, cfg conf, log *logp.Logger) (*http.Client, error) { c, err := cfg.Request.Transport.Client(clientOptions(cfg.Request.KeepAlive.settings())...) if err != nil { return nil, err } + c = requestTrace(ctx, c, cfg, log) + c.CheckRedirect = checkRedirect(cfg.Request, log) client := &retryablehttp.Client{ @@ -169,10 +181,44 @@ func newClient(cfg conf, log *logp.Logger) (*http.Client, error) { CheckRetry: retryablehttp.DefaultRetryPolicy, Backoff: retryablehttp.DefaultBackoff, } - return client.StandardClient(), nil } +// requestTrace decorates cli with an httplog.LoggingRoundTripper if cfg.Tracer +// is non-nil. +func requestTrace(ctx context.Context, cli *http.Client, cfg conf, log *logp.Logger) *http.Client { + if cfg.Tracer == nil { + return cli + } + w := zapcore.AddSync(cfg.Tracer) + go func() { + // Close the logger when we are done. + <-ctx.Done() + cfg.Tracer.Close() + }() + core := ecszap.NewCore( + ecszap.NewDefaultEncoderConfig(), + w, + zap.DebugLevel, + ) + traceLogger := zap.New(core) + + const margin = 10e3 // 10kB ought to be enough room for all the remainder of the trace details. + maxSize := cfg.Tracer.MaxSize * 1e6 + cli.Transport = httplog.NewLoggingRoundTripper(cli.Transport, traceLogger, max(0, maxSize-margin), log) + return cli +} + +// sanitizeFileName returns name with ":" and "/" replaced with "_", removing +// repeated instances.
The request.tracer.filename may have ":" when an input +// has cursor config and the macOS Finder will treat this as a path separator, +// causing strange file paths to show up. +func sanitizeFileName(name string) string { + name = strings.ReplaceAll(name, ":", string(filepath.Separator)) + name = filepath.Clean(name) + return strings.ReplaceAll(name, string(filepath.Separator), "_") +} + // clientOption returns constructed client configuration options, including // setting up http+unix and http+npipe transports if requested. func clientOptions(keepalive httpcommon.WithKeepaliveSettings) []httpcommon.TransportOption { diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go b/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go index da29666712b..cf3f1423006 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/okta_test.go @@ -7,6 +7,7 @@ package okta import ( "context" "encoding/json" + "flag" "fmt" "net/http" "net/http/httptest" @@ -17,11 +18,14 @@ import ( "time" "golang.org/x/time/rate" + "gopkg.in/natefinch/lumberjack.v2" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta" "github.com/elastic/elastic-agent-libs/logp" ) +var trace = flag.Bool("request_trace", false, "enable request tracing during tests") + func TestOktaDoFetch(t *testing.T) { tests := []struct { dataset string @@ -153,6 +157,16 @@ func TestOktaDoFetch(t *testing.T) { lim: rate.NewLimiter(1, 1), logger: logp.L(), } + if *trace { + name := test.dataset + if name == "" { + name = "default" + } + a.cfg.Tracer = &lumberjack.Logger{ + Filename: fmt.Sprintf("test_trace_%s.ndjson", name), + } + } + a.client = requestTrace(context.Background(), a.client, a.cfg, a.logger) ss, err := newStateStore(store) if err != nil { From 1fd65c7f980d69886cb894014f4e9737ce05bb71 Mon Sep 17 00:00:00 2001 From: Dan Kortschak Date: Tue, 11 Jun 2024 06:28:56 +0930
Subject: [PATCH 18/21] x-pack/filebeat/input/{cel,httpjson}: fix typo in minimum log length (#39834) --- CHANGELOG-developer.next.asciidoc | 1 + x-pack/filebeat/input/cel/input.go | 2 +- x-pack/filebeat/input/httpjson/input.go | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG-developer.next.asciidoc b/CHANGELOG-developer.next.asciidoc index 58f9cea5211..995b52511b8 100644 --- a/CHANGELOG-developer.next.asciidoc +++ b/CHANGELOG-developer.next.asciidoc @@ -99,6 +99,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only. - Fix panic when more than 32767 pipeline clients are active. {issue}38197[38197] {pull}38556[38556] - Skip flakey metrics test on windows in filebeat httpjson input. {issue}39676[39676] {pull}39678[39678] - Fix flakey test on Windows 2022 in packetbeat/route. {issue}39698[39698] {pull}39822[39822] +- Fix bug in minimum length for request trace logging. {pull}39834[39834] ==== Added diff --git a/x-pack/filebeat/input/cel/input.go b/x-pack/filebeat/input/cel/input.go index 7e8a7584c28..c70941a25a5 100644 --- a/x-pack/filebeat/input/cel/input.go +++ b/x-pack/filebeat/input/cel/input.go @@ -754,7 +754,7 @@ func newClient(ctx context.Context, cfg config, log *logp.Logger, reg *monitorin ) traceLogger := zap.New(core) - const margin = 1e3 // 1OkB ought to be enough room for all the remainder of the trace details. + const margin = 10e3 // 10kB ought to be enough room for all the remainder of the trace details.
maxSize := cfg.Resource.Tracer.MaxSize * 1e6 trace = httplog.NewLoggingRoundTripper(c.Transport, traceLogger, max(0, maxSize-margin), log) c.Transport = trace diff --git a/x-pack/filebeat/input/httpjson/input.go b/x-pack/filebeat/input/httpjson/input.go index 0764259619b..cb91723f464 100644 --- a/x-pack/filebeat/input/httpjson/input.go +++ b/x-pack/filebeat/input/httpjson/input.go @@ -259,7 +259,7 @@ func newNetHTTPClient(ctx context.Context, cfg *requestConfig, log *logp.Logger, ) traceLogger := zap.New(core) - const margin = 1e3 // 1OkB ought to be enough room for all the remainder of the trace details. + const margin = 10e3 // 10kB ought to be enough room for all the remainder of the trace details. maxSize := cfg.Tracer.MaxSize*1e6 - margin if maxSize < 0 { maxSize = 0 From 3c9f4d952bfd20b1898cfeb59916a2239b667988 Mon Sep 17 00:00:00 2001 From: kruskall <99559985+kruskall@users.noreply.github.com> Date: Mon, 10 Jun 2024 23:46:23 +0200 Subject: [PATCH 19/21] refactor: replace urso/sderr with stdlib errors (#39839) * refactor: replace urso/sderr with stdlib errors Go 1.20 added multiple errors wrapping so we can migrate to stdlib errors and drop the additional dependency on github.com/urso/sderr * refactor: avoid wrapping and unwrapping * Update copytruncate_prospector.go * Update statestore.go --- NOTICE.txt | 422 +++++++++--------- .../filestream/copytruncate_prospector.go | 5 +- .../internal/input-logfile/manager.go | 4 +- filebeat/input/filestream/prospector.go | 5 +- filebeat/input/journald/input.go | 4 +- .../input/journald/pkg/journalread/reader.go | 9 +- filebeat/input/v2/input-cursor/input.go | 7 +- filebeat/input/v2/input-cursor/manager.go | 5 +- go.mod | 2 +- .../provider/azuread/statestore.go | 3 +- .../provider/okta/statestore.go | 8 +- 11 files changed, 231 insertions(+), 243 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt index c7bfc6b351b..f6881a796c0 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -23158,217 +23158,6 @@ OUT OF OR IN CONNECTION WITH
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --------------------------------------------------------------------------------- -Dependency : github.com/urso/sderr -Version: v0.0.0-20210525210834-52b04e8f5c71 -Licence type (autodetected): Apache-2.0 --------------------------------------------------------------------------------- - -Contents of probable licence file $GOMODCACHE/github.com/urso/sderr@v0.0.0-20210525210834-52b04e8f5c71/LICENSE: - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - -------------------------------------------------------------------------------- Dependency : github.com/vmware/govmomi Version: v0.0.0-20170802214208-2cad15190b41 @@ -52541,6 +52330,217 @@ Contents of probable licence file $GOMODCACHE/github.com/urso/diag@v0.0.0-202002 limitations under the License. +-------------------------------------------------------------------------------- +Dependency : github.com/urso/sderr +Version: v0.0.0-20210525210834-52b04e8f5c71 +Licence type (autodetected): Apache-2.0 +-------------------------------------------------------------------------------- + +Contents of probable licence file $GOMODCACHE/github.com/urso/sderr@v0.0.0-20210525210834-52b04e8f5c71/LICENSE: + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + -------------------------------------------------------------------------------- Dependency : github.com/vishvananda/netlink Version: v1.1.0 diff --git a/filebeat/input/filestream/copytruncate_prospector.go b/filebeat/input/filestream/copytruncate_prospector.go index 5b1c6bdd427..50ea7df25c0 100644 --- a/filebeat/input/filestream/copytruncate_prospector.go +++ b/filebeat/input/filestream/copytruncate_prospector.go @@ -18,14 +18,13 @@ package filestream import ( + "errors" "os" "regexp" "sort" "strconv" "time" - "github.com/urso/sderr" - loginp "github.com/elastic/beats/v7/filebeat/input/filestream/internal/input-logfile" input "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/common/file" @@ -230,7 +229,7 @@ func (p *copyTruncateFileProspector) Run(ctx input.Context, s loginp.StateMetada errs := tg.Wait() if len(errs) > 0 { - log.Error("%s", sderr.WrapAll(errs, "running prospector failed")) + log.Errorf("running prospector failed: %v", errors.Join(errs...)) } } diff --git a/filebeat/input/filestream/internal/input-logfile/manager.go b/filebeat/input/filestream/internal/input-logfile/manager.go index cfa83006117..3eb2f951036 100644 --- a/filebeat/input/filestream/internal/input-logfile/manager.go +++ b/filebeat/input/filestream/internal/input-logfile/manager.go @@ -25,8 +25,6 @@ import ( "sync" "time" - "github.com/urso/sderr" - 
"github.com/elastic/go-concert/unison" v2 "github.com/elastic/beats/v7/filebeat/input/v2" @@ -141,7 +139,7 @@ func (cim *InputManager) Init(group unison.Group) error { if err != nil { store.Release() cim.shutdown() - return sderr.Wrap(err, "Can not start registry cleanup process") + return fmt.Errorf("Can not start registry cleanup process: %w", err) } return nil diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 336461fede5..2bf737a86fd 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -18,11 +18,10 @@ package filestream import ( + "errors" "fmt" "time" - "github.com/urso/sderr" - loginp "github.com/elastic/beats/v7/filebeat/input/filestream/internal/input-logfile" input "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" @@ -160,7 +159,7 @@ func (p *fileProspector) Run(ctx input.Context, s loginp.StateMetadataUpdater, h errs := tg.Wait() if len(errs) > 0 { - log.Error("%s", sderr.WrapAll(errs, "running prospector failed")) + log.Errorf("running prospector failed: %v", errors.Join(errs...)) } } diff --git a/filebeat/input/journald/input.go b/filebeat/input/journald/input.go index c32d677ffa4..a704962d28c 100644 --- a/filebeat/input/journald/input.go +++ b/filebeat/input/journald/input.go @@ -20,10 +20,10 @@ package journald import ( + "fmt" "time" "github.com/coreos/go-systemd/v22/sdjournal" - "github.com/urso/sderr" "github.com/elastic/beats/v7/filebeat/input/journald/pkg/journalfield" "github.com/elastic/beats/v7/filebeat/input/journald/pkg/journalread" @@ -181,7 +181,7 @@ func (inp *journald) open(log *logp.Logger, canceler input.Canceler, src cursor. 
withTransports(inp.Transports), withSyslogIdentifiers(inp.Identifiers)) if err != nil { - return nil, sderr.Wrap(err, "failed to create reader for %{path} journal", src.Name()) + return nil, fmt.Errorf("failed to create reader for %s journal: %w", src.Name(), err) } return reader, nil diff --git a/filebeat/input/journald/pkg/journalread/reader.go b/filebeat/input/journald/pkg/journalread/reader.go index 9994c0aad7c..6b99036d871 100644 --- a/filebeat/input/journald/pkg/journalread/reader.go +++ b/filebeat/input/journald/pkg/journalread/reader.go @@ -27,7 +27,6 @@ import ( "time" "github.com/coreos/go-systemd/v22/sdjournal" - "github.com/urso/sderr" "github.com/elastic/beats/v7/libbeat/common/backoff" "github.com/elastic/beats/v7/libbeat/common/cleanup" @@ -96,27 +95,27 @@ func openJournal(path string) (*sdjournal.Journal, error) { if path == localSystemJournalID || path == "" { j, err := sdjournal.NewJournal() if err != nil { - err = sderr.Wrap(err, "failed to open local journal") + err = fmt.Errorf("failed to open local journal: %w", err) } return j, err } stat, err := os.Stat(path) if err != nil { - return nil, sderr.Wrap(err, "failed to read meta data for %{path}", path) + return nil, fmt.Errorf("failed to read meta data for %s: %w", path, err) } if stat.IsDir() { j, err := sdjournal.NewJournalFromDir(path) if err != nil { - err = sderr.Wrap(err, "failed to open journal directory %{path}", path) + err = fmt.Errorf("failed to open journal directory %s: %w", path, err) } return j, err } j, err := sdjournal.NewJournalFromFiles(path) if err != nil { - err = sderr.Wrap(err, "failed to open journal file %{path}", path) + err = fmt.Errorf("failed to open journal file %s: %w", path, err) } return j, err } diff --git a/filebeat/input/v2/input-cursor/input.go b/filebeat/input/v2/input-cursor/input.go index 37036e983c6..c1d4cec0762 100644 --- a/filebeat/input/v2/input-cursor/input.go +++ b/filebeat/input/v2/input-cursor/input.go @@ -19,12 +19,11 @@ package cursor import ( 
"context" + "errors" "fmt" "runtime/debug" "time" - "github.com/urso/sderr" - "github.com/elastic/go-concert/ctxtool" "github.com/elastic/go-concert/unison" @@ -81,7 +80,7 @@ func (inp *managedInput) Test(ctx input.TestContext) error { errs := grp.Wait() if len(errs) > 0 { - return sderr.WrapAll(errs, "input tests failed") + return fmt.Errorf("input tests failed: %w", errors.Join(errs...)) } return nil } @@ -127,7 +126,7 @@ func (inp *managedInput) Run( } if errs := grp.Wait(); len(errs) > 0 { - return sderr.WrapAll(errs, "input %{id} failed", ctx.ID) + return fmt.Errorf("input %s failed: %w", ctx.ID, errors.Join(errs...)) } return nil } diff --git a/filebeat/input/v2/input-cursor/manager.go b/filebeat/input/v2/input-cursor/manager.go index 0d004acc16a..1d5578a7122 100644 --- a/filebeat/input/v2/input-cursor/manager.go +++ b/filebeat/input/v2/input-cursor/manager.go @@ -20,11 +20,10 @@ package cursor import ( "context" "errors" + "fmt" "sync" "time" - "github.com/urso/sderr" - "github.com/elastic/go-concert/unison" v2 "github.com/elastic/beats/v7/filebeat/input/v2" @@ -131,7 +130,7 @@ func (cim *InputManager) Init(group unison.Group) error { if err != nil { store.Release() cim.shutdown() - return sderr.Wrap(err, "Can not start registry cleanup process") + return fmt.Errorf("Can not start registry cleanup process: %w", err) } return nil diff --git a/go.mod b/go.mod index bf3ff2ad312..d7709734067 100644 --- a/go.mod +++ b/go.mod @@ -144,7 +144,7 @@ require ( github.com/stretchr/testify v1.9.0 github.com/tsg/go-daemon v0.0.0-20200207173439-e704b93fd89b github.com/ugorji/go/codec v1.1.8 - github.com/urso/sderr v0.0.0-20210525210834-52b04e8f5c71 + github.com/urso/sderr v0.0.0-20210525210834-52b04e8f5c71 // indirect github.com/vmware/govmomi v0.0.0-20170802214208-2cad15190b41 github.com/xdg/scram v1.0.3 go.elastic.co/ecszap v1.0.2 diff --git a/x-pack/filebeat/input/entityanalytics/provider/azuread/statestore.go 
b/x-pack/filebeat/input/entityanalytics/provider/azuread/statestore.go index 4073b6f0e45..392ce5f0460 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/azuread/statestore.go +++ b/x-pack/filebeat/input/entityanalytics/provider/azuread/statestore.go @@ -11,7 +11,6 @@ import ( "time" "github.com/google/uuid" - "github.com/urso/sderr" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/collections" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/kvstore" @@ -176,7 +175,7 @@ func (s *stateStore) close(commit bool) (err error) { } if err != nil { - err = sderr.WrapAll([]error{err, rollbackErr}, "multiple errors during statestore close") + err = fmt.Errorf("multiple errors during statestore close: %w", errors.Join(err, rollbackErr)) } else { err = rollbackErr } diff --git a/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go b/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go index a54fc3b9928..401b3353d14 100644 --- a/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go +++ b/x-pack/filebeat/input/entityanalytics/provider/okta/statestore.go @@ -10,8 +10,6 @@ import ( "fmt" "time" - "github.com/urso/sderr" - "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/internal/kvstore" "github.com/elastic/beats/v7/x-pack/filebeat/input/entityanalytics/provider/okta/internal/okta" ) @@ -187,10 +185,8 @@ func (s *stateStore) close(commit bool) (err error) { return } rollbackErr := s.tx.Rollback() - if rollbackErr == nil { - // FIXME: Use fmt.Errorf("multiple errors during statestore close: %w", errors.Join(err, rollbackErr)) - // when go1.20 is supported. 
- err = sderr.WrapAll([]error{err, rollbackErr}, "multiple errors during statestore close") + if rollbackErr != nil { + err = fmt.Errorf("multiple errors during statestore close: %w", errors.Join(err, rollbackErr)) } }() From f8aedce388312b782a19b053508541a0901dbf18 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 11 Jun 2024 16:51:08 -0400 Subject: [PATCH 20/21] [libbeat] Add a metrics observer to the queue (#39774) Add a metrics observer to the queue, reporting the metrics: - `queue.added.{events, bytes}`, the number of events/bytes added to the queue - `queue.consumed.{events, bytes}`, the number of events/bytes sent to the outputs - `queue.removed.{events, bytes}`, the number of events/bytes removed from the queue after acknowledgment (`queue.removed.events` is an alias for the existing `queue.acked`). `queue.filled.{events, bytes}`, the current number of events and bytes in the queue (gauges) It also fixes the behavior of `queue.filled.pct.events`, renaming it `queue.filled.pct`. All byte values reported by the memory queue are 0 if the output doesn't support early encoding. This required some refactoring to the pipeline, which previously used a single custom callback to track its only queue metric (`queue.acked`) from `outputObserver`, and also used that to manage a wait group that was used to drain the queue on pipeline shutdown. The main changes are: - A new interface type, `queue.Observer`, with an implementation `queueObserver` for standard metrics reporting. - `queueMaxEvents` and `queueACKed` were removed from `pipeline.outputObserver`, since their logic is now handled by `queue.Observer`. - A queue factory now takes a `queue.Observer` instead of an ACK callback - The queue API now includes a `Done()` channel that signals when all events are acked / shutdown is complete, so shutdown handling now waits on that channel in `outputController.Close` instead of the shared waitgroup in `Pipeline.Close`. 
- `pipeline.outputObserver` was renamed `pipeline.retryObserver` since its only remaining functions track retries and retry failures. It is now owned by `eventConsumer` (its only caller) instead of `pipeline.outputController`. The queue previously had a `Metrics()` call that was used in the shipper but didn't integrate with Beats metrics. It had no remaining callers, so I deleted it while adding the new helpers. --- filebeat/tests/system/test_reload_inputs.py | 4 +- libbeat/docs/metrics-in-logs.asciidoc | 49 +++- libbeat/monitoring/report/log/log.go | 63 ++--- libbeat/publisher/pipeline/client.go | 16 +- libbeat/publisher/pipeline/client_test.go | 75 ++---- libbeat/publisher/pipeline/consumer.go | 16 +- libbeat/publisher/pipeline/controller.go | 108 +++++---- libbeat/publisher/pipeline/controller_test.go | 18 +- libbeat/publisher/pipeline/monitoring.go | 50 +--- libbeat/publisher/pipeline/pipeline.go | 51 ++-- libbeat/publisher/pipeline/pipeline_test.go | 8 +- libbeat/publisher/queue/diskqueue/consumer.go | 7 + .../queue/diskqueue/consumer_test.go | 61 +++++ .../publisher/queue/diskqueue/core_loop.go | 37 +-- .../queue/diskqueue/core_loop_test.go | 63 +++++ libbeat/publisher/queue/diskqueue/producer.go | 2 +- libbeat/publisher/queue/diskqueue/queue.go | 122 +++------- .../publisher/queue/diskqueue/queue_test.go | 41 ---- libbeat/publisher/queue/diskqueue/segments.go | 5 +- .../publisher/queue/diskqueue/writer_loop.go | 15 +- libbeat/publisher/queue/memqueue/ackloop.go | 4 - libbeat/publisher/queue/memqueue/broker.go | 72 +++--- .../publisher/queue/memqueue/internal_api.go | 16 -- libbeat/publisher/queue/memqueue/produce.go | 26 +- .../publisher/queue/memqueue/queue_test.go | 224 ------------------ libbeat/publisher/queue/memqueue/runloop.go | 64 +++-- .../publisher/queue/memqueue/runloop_test.go | 89 +++++++ libbeat/publisher/queue/monitoring.go | 153 ++++++++++++ libbeat/publisher/queue/queue.go | 39 +-- 29 files changed, 733 insertions(+), 765 deletions(-) create 
mode 100644 libbeat/publisher/queue/diskqueue/consumer_test.go create mode 100644 libbeat/publisher/queue/monitoring.go diff --git a/filebeat/tests/system/test_reload_inputs.py b/filebeat/tests/system/test_reload_inputs.py index 53644837c2c..6f380bb6d4e 100644 --- a/filebeat/tests/system/test_reload_inputs.py +++ b/filebeat/tests/system/test_reload_inputs.py @@ -91,8 +91,6 @@ def test_start_stop(self): inputs=False, ) - proc = self.start_beat() - os.mkdir(self.working_dir + "/logs/") logfile = self.working_dir + "/logs/test.log" os.mkdir(self.working_dir + "/configs/") @@ -103,6 +101,8 @@ def test_start_stop(self): with open(logfile, 'w') as f: f.write("Hello world\n") + proc = self.start_beat() + self.wait_until(lambda: self.output_lines() == 1) # Remove input by moving the file diff --git a/libbeat/docs/metrics-in-logs.asciidoc b/libbeat/docs/metrics-in-logs.asciidoc index 97aac4f3a30..27d27ef9d43 100644 --- a/libbeat/docs/metrics-in-logs.asciidoc +++ b/libbeat/docs/metrics-in-logs.asciidoc @@ -2,11 +2,11 @@ Every 30 seconds (by default), {beatname_uc} collects a _snapshot_ of metrics about itself. From this snapshot, {beatname_uc} computes a _delta snapshot_; this delta snapshot contains any metrics that have _changed_ since the last snapshot. Note that the values of the metrics are the values when the snapshot is taken, _NOT_ the _difference_ in values from the last snapshot. -If this delta snapshot contains _any_ metrics (indicating at least one metric that has changed since the last snapshot), this delta snapshot is serialized as JSON and emitted in {beatname_uc}'s logs at the `INFO` log level. Here is an example of such a log entry: +If this delta snapshot contains _any_ metrics (indicating at least one metric that has changed since the last snapshot), this delta snapshot is serialized as JSON and emitted in {beatname_uc}'s logs at the `INFO` log level. 
Most snapshot fields report the change in the metric since the last snapshot, however some fields are _gauges_, which always report the current value. Here is an example of such a log entry: [source,json] ---- -{"log.level":"info","@timestamp":"2023-07-14T12:50:36.811Z","log.logger":"monitoring","log.origin":{"file.name":"log/log.go","file.line":187},"message":"Non-zero metrics in the last 30s","service.name":"filebeat","monitoring":{"metrics":{"beat":{"cgroup":{"memory":{"mem":{"usage":{"bytes":0}}}},"cpu":{"system":{"ticks":692690,"time":{"ms":60}},"total":{"ticks":3167250,"time":{"ms":150},"value":3167250},"user":{"ticks":2474560,"time":{"ms":90}}},"handles":{"limit":{"hard":1048576,"soft":1048576},"open":32},"info":{"ephemeral_id":"2bab8688-34c0-4522-80af-db86948d547d","uptime":{"ms":617670096},"version":"8.6.2"},"memstats":{"gc_next":57189272,"memory_alloc":43589824,"memory_total":275281335792,"rss":183574528},"runtime":{"goroutines":212}},"filebeat":{"events":{"active":5,"added":52,"done":49},"harvester":{"open_files":6,"running":6,"started":1}},"libbeat":{"config":{"module":{"running":15}},"output":{"events":{"acked":48,"active":0,"batches":6,"total":48},"read":{"bytes":210},"write":{"bytes":26923}},"pipeline":{"clients":15,"events":{"active":5,"filtered":1,"published":51,"total":52},"queue":{"acked":48}}},"registrar":{"states":{"current":14,"update":49},"writes":{"success":6,"total":6}},"system":{"load":{"1":0.91,"15":0.37,"5":0.4,"norm":{"1":0.1138,"15":0.0463,"5":0.05}}}},"ecs.version":"1.6.0"}} +{"log.level":"info","@timestamp":"2023-07-14T12:50:36.811Z","log.logger":"monitoring","log.origin":{"file.name":"log/log.go","file.line":187},"message":"Non-zero metrics in the last 
30s","service.name":"filebeat","monitoring":{"metrics":{"beat":{"cgroup":{"memory":{"mem":{"usage":{"bytes":0}}}},"cpu":{"system":{"ticks":692690,"time":{"ms":60}},"total":{"ticks":3167250,"time":{"ms":150},"value":3167250},"user":{"ticks":2474560,"time":{"ms":90}}},"handles":{"limit":{"hard":1048576,"soft":1048576},"open":32},"info":{"ephemeral_id":"2bab8688-34c0-4522-80af-db86948d547d","uptime":{"ms":617670096},"version":"8.6.2"},"memstats":{"gc_next":57189272,"memory_alloc":43589824,"memory_total":275281335792,"rss":183574528},"runtime":{"goroutines":212}},"filebeat":{"events":{"active":5,"added":52,"done":49},"harvester":{"open_files":6,"running":6,"started":1}},"libbeat":{"config":{"module":{"running":15}},"output":{"events":{"acked":48,"active":0,"batches":6,"total":48},"read":{"bytes":210},"write":{"bytes":26923}},"pipeline":{"clients":15,"events":{"active":5,"filtered":1,"published":51,"total":52},"queue":{"max_events":3500,"filled":{"events":5,"bytes":6425,"pct":0.0014},"added":{"events":52,"bytes":65702},"consumed":{"events":52,"bytes":65702},"removed":{"events":48,"bytes":59277},"acked":48}}},"registrar":{"states":{"current":14,"update":49},"writes":{"success":6,"total":6}},"system":{"load":{"1":0.91,"15":0.37,"5":0.4,"norm":{"1":0.1138,"15":0.0463,"5":0.05}}}},"ecs.version":"1.6.0"}} ---- [discrete] @@ -113,6 +113,24 @@ Focussing on the `.monitoring.metrics` field, and formatting the JSON, it's valu "total": 52 }, "queue": { + "max_events": 3500, + "filled": { + "events": 5, + "bytes": 6425, + "pct": 0.0014 + }, + "added": { + "events": 52, + "bytes": 65702 + }, + "consumed": { + "events": 52, + "bytes": 65702 + }, + "removed": { + "events": 48, + "bytes": 59277 + }, "acked": 48 } } @@ -130,12 +148,12 @@ Focussing on the `.monitoring.metrics` field, and formatting the JSON, it's valu "system": { "load": { "1": 0.91, - "5": 0.4, "15": 0.37, + "5": 0.4, "norm": { "1": 0.1138, - "5": 0.05, - "15": 0.0463 + "15": 0.0463, + "5": 0.05 } } } @@ -170,9 +188,30 
@@ endif::[] | `.output.events.total` | Integer | Number of events currently being processed by the output. | If this number grows over time, it may indicate that the output destination (e.g. {ls} pipeline or {es} cluster) is not able to accept events at the same or faster rate than what {beatname_uc} is sending to it. | `.output.events.acked` | Integer | Number of events acknowledged by the output destination. | Generally, we want this number to be the same as `.output.events.total` as this indicates that the output destination has reliably received all the events sent to it. | `.output.events.failed` | Integer | Number of events that {beatname_uc} tried to send to the output destination, but the destination failed to receive them. | Generally, we want this field to be absent or its value to be zero. When the value is greater than zero, it's useful to check {beatname_uc}'s logs right before this log entry's `@timestamp` to see if there are any connectivity issues with the output destination. Note that failed events are not lost or dropped; they will be sent back to the publisher pipeline for retrying later. +| `.output.events.dropped` | Integer | Number of events that {beatname_uc} gave up sending to the output destination because of a permanent (non-retryable) error. | +| `.output.events.dead_letter` | Integer | Number of events that {beatname_uc} successfully sent to a configured dead letter index after they failed to ingest in the primary index. | | `.output.write.latency` | Object | Reports statistics on the time to send an event to the connected output, in milliseconds. This can be used to diagnose delays and performance issues caused by I/O or output configuration. This metric is available for the Elasticsearch, file, redis, and logstash outputs.
|=== +[cols="1,1,2,2"] +|=== +| Field path (relative to `.monitoring.metrics.libbeat.pipeline`) | Type | Meaning | Troubleshooting hints + +| `.queue.max_events` | Integer (gauge) | The queue's maximum event count if it has one, otherwise zero. | +| `.queue.max_bytes` | Integer (gauge) | The queue's maximum byte count if it has one, otherwise zero. | +| `.queue.filled.events` | Integer (gauge) | Number of events currently stored by the queue. | +| `.queue.filled.bytes` | Integer (gauge) | Number of bytes currently stored by the queue. | +| `.queue.filled.pct` | Float (gauge) | How full the queue is relative to its maximum size, as a fraction from 0 to 1. | Low throughput while `queue.filled.pct` is low means congestion in the input. Low throughput while `queue.filled.pct` is high means congestion in the output. +| `.queue.added.events` | Integer | Number of events added to the queue by input workers. | +| `.queue.added.bytes` | Integer | Number of bytes added to the queue by input workers. | +| `.queue.consumed.events` | Integer | Number of events sent to output workers. | +| `.queue.consumed.bytes` | Integer | Number of bytes sent to output workers. | +| `.queue.removed.events` | Integer | Number of events removed from the queue after being processed by output workers. | +| `.queue.removed.bytes` | Integer | Number of bytes removed from the queue after being processed by output workers. | +|=== + +When using the memory queue, byte metrics are only set if the output supports them. Currently only the Elasticsearch output supports byte metrics. + ifeval::["{beatname_lc}"=="filebeat"] [cols="1,1,2,2"] |=== diff --git a/libbeat/monitoring/report/log/log.go b/libbeat/monitoring/report/log/log.go index e11e8228cf7..b40c6d33e42 100644 --- a/libbeat/monitoring/report/log/log.go +++ b/libbeat/monitoring/report/log/log.go @@ -37,36 +37,39 @@ import ( // TODO: Replace this with a proper solution that uses the metric type from // where it is defined.
See: https://github.com/elastic/beats/issues/5433 var gauges = map[string]bool{ - "libbeat.output.events.active": true, - "libbeat.pipeline.events.active": true, - "libbeat.pipeline.clients": true, - "libbeat.pipeline.queue.max_events": true, - "libbeat.pipeline.queue.filled.pct.events": true, - "libbeat.config.module.running": true, - "registrar.states.current": true, - "filebeat.events.active": true, - "filebeat.harvester.running": true, - "filebeat.harvester.open_files": true, - "beat.memstats.memory_total": true, - "beat.memstats.memory_alloc": true, - "beat.memstats.rss": true, - "beat.memstats.gc_next": true, - "beat.info.uptime.ms": true, - "beat.cgroup.memory.mem.usage.bytes": true, - "beat.cpu.user.ticks": true, - "beat.cpu.system.ticks": true, - "beat.cpu.total.value": true, - "beat.cpu.total.ticks": true, - "beat.handles.open": true, - "beat.handles.limit.hard": true, - "beat.handles.limit.soft": true, - "beat.runtime.goroutines": true, - "system.load.1": true, - "system.load.5": true, - "system.load.15": true, - "system.load.norm.1": true, - "system.load.norm.5": true, - "system.load.norm.15": true, + "libbeat.output.events.active": true, + "libbeat.pipeline.events.active": true, + "libbeat.pipeline.clients": true, + "libbeat.pipeline.queue.max_events": true, + "libbeat.pipeline.queue.max_bytes": true, + "libbeat.pipeline.queue.filled.events": true, + "libbeat.pipeline.queue.filled.bytes": true, + "libbeat.pipeline.queue.filled.pct": true, + "libbeat.config.module.running": true, + "registrar.states.current": true, + "filebeat.events.active": true, + "filebeat.harvester.running": true, + "filebeat.harvester.open_files": true, + "beat.memstats.memory_total": true, + "beat.memstats.memory_alloc": true, + "beat.memstats.rss": true, + "beat.memstats.gc_next": true, + "beat.info.uptime.ms": true, + "beat.cgroup.memory.mem.usage.bytes": true, + "beat.cpu.user.ticks": true, + "beat.cpu.system.ticks": true, + "beat.cpu.total.value": true, + 
"beat.cpu.total.ticks": true, + "beat.handles.open": true, + "beat.handles.limit.hard": true, + "beat.handles.limit.soft": true, + "beat.runtime.goroutines": true, + "system.load.1": true, + "system.load.5": true, + "system.load.15": true, + "system.load.norm.1": true, + "system.load.norm.5": true, + "system.load.norm.15": true, } // isGauge returns true when the given metric key name represents a gauge value. diff --git a/libbeat/publisher/pipeline/client.go b/libbeat/publisher/pipeline/client.go index 7ecce6fd8c7..af756213a63 100644 --- a/libbeat/publisher/pipeline/client.go +++ b/libbeat/publisher/pipeline/client.go @@ -37,9 +37,8 @@ type client struct { mutex sync.Mutex waiter *clientCloseWaiter - eventFlags publisher.EventFlags - canDrop bool - eventWaitGroup *sync.WaitGroup + eventFlags publisher.EventFlags + canDrop bool // Open state, signaling, and sync primitives for coordinating client Close. isOpen atomic.Bool // set to false during shutdown, such that no new events will be accepted anymore. @@ -132,10 +131,8 @@ func (c *client) publish(e beat.Event) { } func (c *client) Close() error { - // first stop ack handling. ACK handler might block on wait (with timeout), waiting - // for pending events to be ACKed. 
- c.closeOnce.Do(func() { - c.isOpen.Store(false) + if c.isOpen.Swap(false) { + // Only do shutdown handling the first time Close is called c.onClosing() c.logger.Debug("client: closing acker") @@ -158,7 +155,7 @@ func (c *client) Close() error { } c.logger.Debug("client: done closing processors") } - }) + } return nil } @@ -180,9 +177,6 @@ func (c *client) onNewEvent() { } func (c *client) onPublished() { - if c.eventWaitGroup != nil { - c.eventWaitGroup.Add(1) - } c.observer.publishedEvent() if c.clientListener != nil { c.clientListener.Published() diff --git a/libbeat/publisher/pipeline/client_test.go b/libbeat/publisher/pipeline/client_test.go index 25080c90615..f729d417ca5 100644 --- a/libbeat/publisher/pipeline/client_test.go +++ b/libbeat/publisher/pipeline/client_test.go @@ -60,59 +60,27 @@ func TestClient(t *testing.T) { // Note: no asserts. If closing fails we have a deadlock, because Publish // would block forever - cases := map[string]struct { - context bool - close func(client beat.Client, cancel func()) - }{ - "close unblocks client without context": { - context: false, - close: func(client beat.Client, _ func()) { - client.Close() - }, - }, - "close unblocks client with context": { - context: true, - close: func(client beat.Client, _ func()) { - client.Close() - }, - }, - "context cancel unblocks client": { - context: true, - close: func(client beat.Client, cancel func()) { - cancel() - }, - }, - } - logp.TestingSetup() + routinesChecker := resources.NewGoroutinesChecker() + defer routinesChecker.Check(t) - for name, test := range cases { - t.Run(name, func(t *testing.T) { - routinesChecker := resources.NewGoroutinesChecker() - defer routinesChecker.Check(t) + pipeline := makePipeline(t, Settings{}, makeTestQueue()) + defer pipeline.Close() - pipeline := makePipeline(t, Settings{}, makeTestQueue()) - defer pipeline.Close() - - client, err := pipeline.ConnectWith(beat.ClientConfig{}) - if err != nil { - t.Fatal(err) - } - defer client.Close() + 
client, err := pipeline.ConnectWith(beat.ClientConfig{}) + if err != nil { + t.Fatal(err) + } - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - client.Publish(beat.Event{}) - }() + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + client.Publish(beat.Event{}) + }() - test.close(client, func() { - client.Close() - }) - wg.Wait() - }) - } + client.Close() + wg.Wait() }) t.Run("no infinite loop when processing fails", func(t *testing.T) { @@ -216,9 +184,6 @@ func TestClient(t *testing.T) { } func TestClientWaitClose(t *testing.T) { - routinesChecker := resources.NewGoroutinesChecker() - defer routinesChecker.Check(t) - makePipeline := func(settings Settings, qu queue.Queue) *Pipeline { p, err := New(beat.Info{}, Monitors{}, @@ -241,6 +206,9 @@ func TestClientWaitClose(t *testing.T) { defer pipeline.Close() t.Run("WaitClose blocks", func(t *testing.T) { + routinesChecker := resources.NewGoroutinesChecker() + defer routinesChecker.Check(t) + client, err := pipeline.ConnectWith(beat.ClientConfig{ WaitClose: 500 * time.Millisecond, }) @@ -272,6 +240,8 @@ func TestClientWaitClose(t *testing.T) { }) t.Run("ACKing events unblocks WaitClose", func(t *testing.T) { + routinesChecker := resources.NewGoroutinesChecker() + defer routinesChecker.Check(t) client, err := pipeline.ConnectWith(beat.ClientConfig{ WaitClose: time.Minute, }) @@ -344,9 +314,6 @@ func TestMonitoring(t *testing.T) { require.NoError(t, err) defer pipeline.Close() - metricsSnapshot := monitoring.CollectFlatSnapshot(metrics, monitoring.Full, true) - assert.Equal(t, int64(maxEvents), metricsSnapshot.Ints["pipeline.queue.max_events"]) - telemetrySnapshot := monitoring.CollectFlatSnapshot(telemetry, monitoring.Full, true) assert.Equal(t, "output_name", telemetrySnapshot.Strings["output.name"]) assert.Equal(t, int64(batchSize), telemetrySnapshot.Ints["output.batch_size"]) diff --git a/libbeat/publisher/pipeline/consumer.go b/libbeat/publisher/pipeline/consumer.go index 
1ff8c1bc95d..a7806a3ded2 100644 --- a/libbeat/publisher/pipeline/consumer.go +++ b/libbeat/publisher/pipeline/consumer.go @@ -31,8 +31,8 @@ import ( type eventConsumer struct { logger *logp.Logger - // eventConsumer calls the observer methods eventsRetry and eventsDropped. - observer outputObserver + // eventConsumer calls the retryObserver methods eventsRetry and eventsDropped. + retryObserver retryObserver // When the output changes, the new target is sent to the worker routine // on this channel. Clients should call eventConsumer.setTarget(). @@ -73,12 +73,12 @@ type retryRequest struct { func newEventConsumer( log *logp.Logger, - observer outputObserver, + observer retryObserver, ) *eventConsumer { c := &eventConsumer{ - logger: log, - observer: observer, - queueReader: makeQueueReader(), + logger: log, + retryObserver: observer, + queueReader: makeQueueReader(), targetChan: make(chan consumerTarget), retryChan: make(chan retryRequest), @@ -163,7 +163,7 @@ outerLoop: // Successfully sent a batch to the output workers if len(retryBatches) > 0 { // This was a retry, report it to the observer - c.observer.eventsRetry(len(active.Events())) + c.retryObserver.eventsRetry(len(active.Events())) retryBatches = retryBatches[1:] } else { // This was directly from the queue, clear the value so we can @@ -183,7 +183,7 @@ outerLoop: alive := req.batch.reduceTTL() countDropped := countFailed - len(req.batch.Events()) - c.observer.eventsDropped(countDropped) + c.retryObserver.eventsDropped(countDropped) if !alive { log.Info("Drop batch") diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index b34d6a64d2c..d7e07846e0c 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -19,6 +19,7 @@ package pipeline import ( "sync" + "time" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/reload" @@ -38,11 +39,6 @@ import ( type outputController struct { beat 
beat.Info monitors Monitors - observer outputObserver - - // If eventWaitGroup is non-nil, it will be decremented as the queue - // reports upstream acknowledgment of published events. - eventWaitGroup *sync.WaitGroup // The queue is not created until the outputController is assigned a // nonempty outputs.Group, in case the output group requests a proxy @@ -58,10 +54,15 @@ type outputController struct { // is called. queueFactory queue.QueueFactory + // consumer is a helper goroutine that reads event batches from the queue + // and sends them to workerChan for an output worker to process. + consumer *eventConsumer + + // Each worker is a goroutine that will read batches from workerChan and + // send them to the output. + workers []outputWorker workerChan chan publisher.Batch - consumer *eventConsumer - workers []outputWorker // The InputQueueSize can be set when the Beat is started, in // libbeat/cmd/instance/Settings we need to preserve that // value and pass it into the queue factory. The queue @@ -85,54 +86,42 @@ type outputWorker interface { func newOutputController( beat beat.Info, monitors Monitors, - observer outputObserver, - eventWaitGroup *sync.WaitGroup, + retryObserver retryObserver, queueFactory queue.QueueFactory, inputQueueSize int, ) (*outputController, error) { controller := &outputController{ beat: beat, monitors: monitors, - observer: observer, - eventWaitGroup: eventWaitGroup, queueFactory: queueFactory, workerChan: make(chan publisher.Batch), - consumer: newEventConsumer(monitors.Logger, observer), + consumer: newEventConsumer(monitors.Logger, retryObserver), inputQueueSize: inputQueueSize, } return controller, nil } -func (c *outputController) Close() error { +func (c *outputController) WaitClose(timeout time.Duration) error { + // First: signal the queue that we're shutting down, and wait up to the + // given duration for it to drain and process ACKs. 
+ c.closeQueue(timeout) + + // We've drained the queue as much as we can, signal eventConsumer to + // close, and wait for it to finish. After consumer.close returns, + // there will be no more writes to c.workerChan, so it is safe to close. c.consumer.close() close(c.workerChan) + // Signal the output workers to close. This step is a hint, and carries + // no guarantees. For example, on close the Elasticsearch output workers + // will close idle connections, but will not change any behavior for + // active connections, giving any remaining events a chance to ingest + // before we terminate. for _, out := range c.workers { out.Close() } - // Closing the queue stops ACKs from propagating, so we close everything - // else first to give it a chance to wait for any outstanding events to be - // acknowledged. - c.queueLock.Lock() - if c.queue != nil { - c.queue.Close() - } - for _, req := range c.pendingRequests { - // We can only end up here if there was an attempt to connect to the - // pipeline but it was shut down before any output was set. - // In this case, return nil and Pipeline.ConnectWith will pass on a - // real error to the caller. - // NOTE: under the current shutdown process, Pipeline.Close (and hence - // outputController.Close) is ~never called. So even if we did have - // blocked callers here, in a real shutdown they will never be woken - // up. But in hopes of a day when the shutdown process is more robust, - // I've decided to do the right thing here anyway. - req.responseChan <- nil - } - c.queueLock.Unlock() - return nil } @@ -203,6 +192,32 @@ func (c *outputController) Reload( return nil } +// Close the queue, waiting up to the specified timeout for pending events +// to complete. 
+func (c *outputController) closeQueue(timeout time.Duration) { + c.queueLock.Lock() + defer c.queueLock.Unlock() + if c.queue != nil { + c.queue.Close() + select { + case <-c.queue.Done(): + case <-time.After(timeout): + } + } + for _, req := range c.pendingRequests { + // We can only end up here if there was an attempt to connect to the + // pipeline but it was shut down before any output was set. + // In this case, return nil and Pipeline.ConnectWith will pass on a + // real error to the caller. + // NOTE: under the current shutdown process, Pipeline.Close (and hence + // outputController.Close) is ~never called. So even if we did have + // blocked callers here, in a real shutdown they will never be woken + // up. But in hopes of a day when the shutdown process is more robust, + // I've decided to do the right thing here anyway. + req.responseChan <- nil + } +} + // queueProducer creates a queue producer with the given config, blocking // until the queue is created if it does not yet exist. func (c *outputController) queueProducer(config queue.ProducerConfig) queue.Producer { @@ -233,16 +248,6 @@ func (c *outputController) queueProducer(config queue.ProducerConfig) queue.Prod return <-request.responseChan } -// onACK receives event acknowledgment notifications from the queue and -// forwards them to the metrics observer and the pipeline's global event -// wait group if one is set. 
-func (c *outputController) onACK(eventCount int) { - c.observer.queueACKed(eventCount) - if c.eventWaitGroup != nil { - c.eventWaitGroup.Add(-eventCount) - } -} - func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { logger := c.monitors.Logger if len(outGrp.Clients) == 0 { @@ -266,12 +271,21 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { if factory == nil { factory = c.queueFactory } + // Queue metrics are reported under the pipeline namespace. Assign with "=" (not ":=") so the outer pipelineMetrics is set instead of shadowed. + var pipelineMetrics *monitoring.Registry + if c.monitors.Metrics != nil { + pipelineMetrics = c.monitors.Metrics.GetRegistry("pipeline") + if pipelineMetrics == nil { + pipelineMetrics = c.monitors.Metrics.NewRegistry("pipeline") + } + } + queueObserver := queue.NewQueueObserver(pipelineMetrics) - queue, err := factory(logger, c.onACK, c.inputQueueSize, outGrp.EncoderFactory) + queue, err := factory(logger, queueObserver, c.inputQueueSize, outGrp.EncoderFactory) if err != nil { logger.Errorf("queue creation failed, falling back to default memory queue, check your queue configuration") s, _ := memqueue.SettingsForUserConfig(nil) - queue = memqueue.NewQueue(logger, c.onACK, s, c.inputQueueSize, outGrp.EncoderFactory) + queue = memqueue.NewQueue(logger, queueObserver, s, c.inputQueueSize, outGrp.EncoderFactory) } c.queue = queue @@ -279,8 +293,6 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { queueReg := c.monitors.Telemetry.NewRegistry("queue") monitoring.NewString(queueReg, "name").Set(c.queue.QueueType()) } - maxEvents := c.queue.BufferConfig().MaxEvents - c.observer.queueMaxEvents(maxEvents) // Now that we've created a queue, go through and unblock any callers // that are waiting for a producer.
diff --git a/libbeat/publisher/pipeline/controller_test.go b/libbeat/publisher/pipeline/controller_test.go index 6834af2c7f3..2e4f0df990f 100644 --- a/libbeat/publisher/pipeline/controller_test.go +++ b/libbeat/publisher/pipeline/controller_test.go @@ -150,9 +150,9 @@ func TestQueueCreatedOnlyAfterOutputExists(t *testing.T) { // We aren't testing the values sent to eventConsumer, we // just need a placeholder here so outputController can // send configuration updates without blocking. - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - observer: nilObserver, } // Set to an empty output group. This should not create a queue. controller.Set(outputs.Group{}) @@ -173,9 +173,9 @@ func TestOutputQueueFactoryTakesPrecedence(t *testing.T) { memqueue.Settings{Events: 1}, ), consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - observer: nilObserver, } controller.Set(outputs.Group{ Clients: []outputs.Client{newMockClient(nil)}, @@ -189,15 +189,15 @@ func TestOutputQueueFactoryTakesPrecedence(t *testing.T) { func TestFailedQueueFactoryRevertsToDefault(t *testing.T) { defaultSettings, _ := memqueue.SettingsForUserConfig(nil) - failedFactory := func(_ *logp.Logger, _ func(int), _ int, _ queue.EncoderFactory) (queue.Queue, error) { + failedFactory := func(_ *logp.Logger, _ queue.Observer, _ int, _ queue.EncoderFactory) (queue.Queue, error) { return nil, fmt.Errorf("This queue creation intentionally failed") } controller := outputController{ queueFactory: failedFactory, consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - observer: nilObserver, monitors: Monitors{ Logger: logp.NewLogger("tests"), }, @@ -213,9 +213,9 @@ func TestQueueProducerBlocksUntilOutputIsSet(t *testing.T) { controller := outputController{ 
queueFactory: memqueue.FactoryForSettings(memqueue.Settings{Events: 1}), consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - observer: nilObserver, } // Send producer requests from different goroutines. They should all // block, because there is no queue, but they should become unblocked diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index 0bc63a739f9..4a1e5ad76a1 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -18,15 +18,13 @@ package pipeline import ( - "math" - "github.com/elastic/elastic-agent-libs/monitoring" ) type observer interface { pipelineObserver clientObserver - outputObserver + retryObserver cleanup() } @@ -47,18 +45,14 @@ type clientObserver interface { publishedEvent() // An event was rejected by the queue failedPublishEvent() + eventsACKed(count int) } -type outputObserver interface { +type retryObserver interface { // Events encountered too many errors and were permanently dropped. eventsDropped(int) // Events were sent back to an output worker after an earlier failure. eventsRetry(int) - // The queue received acknowledgment for events from the output workers. - // (This may include events already reported via eventsDropped.) - queueACKed(n int) - // Report the maximum event count supported by the queue. 
- queueMaxEvents(n int) } // metricsObserver is used by many component in the publisher pipeline, to report @@ -165,24 +159,12 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() } func (o *metricsObserver) newEvent() { o.vars.eventsTotal.Inc() o.vars.activeEvents.Inc() - o.setPercentageFull() -} - -// setPercentageFull is used interally to set the `queue.full` metric -func (o *metricsObserver) setPercentageFull() { - maxEvt := o.vars.queueMaxEvents.Get() - if maxEvt != 0 { - pct := float64(o.vars.activeEvents.Get()) / float64(maxEvt) - pctRound := math.Round(pct/0.0005) * 0.0005 - o.vars.percentQueueFull.Set(pctRound) - } } // (client) event is filtered out (on purpose or failed) func (o *metricsObserver) filteredEvent() { o.vars.eventsFiltered.Inc() o.vars.activeEvents.Dec() - o.setPercentageFull() } // (client) managed to push an event into the publisher pipeline @@ -190,28 +172,15 @@ func (o *metricsObserver) publishedEvent() { o.vars.eventsPublished.Inc() } +// (client) number of ACKed events from this client +func (o *metricsObserver) eventsACKed(n int) { + o.vars.activeEvents.Sub(uint64(n)) +} + // (client) client closing down or DropIfFull is set func (o *metricsObserver) failedPublishEvent() { o.vars.eventsFailed.Inc() o.vars.activeEvents.Dec() - o.setPercentageFull() -} - -// -// queue events -// - -// (queue) number of events ACKed by the queue/broker in use -func (o *metricsObserver) queueACKed(n int) { - o.vars.queueACKed.Add(uint64(n)) - o.vars.activeEvents.Sub(uint64(n)) - o.setPercentageFull() -} - -// (queue) maximum queue event capacity -func (o *metricsObserver) queueMaxEvents(n int) { - o.vars.queueMaxEvents.Set(uint64(n)) - o.setPercentageFull() } // @@ -239,7 +208,6 @@ func (*emptyObserver) newEvent() {} func (*emptyObserver) filteredEvent() {} func (*emptyObserver) publishedEvent() {} func (*emptyObserver) failedPublishEvent() {} -func (*emptyObserver) queueACKed(n int) {} -func (*emptyObserver) queueMaxEvents(int) {} +func 
(*emptyObserver) eventsACKed(n int) {} func (*emptyObserver) eventsDropped(int) {} func (*emptyObserver) eventsRetry(int) {} diff --git a/libbeat/publisher/pipeline/pipeline.go b/libbeat/publisher/pipeline/pipeline.go index dbe87681ea6..a5a13a0584e 100644 --- a/libbeat/publisher/pipeline/pipeline.go +++ b/libbeat/publisher/pipeline/pipeline.go @@ -22,7 +22,6 @@ package pipeline import ( "fmt" - "sync" "time" "github.com/elastic/beats/v7/libbeat/beat" @@ -64,12 +63,9 @@ type Pipeline struct { observer observer - // wait close support. If eventWaitGroup is non-nil, then publishing - // an event through this pipeline will increment it and acknowledging - // a published event will decrement it, so the pipeline can wait on - // the group on shutdown to allow pending events to be acknowledged. + // If waitCloseTimeout is positive, then the pipeline will wait up to the + // specified time when it is closed for pending events to be acknowledged. waitCloseTimeout time.Duration - eventWaitGroup *sync.WaitGroup processors processing.Supporter } @@ -132,9 +128,7 @@ func New( processors: settings.Processors, } if settings.WaitCloseMode == WaitOnPipelineClose && settings.WaitClose > 0 { - // If wait-on-close is enabled, give the pipeline a WaitGroup for - // events that have been Published but not yet ACKed. 
- p.eventWaitGroup = &sync.WaitGroup{} + p.waitCloseTimeout = settings.WaitClose } if monitors.Metrics != nil { @@ -153,7 +147,7 @@ func New( return nil, err } - output, err := newOutputController(beat, monitors, p.observer, p.eventWaitGroup, queueFactory, settings.InputQueueSize) + output, err := newOutputController(beat, monitors, p.observer, queueFactory, settings.InputQueueSize) if err != nil { return nil, err } @@ -172,24 +166,8 @@ func (p *Pipeline) Close() error { log.Debug("close pipeline") - if p.eventWaitGroup != nil { - ch := make(chan struct{}) - go func() { - p.eventWaitGroup.Wait() - ch <- struct{}{} - }() - - select { - case <-ch: - // all events have been ACKed - - case <-time.After(p.waitCloseTimeout): - // timeout -> close pipeline with pending events - } - } - // Note: active clients are not closed / disconnected. - p.outputController.Close() + p.outputController.WaitClose(p.waitCloseTimeout) p.observer.cleanup() return nil @@ -238,20 +216,14 @@ func (p *Pipeline) ConnectWith(cfg beat.ClientConfig) (beat.Client, error) { processors: processors, eventFlags: eventFlags, canDrop: canDrop, - eventWaitGroup: p.eventWaitGroup, observer: p.observer, } ackHandler := cfg.EventListener - producerCfg := queue.ProducerConfig{} - var waiter *clientCloseWaiter if waitClose > 0 { waiter = newClientCloseWaiter(waitClose) - } - - if waiter != nil { if ackHandler == nil { ackHandler = waiter } else { @@ -259,9 +231,16 @@ func (p *Pipeline) ConnectWith(cfg beat.ClientConfig) (beat.Client, error) { } } - if ackHandler != nil { - producerCfg.ACK = ackHandler.ACKEvents - } else { + producerCfg := queue.ProducerConfig{ + ACK: func(count int) { + client.observer.eventsACKed(count) + if ackHandler != nil { + ackHandler.ACKEvents(count) + } + }, + } + + if ackHandler == nil { ackHandler = acker.Nil() } diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index 78725b043f1..a8cf34b895a 100644 --- 
a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -125,10 +125,6 @@ type testProducer struct { cancel func() } -func (q *testQueue) Metrics() (queue.Metrics, error) { - return queue.Metrics{}, nil -} - func (q *testQueue) Close() error { if q.close != nil { return q.close() @@ -136,6 +132,10 @@ func (q *testQueue) Close() error { return nil } +func (q *testQueue) Done() <-chan struct{} { + return nil +} + func (q *testQueue) QueueType() string { return "test" } diff --git a/libbeat/publisher/queue/diskqueue/consumer.go b/libbeat/publisher/queue/diskqueue/consumer.go index 55098b10fa8..20e6648d927 100644 --- a/libbeat/publisher/queue/diskqueue/consumer.go +++ b/libbeat/publisher/queue/diskqueue/consumer.go @@ -54,6 +54,13 @@ eventLoop: } } + // Check the batch size so we can report to the metrics observer + batchByteCount := 0 + for _, frame := range frames { + batchByteCount += int(frame.bytesOnDisk) + } + dq.observer.ConsumeEvents(len(frames), batchByteCount) + // There is a mild race condition here based on queue closure: events // written to readerLoop.output may have been buffered before the // queue was closed, and we may be reading its leftovers afterwards. diff --git a/libbeat/publisher/queue/diskqueue/consumer_test.go b/libbeat/publisher/queue/diskqueue/consumer_test.go new file mode 100644 index 00000000000..80378029be2 --- /dev/null +++ b/libbeat/publisher/queue/diskqueue/consumer_test.go @@ -0,0 +1,61 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package diskqueue + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/elastic/beats/v7/libbeat/publisher/queue" + "github.com/elastic/elastic-agent-libs/monitoring" +) + +func TestQueueGetObserver(t *testing.T) { + reg := monitoring.NewRegistry() + const eventCount = 50 + dq := diskQueue{ + observer: queue.NewQueueObserver(reg), + readerLoop: &readerLoop{ + output: make(chan *readFrame, eventCount), + }, + } + for i := 0; i < eventCount; i++ { + dq.readerLoop.output <- &readFrame{bytesOnDisk: 123} + } + _, err := dq.Get(eventCount) + assert.NoError(t, err, "Queue Get call should succeed") + assertRegistryUint(t, reg, "queue.consumed.events", eventCount, "Get call should report consumed events") + assertRegistryUint(t, reg, "queue.consumed.bytes", eventCount*123, "Get call should report consumed bytes") +} + +func assertRegistryUint(t *testing.T, reg *monitoring.Registry, key string, expected uint64, message string) { + t.Helper() + + entry := reg.Get(key) + if entry == nil { + assert.Failf(t, message, "registry key '%v' doesn't exist", key) + return + } + value, ok := reg.Get(key).(*monitoring.Uint) + if !ok { + assert.Failf(t, message, "registry key '%v' doesn't refer to a uint64", key) + return + } + assert.Equal(t, expected, value.Get(), message) +} diff --git a/libbeat/publisher/queue/diskqueue/core_loop.go b/libbeat/publisher/queue/diskqueue/core_loop.go index 93051dd4581..4f30a0e58ba 100644 --- a/libbeat/publisher/queue/diskqueue/core_loop.go +++ b/libbeat/publisher/queue/diskqueue/core_loop.go @@ -47,7 
+47,7 @@ func (dq *diskQueue) run() { // After receiving new ACKs, a segment might be ready to delete. dq.maybeDeleteACKed() - case <-dq.done: + case <-dq.close: dq.handleShutdown() return @@ -84,21 +84,10 @@ func (dq *diskQueue) run() { // If there were blocked producers waiting for more queue space, // we might be able to unblock them now. dq.maybeUnblockProducers() - - case metricsReq := <-dq.metricsRequestChan: - dq.handleMetricsRequest(metricsReq) } } } -// handleMetricsRequest responds to an event on the metricsRequestChan chan -func (dq *diskQueue) handleMetricsRequest(request metricsRequest) { - resp := metricsRequestResponse{ - sizeOnDisk: dq.segments.sizeOnDisk(), - } - request.response <- resp -} - func (dq *diskQueue) handleProducerWriteRequest(request producerWriteRequest) { // Pathological case checking: make sure the incoming frame isn't bigger // than an entire segment all by itself (as long as it isn't, it is @@ -122,6 +111,7 @@ func (dq *diskQueue) handleProducerWriteRequest(request producerWriteRequest) { // pending list and report success, then dispatch it to the // writer loop if no other requests are outstanding. dq.enqueueWriteFrame(request.frame) + dq.observer.AddEvent(int(request.frame.sizeOnDisk())) request.responseChan <- true } else { // The queue is too full. Either add the request to blockedProducers, @@ -186,6 +176,8 @@ func (dq *diskQueue) handleDeleterLoopResponse(response deleterLoopResponse) { dq.deleting = false newAckedSegments := []*queueSegment{} errors := []error{} + removedEventCount := 0 + removedByteCount := 0 for i, err := range response.results { if err != nil { // This segment had an error, so it stays in the acked list. 
@@ -193,8 +185,15 @@ func (dq *diskQueue) handleDeleterLoopResponse(response deleterLoopResponse) { errors = append(errors, fmt.Errorf("couldn't delete segment %d: %w", dq.segments.acked[i].id, err)) + } else { + removedEventCount += int(dq.segments.acked[i].frameCount) + // For the metrics observer, we (can) only report the size of the raw + // events, not the segment header, so subtract that here so it doesn't + // look like we're deleting more than was added in the first place. + removedByteCount += int(dq.segments.acked[i].byteCount - dq.segments.acked[i].headerSize()) } } + dq.observer.RemoveEvents(removedEventCount, removedByteCount) if len(dq.segments.acked) > len(response.results) { // Preserve any new acked segments that were added during the deletion // request. @@ -479,9 +478,13 @@ func (dq *diskQueue) canAcceptFrameOfSize(frameSize uint64) bool { return true } - // Compute the current queue size. We accept if there is enough capacity - // left in the queue after accounting for the existing segments and the - // pending writes that were already accepted. + // We accept if there is enough capacity left in the queue after accounting + // for the existing segments and the pending writes that were already + // accepted. 
+ return dq.currentSize()+frameSize <= dq.settings.MaxBufferSize +} + +func (dq *diskQueue) currentSize() uint64 { pendingBytes := uint64(0) for _, sf := range dq.pendingFrames { pendingBytes += sf.frame.sizeOnDisk() @@ -490,7 +493,5 @@ func (dq *diskQueue) canAcceptFrameOfSize(frameSize uint64) bool { if dq.writing { pendingBytes += dq.writeRequestSize } - currentSize := pendingBytes + dq.segments.sizeOnDisk() - - return currentSize+frameSize <= dq.settings.MaxBufferSize + return pendingBytes + dq.segments.sizeOnDisk() } diff --git a/libbeat/publisher/queue/diskqueue/core_loop_test.go b/libbeat/publisher/queue/diskqueue/core_loop_test.go index a48142655f9..5a0a35b367b 100644 --- a/libbeat/publisher/queue/diskqueue/core_loop_test.go +++ b/libbeat/publisher/queue/diskqueue/core_loop_test.go @@ -18,10 +18,13 @@ package diskqueue import ( + "errors" "fmt" "testing" + "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent-libs/monitoring" ) func TestHandleProducerWriteRequest(t *testing.T) { @@ -127,6 +130,7 @@ func TestHandleProducerWriteRequest(t *testing.T) { for description, test := range testCases { dq := &diskQueue{ logger: logp.L(), + observer: queue.NewQueueObserver(nil), settings: settings, segments: test.segments, } @@ -949,6 +953,65 @@ func TestCanAcceptFrameOfSize(t *testing.T) { } } +func TestObserverAddEvent(t *testing.T) { + // Check that write requests accepted by the queue are reported to its + // metrics observer. 
+ reg := monitoring.NewRegistry() + dq := diskQueue{ + settings: Settings{ + MaxBufferSize: 100000, + MaxSegmentSize: 1000, + WriteAheadLimit: 10, + }, + observer: queue.NewQueueObserver(reg), + } + eventFrame := &writeFrame{serialized: make([]byte, 123)} + request := producerWriteRequest{ + frame: eventFrame, + responseChan: make(chan bool, 1), + } + dq.handleProducerWriteRequest(request) + assertRegistryUint(t, reg, "queue.added.events", 1, "handleProducerWriteRequest should report the added event") + assertRegistryUint(t, reg, "queue.added.bytes", eventFrame.sizeOnDisk(), "handleProducerWriteRequest should report the added bytes") +} + +func TestObserverDeleteSegment(t *testing.T) { + // Check that the results of segment deletions are reported to the + // metrics observer. + reg := monitoring.NewRegistry() + dq := diskQueue{ + logger: logp.NewLogger("testing"), + observer: queue.NewQueueObserver(reg), + } + // Note the segment header size is added to the test values, because segment + // metadata isn't included in event metrics. + dq.segments.acked = []*queueSegment{ + { + frameCount: 50, + byteCount: 1234 + segmentHeaderSize, + }, + { + frameCount: 25, + byteCount: 567 + segmentHeaderSize, + }, + } + // Handle a deletion response of length 1, which means the second acked + // segment shouldn't be reported yet. 
+ dq.handleDeleterLoopResponse(deleterLoopResponse{results: []error{nil}}) + assertRegistryUint(t, reg, "queue.removed.events", 50, "Deleted events should be reported") + assertRegistryUint(t, reg, "queue.removed.bytes", 1234, "Deleted bytes should be reported") + + // Report an error, which should not change the metrics values + dq.handleDeleterLoopResponse(deleterLoopResponse{results: []error{errors.New("some error")}}) + assertRegistryUint(t, reg, "queue.removed.events", 50, "Failed deletion shouldn't report any removed events") + assertRegistryUint(t, reg, "queue.removed.bytes", 1234, "Failed deletion shouldn't report any removed bytes") + + // Now send a nil error, which should add the second segment to the metrics. + dq.handleDeleterLoopResponse(deleterLoopResponse{results: []error{nil}}) + assertRegistryUint(t, reg, "queue.removed.events", 50+25, "Deleted events should be reported") + assertRegistryUint(t, reg, "queue.removed.bytes", 1234+567, "Deleted bytes should be reported") +} + func boolRef(b bool) *bool { return &b } diff --git a/libbeat/publisher/queue/diskqueue/producer.go b/libbeat/publisher/queue/diskqueue/producer.go index 7d084adf5ea..c379ac40637 100644 --- a/libbeat/publisher/queue/diskqueue/producer.go +++ b/libbeat/publisher/queue/diskqueue/producer.go @@ -87,7 +87,7 @@ func (producer *diskQueueProducer) publish( // blocking the core loop. 
response := <-request.responseChan return response - case <-producer.queue.done: + case <-producer.queue.close: return false case <-producer.done: return false diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index 4fedcfa6a6e..e07c0185ade 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -20,13 +20,10 @@ package diskqueue import ( "errors" "fmt" - "io" "os" - "sync" "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/opt" ) // The string used to specify this queue in beats configurations. @@ -36,6 +33,7 @@ const QueueType = "disk" // of queue.Queue. type diskQueue struct { logger *logp.Logger + observer queue.Observer settings Settings // Metadata related to the segment files. @@ -50,10 +48,6 @@ type diskQueue struct { writerLoop *writerLoop deleterLoop *deleterLoop - // Wait group for shutdown of the goroutines associated with this queue: - // reader loop, writer loop, deleter loop, and core loop (diskQueue.run()). - waitGroup sync.WaitGroup - // writing is true if the writer loop is processing a request, false // otherwise. writing bool @@ -74,9 +68,6 @@ type diskQueue struct { // The API channel used by diskQueueProducer to write events. producerWriteRequestChan chan producerWriteRequest - // API channel used by the public Metrics() API to request queue metrics - metricsRequestChan chan metricsRequest - // pendingFrames is a list of all incoming data frames that have been // accepted by the queue and are waiting to be sent to the writer loop. // Segment ids in this list always appear in sorted order, even between @@ -88,18 +79,13 @@ type diskQueue struct { // waiting for free space in the queue. blockedProducers []producerWriteRequest - // The channel to signal our goroutines to shut down. 
- done chan struct{} -} - -// channel request for metrics from an external client -type metricsRequest struct { - response chan metricsRequestResponse -} + // The channel to signal our goroutines to shut down, used by + // (*diskQueue).Close. + close chan struct{} -// metrics response from the disk queue -type metricsRequestResponse struct { - sizeOnDisk uint64 + // The channel to report that shutdown is finished, used by + // (*diskQueue).Done. + done chan struct{} } // FactoryForSettings is a simple wrapper around NewQueue so a concrete @@ -108,11 +94,11 @@ type metricsRequestResponse struct { func FactoryForSettings(settings Settings) queue.QueueFactory { return func( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, inputQueueSize int, encoderFactory queue.EncoderFactory, ) (queue.Queue, error) { - return NewQueue(logger, ackCallback, settings, encoderFactory) + return NewQueue(logger, observer, settings, encoderFactory) } } @@ -120,13 +106,16 @@ func FactoryForSettings(settings Settings) queue.QueueFactory { // and settings, creating it if it doesn't exist. func NewQueue( logger *logp.Logger, - writeToDiskCallback func(eventCount int), + observer queue.Observer, settings Settings, encoderFactory queue.EncoderFactory, ) (*diskQueue, error) { logger = logger.Named("diskqueue") logger.Debugf( "Initializing disk queue at path %v", settings.directoryPath()) + if observer == nil { + observer = queue.NewQueueObserver(nil) + } if settings.MaxBufferSize > 0 && settings.MaxBufferSize < settings.MaxSegmentSize*2 { @@ -135,6 +124,7 @@ func NewQueue( "twice the segment size (%v)", settings.MaxBufferSize, settings.MaxSegmentSize) } + observer.MaxBytes(int(settings.MaxBufferSize)) // Create the given directory path if it doesn't exist. 
err := os.MkdirAll(settings.directoryPath(), os.ModePerm) @@ -182,6 +172,15 @@ func NewQueue( lastID := initialSegments[len(initialSegments)-1].id nextSegmentID = lastID + 1 } + // Check the initial contents to report to the metrics observer. + initialEventCount := 0 + initialByteCount := 0 + for _, segment := range initialSegments { + initialEventCount += int(segment.frameCount) + // Event metrics for the queue observer don't include segment header size + initialByteCount += int(segment.byteCount - segment.headerSize()) + } + observer.Restore(initialEventCount, initialByteCount) // If any of the initial segments are older than the current queue // position, move them directly to the acked list where they can be @@ -199,20 +198,13 @@ func NewQueue( nextReadPosition = queuePosition{segmentID: initialSegments[0].id} } - // We can compute the active frames right now but still need a way to report - // them to the global beat metrics. For now, just log the total. - // Note that for consistency with existing queue behavior, this excludes - // events that are still present on disk but were already sent and - // acknowledged on a previous run (we probably want to track these as well - // in the future.) - //nolint:godox // Ignore This - // TODO: pass in a context that queues can use to report these events.
+ // Count just the active events to report in the log activeFrameCount := 0 for _, segment := range initialSegments { activeFrameCount += int(segment.frameCount) } activeFrameCount -= int(nextReadPosition.frameIndex) - logger.Infof("Found %d existing events on queue start", activeFrameCount) + logger.Infof("Found %v queued events consuming %v bytes, %v events still pending", initialEventCount, initialByteCount, activeFrameCount) var encoder queue.Encoder if encoderFactory != nil { @@ -221,6 +213,7 @@ func NewQueue( queue := &diskQueue{ logger: logger, + observer: observer, settings: settings, segments: diskQueueSegments{ @@ -233,36 +226,20 @@ func NewQueue( acks: newDiskQueueACKs(logger, nextReadPosition, positionFile), readerLoop: newReaderLoop(settings, encoder), - writerLoop: newWriterLoop(logger, writeToDiskCallback, settings), + writerLoop: newWriterLoop(logger, settings), deleterLoop: newDeleterLoop(settings), producerWriteRequestChan: make(chan producerWriteRequest), - metricsRequestChan: make(chan metricsRequest), - done: make(chan struct{}), + close: make(chan struct{}), + done: make(chan struct{}), } - // We wait for four goroutines on shutdown: core loop, reader loop, - // writer loop, deleter loop. - queue.waitGroup.Add(4) - // Start the goroutines and return the queue! - go func() { - queue.readerLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.writerLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.deleterLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.run() - queue.waitGroup.Done() - }() + go queue.readerLoop.run() + go queue.writerLoop.run() + go queue.deleterLoop.run() + go queue.run() return queue, nil } @@ -274,12 +251,15 @@ func NewQueue( func (dq *diskQueue) Close() error { // Closing the done channel signals to the core loop that it should // shut down the other helper goroutines and wrap everything up. 
- close(dq.done) - dq.waitGroup.Wait() + close(dq.close) return nil } +func (dq *diskQueue) Done() <-chan struct{} { + return dq.done +} + func (dq *diskQueue) QueueType() string { return QueueType } @@ -296,29 +276,3 @@ func (dq *diskQueue) Producer(cfg queue.ProducerConfig) queue.Producer { done: make(chan struct{}), } } - -// Metrics returns current disk metrics -func (dq *diskQueue) Metrics() (queue.Metrics, error) { - respChan := make(chan metricsRequestResponse, 1) - req := metricsRequest{response: respChan} - - select { - case <-dq.done: - return queue.Metrics{}, io.EOF - case dq.metricsRequestChan <- req: - - } - - resp := metricsRequestResponse{} - select { - case <-dq.done: - return queue.Metrics{}, io.EOF - case resp = <-respChan: - } - - maxSize := dq.settings.MaxBufferSize - return queue.Metrics{ - ByteLimit: opt.UintWith(maxSize), - ByteCount: opt.UintWith(resp.sizeOnDisk), - }, nil -} diff --git a/libbeat/publisher/queue/diskqueue/queue_test.go b/libbeat/publisher/queue/diskqueue/queue_test.go index f6a4c406ed3..30c770e45a4 100644 --- a/libbeat/publisher/queue/diskqueue/queue_test.go +++ b/libbeat/publisher/queue/diskqueue/queue_test.go @@ -28,9 +28,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/beats/v7/libbeat/publisher/queue/queuetest" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/mapstr" - - "github.com/stretchr/testify/require" ) var seed int64 @@ -78,44 +75,6 @@ func TestProduceConsumer(t *testing.T) { t.Run("direct", testWith(makeTestQueue())) } -func TestMetrics(t *testing.T) { - dir, err := ioutil.TempDir("", "diskqueue_metrics") - defer func() { - _ = os.RemoveAll(dir) - }() - require.NoError(t, err) - settings := DefaultSettings() - settings.Path = dir - // lower max segment size so we can get multiple segments - settings.MaxSegmentSize = 100 - - testQueue, err := NewQueue(logp.L(), nil, settings, nil) - require.NoError(t, err) - defer testQueue.Close() - - 
eventsToTest := 100 - - // Send events to queue - producer := testQueue.Producer(queue.ProducerConfig{}) - sendEventsToQueue(eventsToTest, producer) - - // fetch metrics before we read any events - time.Sleep(time.Millisecond * 500) - testMetrics, err := testQueue.Metrics() - require.NoError(t, err) - - require.Equal(t, testMetrics.ByteLimit.ValueOr(0), uint64((1 << 30))) - require.NotZero(t, testMetrics.ByteCount.ValueOr(0)) - t.Logf("got %d bytes written", testMetrics.ByteCount.ValueOr(0)) - -} - -func sendEventsToQueue(count int, prod queue.Producer) { - for i := 0; i < count; i++ { - prod.Publish(queuetest.MakeEvent(mapstr.M{"count": i})) - } -} - func makeTestQueue() queuetest.QueueFactory { return func(t *testing.T) queue.Queue { dir, err := ioutil.TempDir("", "diskqueue_test") diff --git a/libbeat/publisher/queue/diskqueue/segments.go b/libbeat/publisher/queue/diskqueue/segments.go index 0460fc4431a..7e3661f6e5b 100644 --- a/libbeat/publisher/queue/diskqueue/segments.go +++ b/libbeat/publisher/queue/diskqueue/segments.go @@ -94,10 +94,7 @@ type queueSegment struct { // If this segment was loaded from a previous session, schemaVersion // points to the file schema version that was read from its header. // This is only used by queueSegment.headerSize(), which is used in - // maybeReadPending to calculate the position of the first data frame, - // and by queueSegment.shouldUseJSON(), which is used in the reader - // loop to detect old segments that used JSON encoding instead of - // the current CBOR. + // maybeReadPending to calculate the position of the first data frame. 
schemaVersion *uint32 // The number of bytes occupied by this segment on-disk, as of the most diff --git a/libbeat/publisher/queue/diskqueue/writer_loop.go b/libbeat/publisher/queue/diskqueue/writer_loop.go index c0e7103c41b..72cfb04642e 100644 --- a/libbeat/publisher/queue/diskqueue/writer_loop.go +++ b/libbeat/publisher/queue/diskqueue/writer_loop.go @@ -71,10 +71,6 @@ type writerLoop struct { // The logger for the writer loop, assigned when the queue creates it. logger *logp.Logger - // A callback that, if set, should be invoked with an event count when - // events are successfully written to disk. - writeToDiskCallback func(eventCount int) - // The writer loop listens on requestChan for frames to write, and // writes them to disk immediately (all queue capacity checking etc. is // done by the core loop before sending it to the writer). @@ -102,14 +98,12 @@ type writerLoop struct { func newWriterLoop( logger *logp.Logger, - writeToDiskCallback func(eventCount int), settings Settings, ) *writerLoop { buffer := &bytes.Buffer{} return &writerLoop{ - logger: logger, - writeToDiskCallback: writeToDiskCallback, - settings: settings, + logger: logger, + settings: settings, requestChan: make(chan writerLoopRequest, 1), responseChan: make(chan writerLoopResponse), @@ -243,11 +237,6 @@ outerLoop: // Try to sync the written data to disk. _ = wl.outputFile.Sync() - // If the queue has an ACK listener, notify it the frames were written. - if wl.writeToDiskCallback != nil { - wl.writeToDiskCallback(totalACKCount) - } - // Notify any producers with ACK listeners that their frames were written. 
for producer, ackCount := range producerACKCounts { producer.config.ACK(ackCount) diff --git a/libbeat/publisher/queue/memqueue/ackloop.go b/libbeat/publisher/queue/memqueue/ackloop.go index 1a964d8bb45..9432bd5af19 100644 --- a/libbeat/publisher/queue/memqueue/ackloop.go +++ b/libbeat/publisher/queue/memqueue/ackloop.go @@ -67,10 +67,6 @@ func (l *ackLoop) handleBatchSig() int { } if count > 0 { - if callback := l.broker.ackCallback; callback != nil { - callback(count) - } - // report acks to waiting clients l.processACK(ackedBatches, count) } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index d9aff10bd3a..b617bae6110 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -25,7 +25,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/opt" ) // The string used to specify this queue in beats configurations. @@ -66,9 +65,8 @@ type broker struct { // Consumers send requests to getChan to read events from the queue. getChan chan getRequest - // Metrics() sends requests to metricChan to expose internal queue - // metrics to external callers. - metricChan chan metricsRequest + // Close triggers a queue close by sending to closeChan. + closeChan chan struct{} /////////////////////////// // internal channels @@ -77,18 +75,16 @@ type broker struct { // through this channel so ackLoop can monitor them for acknowledgments. consumedChan chan batchList - // ackCallback is a configurable callback to invoke when ACKs are processed. - // ackLoop calls this function when it advances the consumer ACK position. - // Right now this forwards the notification to queueACKed() in - // the pipeline observer, which updates the beats registry if needed. 
- ackCallback func(eventCount int) - // When batches are acknowledged, ackLoop saves any metadata needed // for producer callbacks and such, then notifies runLoop that it's // safe to free these events and advance the queue by sending the // acknowledged event count to this channel. deleteChan chan int + // closingChan is closed when the queue has processed a close request. + // It's used to prevent producers from blocking on a closing queue. + closingChan chan struct{} + /////////////////////////////// // internal goroutine state @@ -112,8 +108,9 @@ type Settings struct { } type queueEntry struct { - event queue.Entry - id queue.EntryID + event queue.Entry + eventSize int + id queue.EntryID producer *ackProducer producerID producerID // The order of this entry within its producer @@ -144,11 +141,11 @@ type batchList struct { func FactoryForSettings(settings Settings) queue.QueueFactory { return func( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, inputQueueSize int, encoderFactory queue.EncoderFactory, ) (queue.Queue, error) { - return NewQueue(logger, ackCallback, settings, inputQueueSize, encoderFactory), nil + return NewQueue(logger, observer, settings, inputQueueSize, encoderFactory), nil } } @@ -157,12 +154,12 @@ func FactoryForSettings(settings Settings) queue.QueueFactory { // workers handling incoming messages and ACKs have been shut down. func NewQueue( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, ) *broker { - b := newQueue(logger, ackCallback, settings, inputQueueSize, encoderFactory) + b := newQueue(logger, observer, settings, inputQueueSize, encoderFactory) // Start the queue workers b.wg.Add(2) @@ -184,11 +181,14 @@ func NewQueue( // when the workers are active. 
func newQueue( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, ) *broker { + if observer == nil { + observer = queue.NewQueueObserver(nil) + } chanSize := AdjustInputQueueSize(inputQueueSize, settings.Events) // Backwards compatibility: an old way to select synchronous queue @@ -218,29 +218,34 @@ func newQueue( encoderFactory: encoderFactory, // broker API channels - pushChan: make(chan pushRequest, chanSize), - getChan: make(chan getRequest), - metricChan: make(chan metricsRequest), + pushChan: make(chan pushRequest, chanSize), + getChan: make(chan getRequest), + closeChan: make(chan struct{}), // internal runLoop and ackLoop channels consumedChan: make(chan batchList), deleteChan: make(chan int), - - ackCallback: ackCallback, + closingChan: make(chan struct{}), } b.ctx, b.ctxCancel = context.WithCancel(context.Background()) - b.runLoop = newRunLoop(b) + b.runLoop = newRunLoop(b, observer) b.ackLoop = newACKLoop(b) + observer.MaxEvents(settings.Events) + return b } func (b *broker) Close() error { - b.ctxCancel() + b.closeChan <- struct{}{} return nil } +func (b *broker) Done() <-chan struct{} { + return b.ctx.Done() +} + func (b *broker) QueueType() string { return QueueType } @@ -276,25 +281,6 @@ func (b *broker) Get(count int) (queue.Batch, error) { return resp, nil } -func (b *broker) Metrics() (queue.Metrics, error) { - - responseChan := make(chan memQueueMetrics, 1) - select { - case <-b.ctx.Done(): - return queue.Metrics{}, io.EOF - case b.metricChan <- metricsRequest{ - responseChan: responseChan}: - } - resp := <-responseChan - - return queue.Metrics{ - EventCount: opt.UintWith(uint64(resp.currentQueueSize)), - EventLimit: opt.UintWith(uint64(len(b.buf))), - UnackedConsumedEvents: opt.UintWith(uint64(resp.occupiedRead)), - OldestEntryID: resp.oldestEntryID, - }, nil -} - var batchPool = sync.Pool{ New: func() interface{} { return &batch{ diff 
--git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index 6575472edbd..0d983de6520 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ b/libbeat/publisher/queue/memqueue/internal_api.go @@ -46,19 +46,3 @@ type getRequest struct { } type batchDoneMsg struct{} - -// Metrics API - -type metricsRequest struct { - responseChan chan memQueueMetrics -} - -// memQueueMetrics tracks metrics that are returned by the individual memory queue implementations -type memQueueMetrics struct { - // the size of items in the queue - currentQueueSize int - // the number of items that have been read by a consumer but not yet ack'ed - occupiedRead int - - oldestEntryID queue.EntryID -} diff --git a/libbeat/publisher/queue/memqueue/produce.go b/libbeat/publisher/queue/memqueue/produce.go index a206e357aac..0ecabfe77f0 100644 --- a/libbeat/publisher/queue/memqueue/produce.go +++ b/libbeat/publisher/queue/memqueue/produce.go @@ -35,11 +35,11 @@ type ackProducer struct { } type openState struct { - log *logp.Logger - done chan struct{} - queueDone <-chan struct{} - events chan pushRequest - encoder queue.Encoder + log *logp.Logger + done chan struct{} + queueClosing <-chan struct{} + events chan pushRequest + encoder queue.Encoder } // producerID stores the order of events within a single producer, so multiple @@ -57,11 +57,11 @@ type ackHandler func(count int) func newProducer(b *broker, cb ackHandler, encoder queue.Encoder) queue.Producer { openState := openState{ - log: b.logger, - done: make(chan struct{}), - queueDone: b.ctx.Done(), - events: b.pushChan, - encoder: encoder, + log: b.logger, + done: make(chan struct{}), + queueClosing: b.closingChan, + events: b.pushChan, + encoder: encoder, } if cb != nil { @@ -141,14 +141,14 @@ func (st *openState) publish(req pushRequest) (queue.EntryID, bool) { select { case resp := <-req.resp: return resp, true - case <-st.queueDone: + case <-st.queueClosing: st.events = nil 
return 0, false } case <-st.done: st.events = nil return 0, false - case <-st.queueDone: + case <-st.queueClosing: st.events = nil return 0, false } @@ -169,7 +169,7 @@ func (st *openState) tryPublish(req pushRequest) (queue.EntryID, bool) { select { case resp := <-req.resp: return resp, true - case <-st.queueDone: + case <-st.queueClosing: st.events = nil return 0, false } diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index 5ebf6b6f6fb..9cd209bbd51 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -32,7 +32,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/beats/v7/libbeat/publisher/queue/queuetest" - "github.com/elastic/elastic-agent-libs/mapstr" ) var seed int64 @@ -228,69 +227,6 @@ func TestProducerClosePreservesEventCount(t *testing.T) { assert.False(t, activeEvents.Load() < 0, "active event count should never be negative") } -func TestQueueMetricsDirect(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - - // Test the directEventLoop - directSettings := Settings{ - Events: maxEvents, - MaxGetRequest: 1, - FlushTimeout: 0, - } - t.Logf("Testing directEventLoop") - queueTestWithSettings(t, directSettings, eventsToTest, "directEventLoop") - -} - -func TestQueueMetricsBuffer(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - // Test Buffered Event Loop - bufferedSettings := Settings{ - Events: maxEvents, - MaxGetRequest: eventsToTest, // The buffered event loop can only return FlushMinEvents per Get() - FlushTimeout: time.Millisecond, - } - t.Logf("Testing bufferedEventLoop") - queueTestWithSettings(t, bufferedSettings, eventsToTest, "bufferedEventLoop") -} - -func queueTestWithSettings(t *testing.T, settings Settings, eventsToTest int, testName string) { - testQueue := NewQueue(nil, nil, settings, 0, nil) - defer testQueue.Close() - - // Send events to queue - producer := 
testQueue.Producer(queue.ProducerConfig{}) - for i := 0; i < eventsToTest; i++ { - producer.Publish(queuetest.MakeEvent(mapstr.M{"count": i})) - } - queueMetricsAreValid(t, testQueue, 5, settings.Events, 0, fmt.Sprintf("%s - First send of metrics to queue", testName)) - - // Read events, don't yet ack them - batch, err := testQueue.Get(eventsToTest) - assert.NoError(t, err, "error in Get") - t.Logf("Got batch of %d events", batch.Count()) - - queueMetricsAreValid(t, testQueue, 5, settings.Events, 5, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - - // Test metrics after ack - batch.Done() - - queueMetricsAreValid(t, testQueue, 0, settings.Events, 0, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - -} - -func queueMetricsAreValid(t *testing.T, q queue.Queue, evtCount, evtLimit, occupied int, test string) { - // wait briefly to avoid races across all the queue channels - time.Sleep(time.Millisecond * 100) - testMetrics, err := q.Metrics() - assert.NoError(t, err, "error calling metrics for test %s", test) - assert.Equal(t, testMetrics.EventCount.ValueOr(0), uint64(evtCount), "incorrect EventCount for %s", test) - assert.Equal(t, testMetrics.EventLimit.ValueOr(0), uint64(evtLimit), "incorrect EventLimit for %s", test) - assert.Equal(t, testMetrics.UnackedConsumedEvents.ValueOr(0), uint64(occupied), "incorrect OccupiedRead for %s", test) -} - func makeTestQueue(sz, minEvents int, flushTimeout time.Duration) queuetest.QueueFactory { return func(_ *testing.T) queue.Queue { return NewQueue(nil, nil, Settings{ @@ -326,163 +262,3 @@ func TestAdjustInputQueueSize(t *testing.T) { assert.Equal(t, int(float64(mainQueue)*maxInputQueueSizeRatio), AdjustInputQueueSize(mainQueue, mainQueue)) }) } - -func TestEntryIDs(t *testing.T) { - entryCount := 100 - - testForward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := 
producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i), - fmt.Sprintf("Oldest entry ID before ACKing event %v should be %v", i, i)) - - batch.Done() - waiter.waitForEvents(1) - metrics, err = q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i+1), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be %v", i, i+1)) - - } - } - - testBackward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - batches := []queue.Batch{} - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - batches = append(batches, batch) - } - - for i := entryCount - 1; i > 0; i-- { - batches[i].Done() - - // It's hard to remove this delay since the Done signal is propagated - // asynchronously to the queue, and since this test is ensuring that the - // queue _doesn't_ advance we can't use a callback to gate the comparison - // like we do in testForward. 
However: - // - While this race condition could sometimes let a buggy implementation - // pass, it will not produce a false failure (so it won't contribute - // to general test flakiness) - // - That notwithstanding, when the ACK _does_ cause an incorrect - // metrics update, this delay is enough to recognize it approximately - // 100% of the time, so this test is still a good signal despite - // the slight nondeterminism. - time.Sleep(1 * time.Millisecond) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(0), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be 0", i)) - } - // ACK the first batch, which should unblock all the later ones - batches[0].Done() - waiter.waitForEvents(100) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(100), - fmt.Sprintf("Oldest entry ID after ACKing event 0 should be %v", queue.EntryID(entryCount))) - - } - - t.Run("acking in forward order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testBackward(testQueue) - }) - - t.Run("acking in forward order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testBackward(testQueue) - }) -} - -// 
producerACKWaiter is a helper that can listen to queue producer callbacks -// and wait on them from the test thread, so we can test the queue's asynchronous -// behavior without relying on time.Sleep. -type producerACKWaiter struct { - sync.Mutex - - // The number of acks received from a producer callback. - acked int - - // The number of acks that callers have waited for in waitForEvents. - waited int - - // When non-nil, this channel is being listened to by a test thread - // blocking on ACKs, and incoming producer callbacks are forwarded - // to it. - ackChan chan int -} - -func (w *producerACKWaiter) ack(count int) { - w.Lock() - defer w.Unlock() - w.acked += count - if w.ackChan != nil { - w.ackChan <- count - } -} - -func (w *producerACKWaiter) waitForEvents(count int) { - w.Lock() - defer w.Unlock() - if w.ackChan != nil { - panic("don't call producerACKWaiter.waitForEvents from multiple goroutines") - } - - avail := w.acked - w.waited - if count <= avail { - w.waited += count - return - } - w.waited = w.acked - count -= avail - // We have advanced as far as we can, we have to wait for - // more incoming ACKs. - // Set a listener and unlock, so ACKs can come in on another - // goroutine. - w.ackChan = make(chan int) - w.Unlock() - - newAcked := 0 - for newAcked < count { - newAcked += <-w.ackChan - } - // When we're done, turn off the listener channel and update - // the number of events waited on. - w.Lock() - w.ackChan = nil - w.waited += count -} diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index ed14106f20c..397b41a25e8 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -29,6 +29,9 @@ import ( type runLoop struct { broker *broker + // observer is a metrics observer used to report internal queue state. + observer queue.Observer + // The index of the beginning of the current ring buffer within its backing // array. 
If the queue isn't empty, bufPos points to the oldest remaining // event. @@ -57,13 +60,18 @@ type runLoop struct { // It is active if and only if pendingGetRequest is non-nil. getTimer *time.Timer + // closing is set when a close request is received. Once closing is true, + // the queue will not accept any new events, but will continue responding + // to Gets and Acks to allow pending events to complete on shutdown. + closing bool + // TODO (https://github.com/elastic/beats/issues/37893): entry IDs were a // workaround for an external project that no longer exists. At this point // they just complicate the API and should be removed. nextEntryID queue.EntryID } -func newRunLoop(broker *broker) *runLoop { +func newRunLoop(broker *broker, observer queue.Observer) *runLoop { var timer *time.Timer // Create the timer we'll use for get requests, but stop it until a @@ -76,6 +84,7 @@ func newRunLoop(broker *broker) *runLoop { } return &runLoop{ broker: broker, + observer: observer, getTimer: timer, } } @@ -90,8 +99,8 @@ func (l *runLoop) run() { // standalone helper function to allow testing of loop invariants. func (l *runLoop) runIteration() { var pushChan chan pushRequest - // Push requests are enabled if the queue isn't yet full. - if l.eventCount < len(l.broker.buf) { + // Push requests are enabled if the queue isn't full or closing. 
+ if l.eventCount < len(l.broker.buf) && !l.closing { pushChan = l.broker.pushChan } @@ -116,7 +125,14 @@ func (l *runLoop) runIteration() { } select { + case <-l.broker.closeChan: + l.closing = true + close(l.broker.closingChan) + // Get requests are handled immediately during shutdown + l.maybeUnblockGetRequest() + case <-l.broker.ctx.Done(): + // The queue is fully shut down, do nothing return case req := <-pushChan: // producer pushing new event @@ -133,9 +149,6 @@ func (l *runLoop) runIteration() { case count := <-l.broker.deleteChan: l.handleDelete(count) - case req := <-l.broker.metricChan: // asking broker for queue metrics - l.handleMetricsRequest(&req) - case <-timeoutChan: // The get timer has expired, handle the blocked request l.getTimer.Stop() @@ -157,8 +170,8 @@ func (l *runLoop) handleGetRequest(req *getRequest) { } func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { - if l.broker.settings.FlushTimeout <= 0 { - // Never block if the flush timeout isn't positive + if l.broker.settings.FlushTimeout <= 0 || l.closing { + // Never block if the flush timeout isn't positive, or during shutdown return false } eventsAvailable := l.eventCount - l.consumedCount @@ -177,18 +190,34 @@ func (l *runLoop) handleGetReply(req *getRequest) { startIndex := l.bufPos + l.consumedCount batch := newBatch(l.broker, startIndex, batchSize) + batchBytes := 0 + for i := 0; i < batchSize; i++ { + batchBytes += batch.rawEntry(i).eventSize + } + // Send the batch to the caller and update internal state req.responseChan <- batch l.consumedBatches.append(batch) l.consumedCount += batchSize + l.observer.ConsumeEvents(batchSize, batchBytes) } func (l *runLoop) handleDelete(count int) { + byteCount := 0 + for i := 0; i < count; i++ { + entry := l.broker.buf[(l.bufPos+i)%len(l.broker.buf)] + byteCount += entry.eventSize + } // Advance position and counters. Event data was already cleared in // batch.FreeEntries when the events were vended. 
l.bufPos = (l.bufPos + count) % len(l.broker.buf) l.eventCount -= count l.consumedCount -= count + l.observer.RemoveEvents(count, byteCount) + if l.closing && l.eventCount == 0 { + // Our last events were acknowledged during shutdown, signal final shutdown + l.broker.ctxCancel() + } } func (l *runLoop) handleInsert(req *pushRequest) { @@ -208,8 +237,7 @@ func (l *runLoop) maybeUnblockGetRequest() { // If a get request is blocked waiting for more events, check if // we should unblock it. if getRequest := l.pendingGetRequest; getRequest != nil { - available := l.eventCount - l.consumedCount - if available >= getRequest.entryCount { + if !l.getRequestShouldBlock(getRequest) { l.pendingGetRequest = nil if !l.getTimer.Stop() { <-l.getTimer.C @@ -223,22 +251,10 @@ func (l *runLoop) insert(req *pushRequest, id queue.EntryID) { index := (l.bufPos + l.eventCount) % len(l.broker.buf) l.broker.buf[index] = queueEntry{ event: req.event, + eventSize: req.eventSize, id: id, producer: req.producer, producerID: req.producerID, } -} - -func (l *runLoop) handleMetricsRequest(req *metricsRequest) { - oldestEntryID := l.nextEntryID - if l.eventCount > 0 { - index := l.bufPos % len(l.broker.buf) - oldestEntryID = l.broker.buf[index].id - } - - req.responseChan <- memQueueMetrics{ - currentQueueSize: l.eventCount, - occupiedRead: l.consumedCount, - oldestEntryID: oldestEntryID, - } + l.observer.AddEvent(req.eventSize) } diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index 266704fc1fd..f6c83e8fec0 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -18,13 +18,17 @@ package memqueue import ( + "context" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/elastic/beats/v7/libbeat/publisher" + "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" + 
"github.com/elastic/elastic-agent-libs/monitoring" ) func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { @@ -112,3 +116,88 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { assert.Nil(t, rl.pendingGetRequest, "Queue should have no pending get request since adding an event should unblock the previous one") assert.Equal(t, 101, rl.consumedCount, "Queue should have a consumedCount of 101 after adding an event unblocked the pending get request") } + +func TestObserverAddEvent(t *testing.T) { + // Confirm that an entry inserted into the queue is reported in + // queue.added.events and queue.added.bytes. + reg := monitoring.NewRegistry() + rl := &runLoop{ + observer: queue.NewQueueObserver(reg), + broker: &broker{ + buf: make([]queueEntry, 100), + }, + } + request := &pushRequest{ + event: publisher.Event{}, + eventSize: 123, + } + rl.insert(request, 0) + assertRegistryUint(t, reg, "queue.added.events", 1, "Queue insert should report added event") + assertRegistryUint(t, reg, "queue.added.bytes", 123, "Queue insert should report added bytes") +} + +func TestObserverConsumeEvents(t *testing.T) { + // Confirm that event batches sent to the output are reported in + // queue.consumed.events and queue.consumed.bytes. + reg := monitoring.NewRegistry() + rl := &runLoop{ + observer: queue.NewQueueObserver(reg), + broker: &broker{ + buf: make([]queueEntry, 100), + }, + eventCount: 50, + } + // Initialize the queue entries to a test byte size + for i := range rl.broker.buf { + rl.broker.buf[i].eventSize = 123 + } + request := &getRequest{ + entryCount: len(rl.broker.buf), + responseChan: make(chan *batch, 1), + } + rl.handleGetReply(request) + // We should have gotten back 50 events, everything in the queue, so we expect the size + // to be 50 * 123. 
+ assertRegistryUint(t, reg, "queue.consumed.events", 50, "Sending a batch to a Get caller should report the consumed events") + assertRegistryUint(t, reg, "queue.consumed.bytes", 50*123, "Sending a batch to a Get caller should report the consumed bytes") +} + +func TestObserverRemoveEvents(t *testing.T) { + reg := monitoring.NewRegistry() + rl := &runLoop{ + observer: queue.NewQueueObserver(reg), + broker: &broker{ + ctx: context.Background(), + buf: make([]queueEntry, 100), + deleteChan: make(chan int, 1), + }, + eventCount: 50, + } + // Initialize the queue entries to a test byte size + for i := range rl.broker.buf { + rl.broker.buf[i].eventSize = 123 + } + const deleteCount = 25 + rl.broker.deleteChan <- deleteCount + // Run one iteration of the run loop, so it can handle the delete request + rl.runIteration() + // It should have deleted 25 events, so we expect the size to be 25 * 123. + assertRegistryUint(t, reg, "queue.removed.events", deleteCount, "Deleting from the queue should report the removed events") + assertRegistryUint(t, reg, "queue.removed.bytes", deleteCount*123, "Deleting from the queue should report the removed bytes") +} + +func assertRegistryUint(t *testing.T, reg *monitoring.Registry, key string, expected uint64, message string) { + t.Helper() + + entry := reg.Get(key) + if entry == nil { + assert.Failf(t, message, "registry key '%v' doesn't exist", key) + return + } + value, ok := reg.Get(key).(*monitoring.Uint) + if !ok { + assert.Failf(t, message, "registry key '%v' doesn't refer to a uint64", key) + return + } + assert.Equal(t, expected, value.Get(), message) +} diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go new file mode 100644 index 00000000000..5061d3f5600 --- /dev/null +++ b/libbeat/publisher/queue/monitoring.go @@ -0,0 +1,153 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package queue + +import ( + "github.com/elastic/elastic-agent-libs/monitoring" +) + +// Observer is an interface for queues to send state updates to a metrics +// or test listener. +type Observer interface { + MaxEvents(int) + MaxBytes(int) + + // Restore queue state on startup. Used by the disk queue to report events + // that are already in the queue from a previous run. + Restore(eventCount int, byteCount int) + + // All reported byte counts are zero if the output doesn't support + // early encoding. + AddEvent(byteCount int) + ConsumeEvents(eventCount int, byteCount int) + RemoveEvents(eventCount int, byteCount int) +} + +type queueObserver struct { + maxEvents *monitoring.Uint // gauge + maxBytes *monitoring.Uint // gauge + + addedEvents *monitoring.Uint + addedBytes *monitoring.Uint + consumedEvents *monitoring.Uint + consumedBytes *monitoring.Uint + removedEvents *monitoring.Uint + removedBytes *monitoring.Uint + + filledEvents *monitoring.Uint // gauge + filledBytes *monitoring.Uint // gauge + filledPct *monitoring.Float // gauge + + // backwards compatibility: the metric "acked" is the old name for + // "removed.events". 
Ideally we would like to define an alias in the + // monitoring API, but until that's possible we shadow it with this + // extra variable and make sure to always change removedEvents and + // acked at the same time. + acked *monitoring.Uint +} + +type nilObserver struct{} + +// Creates queue metrics in the given registry under the path "pipeline.queue". +func NewQueueObserver(metrics *monitoring.Registry) Observer { + if metrics == nil { + return nilObserver{} + } + queueMetrics := metrics.GetRegistry("queue") + if queueMetrics != nil { + err := queueMetrics.Clear() + if err != nil { + return nilObserver{} + } + } else { + queueMetrics = metrics.NewRegistry("queue") + } + + ob := &queueObserver{ + maxEvents: monitoring.NewUint(queueMetrics, "max_events"), // gauge + maxBytes: monitoring.NewUint(queueMetrics, "max_bytes"), // gauge + + addedEvents: monitoring.NewUint(queueMetrics, "added.events"), + addedBytes: monitoring.NewUint(queueMetrics, "added.bytes"), + consumedEvents: monitoring.NewUint(queueMetrics, "consumed.events"), + consumedBytes: monitoring.NewUint(queueMetrics, "consumed.bytes"), + removedEvents: monitoring.NewUint(queueMetrics, "removed.events"), + removedBytes: monitoring.NewUint(queueMetrics, "removed.bytes"), + + filledEvents: monitoring.NewUint(queueMetrics, "filled.events"), // gauge + filledBytes: monitoring.NewUint(queueMetrics, "filled.bytes"), // gauge + filledPct: monitoring.NewFloat(queueMetrics, "filled.pct"), // gauge + + // backwards compatibility: "acked" is an alias for "removed.events". 
+ acked: monitoring.NewUint(queueMetrics, "acked"), + } + return ob +} + +func (ob *queueObserver) MaxEvents(value int) { + ob.maxEvents.Set(uint64(value)) +} + +func (ob *queueObserver) MaxBytes(value int) { + ob.maxBytes.Set(uint64(value)) +} + +func (ob *queueObserver) Restore(eventCount int, byteCount int) { + ob.filledEvents.Set(uint64(eventCount)) + ob.filledBytes.Set(uint64(byteCount)) + ob.updateFilledPct() +} + +func (ob *queueObserver) AddEvent(byteCount int) { + ob.addedEvents.Inc() + ob.addedBytes.Add(uint64(byteCount)) + + ob.filledEvents.Inc() + ob.filledBytes.Add(uint64(byteCount)) + ob.updateFilledPct() +} + +func (ob *queueObserver) ConsumeEvents(eventCount int, byteCount int) { + ob.consumedEvents.Add(uint64(eventCount)) + ob.consumedBytes.Add(uint64(byteCount)) +} + +func (ob *queueObserver) RemoveEvents(eventCount int, byteCount int) { + ob.removedEvents.Add(uint64(eventCount)) + ob.acked.Add(uint64(eventCount)) + ob.removedBytes.Add(uint64(byteCount)) + + ob.filledEvents.Sub(uint64(eventCount)) + ob.filledBytes.Sub(uint64(byteCount)) + ob.updateFilledPct() +} + +func (ob *queueObserver) updateFilledPct() { + if maxBytes := ob.maxBytes.Get(); maxBytes > 0 { + ob.filledPct.Set(float64(ob.filledBytes.Get()) / float64(maxBytes)) + } else { + ob.filledPct.Set(float64(ob.filledEvents.Get()) / float64(ob.maxEvents.Get())) + } +} + +func (nilObserver) MaxEvents(_ int) {} +func (nilObserver) MaxBytes(_ int) {} +func (nilObserver) Restore(_ int, _ int) {} +func (nilObserver) AddEvent(_ int) {} +func (nilObserver) ConsumeEvents(_ int, _ int) {} +func (nilObserver) RemoveEvents(_ int, _ int) {} diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 9c186ad30d0..075d7ad66a4 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -18,11 +18,7 @@ package queue import ( - "errors" - - "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/elastic-agent-libs/logp" - 
"github.com/elastic/elastic-agent-libs/opt" ) // Entry is a placeholder type for the objects contained by the queue, which @@ -31,31 +27,6 @@ import ( // and reduces accidental type mismatches. type Entry interface{} -// Metrics is a set of basic-user friendly metrics that report the current state of the queue. These metrics are meant to be relatively generic and high-level, and when reported directly, can be comprehensible to a user. -type Metrics struct { - //EventCount is the total events currently in the queue - EventCount opt.Uint - //ByteCount is the total byte size of the queue - ByteCount opt.Uint - //ByteLimit is the user-configured byte limit of the queue - ByteLimit opt.Uint - //EventLimit is the user-configured event limit of the queue - EventLimit opt.Uint - - //UnackedConsumedEvents is the count of events that an output consumer has read, but not yet ack'ed - UnackedConsumedEvents opt.Uint - - //OldestActiveTimestamp is the timestamp of the oldest item in the queue. - OldestActiveTimestamp common.Time - - // OldestActiveID is ID of the oldest unacknowledged event in the queue, or - // the next ID that will be assigned if the queue is empty. - OldestEntryID EntryID -} - -// ErrMetricsNotImplemented is a hopefully temporary type to mark queue metrics as not yet implemented -var ErrMetricsNotImplemented = errors.New("Queue metrics not implemented") - // Queue is responsible for accepting, forwarding and ACKing events. // A queue will receive and buffer single events from its producers. // Consumers will receive events in batches from the queues buffers. @@ -66,8 +37,14 @@ var ErrMetricsNotImplemented = errors.New("Queue metrics not implemented") // consumer or flush to some other intermediate storage), it will send an ACK signal // with the number of ACKed events to the Producer (ACK happens in batches). 
type Queue interface { + // Close signals the queue to shut down, but it may keep handling requests + // and acknowledgments for events that are already in progress. Close() error + // Done returns a channel that unblocks when the queue is closed and all + // its events are persisted or acknowledged. + Done() <-chan struct{} + QueueType() string BufferConfig() BufferConfig @@ -76,15 +53,13 @@ type Queue interface { // Get retrieves a batch of up to eventCount events. If eventCount <= 0, // there is no bound on the number of returned events. Get(eventCount int) (Batch, error) - - Metrics() (Metrics, error) } // If encoderFactory is provided, then the resulting queue must use it to // encode queued events before returning them. type QueueFactory func( logger *logp.Logger, - ack func(eventCount int), + observer Observer, inputQueueSize int, encoderFactory EncoderFactory, ) (Queue, error) From 328670be543c281a3d3bc6906f9a11b9ad446c22 Mon Sep 17 00:00:00 2001 From: VihasMakwana <121151420+VihasMakwana@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:59:28 +0530 Subject: [PATCH 21/21] feature: tag events that come from a filestream with `take_over: true` (#39828) * filestream: tag events with `take_over: true` * filestream: modify test cases * add comments * update documentation * Update filebeat/input/filestream/input_test.go Co-authored-by: Tiago Queiroz * add changelog --------- Co-authored-by: Tiago Queiroz --- CHANGELOG.next.asciidoc | 1 + .../docs/howto/migrate-to-filestream.asciidoc | 5 +++ filebeat/input/filestream/input.go | 8 ++++ filebeat/input/filestream/input_test.go | 43 +++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index e1214aa0e27..8e5cd3497e1 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -42,6 +42,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Removed deprecated Sophos UTM from Beats. 
Use the https://docs.elastic.co/integrations/sophos[Sophos] Elastic integration instead. {pull}38037[38037] - Introduce input/netmetrics and refactor netflow input metrics {pull}38055[38055] - Update Salesforce module to use new Salesforce input. {pull}37509[37509] +- Tag events that come from a filestream in "take over" mode. {pull}39828[39828] - Fix high IO and handling of a corrupted registry log file. {pull}35893[35893] *Heartbeat* diff --git a/filebeat/docs/howto/migrate-to-filestream.asciidoc b/filebeat/docs/howto/migrate-to-filestream.asciidoc index a57105adb3e..30057fab725 100644 --- a/filebeat/docs/howto/migrate-to-filestream.asciidoc +++ b/filebeat/docs/howto/migrate-to-filestream.asciidoc @@ -247,3 +247,8 @@ and return to old `log` inputs the files that were taken by `filestream` inputs, 6. Run Filebeat with the old configuration (no `filestream` inputs with `take_over: true`). NOTE: Reverting to backups might cause some events to repeat, depends on the amount of time the new configuration was running. + +=== Debugging on Kibana + +Events produced by `filestream` with `take_over: true` contains `take_over` tag. +You can filter on this tag in Kibana and see the events which came from a filestream in the "take over" mode. \ No newline at end of file diff --git a/filebeat/input/filestream/input.go b/filebeat/input/filestream/input.go index 0136b062b48..7da25654a25 100644 --- a/filebeat/input/filestream/input.go +++ b/filebeat/input/filestream/input.go @@ -41,6 +41,7 @@ import ( "github.com/elastic/beats/v7/libbeat/reader/readfile/encoding" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent-libs/mapstr" ) const pluginName = "filestream" @@ -61,6 +62,7 @@ type filestream struct { encodingFactory encoding.EncodingFactory closerConfig closerConfig parsers parser.Config + takeOver bool } // Plugin creates a new filestream input plugin for creating a stateful input. 
@@ -101,6 +103,7 @@ func configure(cfg *conf.C) (loginp.Prospector, loginp.Harvester, error) { encodingFactory: encodingFactory, closerConfig: config.Close, parsers: config.Reader.Parsers, + takeOver: config.TakeOver, } return prospector, filestream, nil @@ -378,6 +381,11 @@ func (inp *filestream) readFromSource( metrics.BytesProcessed.Add(uint64(message.Bytes)) + // add "take_over" tag if `take_over` is set to true + if inp.takeOver { + _ = mapstr.AddTags(message.Fields, []string{"take_over"}) + } + if err := p.Publish(message.ToEvent(), s); err != nil { metrics.ProcessingErrors.Inc() return err diff --git a/filebeat/input/filestream/input_test.go b/filebeat/input/filestream/input_test.go index a1d9729c5aa..3dfe176ac01 100644 --- a/filebeat/input/filestream/input_test.go +++ b/filebeat/input/filestream/input_test.go @@ -35,6 +35,7 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore/storetest" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent-libs/mapstr" ) func BenchmarkFilestream(b *testing.B) { @@ -115,6 +116,48 @@ paths: }) } +func TestTakeOverTags(t *testing.T) { + testCases := []struct { + name string + takeOver bool + testFunc func(t *testing.T, event beat.Event) + }{ + { + name: "test-take_over-true", + takeOver: true, + testFunc: func(t *testing.T, event beat.Event) { + tags, err := event.GetValue("tags") + require.NoError(t, err) + require.Contains(t, tags, "take_over") + }, + }, + { + name: "test-take_over-false", + takeOver: false, + testFunc: func(t *testing.T, event beat.Event) { + _, err := event.GetValue("tags") + require.ErrorIs(t, err, mapstr.ErrKeyNotFound) + }, + }, + } + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + filename := generateFile(t, t.TempDir(), 5) + cfg := fmt.Sprintf(` +type: filestream +prospector.scanner.check_interval: 1s +take_over: %t +paths: + - %s`, testCase.takeOver, filename) + runner := 
createFilestreamTestRunner(context.Background(), t, testCase.name, cfg, 5, true) + events := runner(t) + for _, event := range events { + testCase.testFunc(t, event) + } + }) + } +} + // runFilestreamBenchmark runs the entire filestream input with the in-memory registry and the test pipeline. // `testID` must be unique for each test run // `cfg` must be a valid YAML string containing valid filestream configuration