From 374933f3d031729d4f9dac4d83a50d86bfae7c6b Mon Sep 17 00:00:00 2001 From: Fanit Kolchina Date: Thu, 13 Jun 2024 12:42:38 -0400 Subject: [PATCH] Doc review Signed-off-by: Fanit Kolchina --- .../supported-field-types/derived-field.md | 465 --------- _field-types/supported-field-types/derived.md | 903 ++++++++++++++++++ _field-types/supported-field-types/index.md | 1 + 3 files changed, 904 insertions(+), 465 deletions(-) delete mode 100644 _field-types/supported-field-types/derived-field.md create mode 100644 _field-types/supported-field-types/derived.md diff --git a/_field-types/supported-field-types/derived-field.md b/_field-types/supported-field-types/derived-field.md deleted file mode 100644 index 97f1544227..0000000000 --- a/_field-types/supported-field-types/derived-field.md +++ /dev/null @@ -1,465 +0,0 @@ -# Full Markdown content for the Derived field type documentation without the migration section and migration reference - -markdown_content = """ ---- -layout: default -title: Derived -nav_order: 62 -has_children: false -parent: Advanced field types -grand_parent: Supported field types -redirect_from: -- /opensearch/supported-field-types/derived/ -- /field-types/derived/ ---- - -## Derived field type - -Derived fields enable users to create new fields dynamically by executing scripts on existing fields retrieved from the `_source` field, which contains the original document, or from a field's doc values for faster retrieval. Once defined, either in the index mapping or within a search request, these fields can be utilized like regular fields in query definitions. - -## When to use derived fields -Derived fields prioritize storage efficiency and offer flexibility in field manipulations, albeit at the cost of query performance since they are computed at query time. They are particularly useful in scenarios requiring real-time data transformation, such as: - -- **Log Analysis**: Extracting timestamps and log levels from log messages. 
-- **Performance Metrics**: Calculating response times from start and end timestamps. -- **Security Analytics**: Real-time IP geolocation and user-agent parsing for threat detection. -- **Experimental Use Cases**: Testing new data transformations, creating temporary fields for A/B testing, or generating ad-hoc reports without altering mappings or reindexing. - -Despite the potential performance impact of query-time computations, the flexibility and storage efficiency of derived fields make them a valuable tool for these applications. - -## Current limitations -Currently, derived fields have the following limitations: - -- **Aggregation, Scoring, and Sorting**: Not supported yet. -- **Dashboard Support**: These fields are not displayed in the list of available fields on dashboards. However, they can still be used in for filtering if derived field name is known to the user. -- **Chained Derived Fields**: One derived field cannot be used to define another derived field. -- **Join Field Type**: Derived fields are not supported with [join field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/join/) -- **Concurrent Segment Search**: Derived fields are not supported with [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/) - -These limitations are recognized, and there are plans to address them in future releases. - -## Prerequisites -- **Enable `_source` or Doc Values**: Ensure that either the `_source` field or doc values are enabled for the fields used in your script. -- **Enable expensive queries**: Ensure [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `true`. -- **Feature Control**: The derived fields feature is enabled by default. You can control it using the following settings: - - **Index Level**: Use the `index.query.derived_field.enabled` setting. - - **Cluster Level**: Use the `search.derived_field.enabled` setting. 
- Both settings are dynamic, meaning they can be changed without requiring reindexing or node restarts. -- **Performance Considerations**: Evaluate the [performance implications](#performance) to ensure this feature meets your scale requirements before using it. - -## Definition -Derived fields can be defined in index mappings or directly within the search request. Following index and documents will be used for all examples: - -```json -PUT logs -{ - "mappings": { - "properties": { - "request": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } - }, - "client_ip": { - "type": "keyword" - } - } - } -} -``` -{% include copy-curl.html %} - -```json -POST _bulk -{ "index" : { "_index" : "logs", "_id" : "1" } } -{ "request": "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778", "clientip": "61.177.2.0" } -{ "index" : { "_index" : "logs", "_id" : "2" } } -{ "request": "894140400 GET /french/playing/mascot/mascot.html HTTP/1.1 200 5474", "clientip": "185.92.2.0" } -{ "index" : { "_index" : "logs", "_id" : "3" } } -{ "request": "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711", "clientip": "61.177.2.0" } -{ "index" : { "_index" : "logs", "_id" : "4" } } -{ "request": "894360400 POST /images/home_fr_button.gif HTTP/1.1 200 2140", "clientip": "129.178.2.0" } -{ "index" : { "_index" : "logs", "_id" : "5" } } -{ "request": "894470400 DELETE /images/102384s.gif HTTP/1.0 200 785", "clientip": "227.177.2.0" } -``` -{% include copy-curl.html %} - -## Deriving fields with index mapping approach - -To derive fields `timestamp`, `method`, and `size` from the indexed field `request` in the `logs` index, update the mappings as follows: - -```json -PUT /logs/_mapping -{ - "derived": { - "timestamp": { - "type": "date", - "script": { - "source": """ - emit(Long.parseLong(doc["request.keyword"].value.splitOnToken(" ")[0])) - """ - } - }, - "method": { - "type": "keyword", - "script": { - "source": """ - 
emit(doc["request.keyword"].value.splitOnToken(" ")[1]) - """ - } - }, - "size": { - "type": "long", - "script": { - "source": """ - emit(Long.parseLong(doc["request.keyword"].value.splitOnToken(" ")[5])) - """ - } - } - } -} -``` -{% include copy-curl.html %} - -### Supported parameters - -| Parameter | Description | -|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `type` | Type of the derived field. Supported types include `boolean`, `date`, `geo_point`, `ip`, `keyword`, `text`, `long`, `double`, `float`, and `object`. | -| `script` | The script associated with derived fields. Any value emitted from the script needs to be emitted using `emit()`. The type of the emitted value must match the `type` of the derived field. Scripts have access to both `doc_values` and `_source` document if enabled. The doc value of a field can be accessed using `doc['field_name'].value`, and the source can be accessed using `params._source["field_name"]`. | -| `format` | The format for parsing dates. Only applicable when the type is `date`. Format can be `strict_date_time_no_millis`, `strict_date_optional_time`, or `epoch_millis`. | -| `ignore_malformed`| A Boolean value that specifies whether to ignore malformed values and not throw an exception during query execution on derived fields. Default value is `false`. | -| `prefilter_field` | An indexed text field provided to boost the performance of derived fields. It adds the same query as a filter on this indexed field first and uses only matching documents on derived fields. 
Check [Prefilter field](#prefilter-field) | - -All parameters are dynamic and can be modified without the need to reindex. - -### Emitting values in script - -The `emit()` function is available only within the derived field script context. It is used to emit one or more values (for a multi-valued field) from the script for a given document on which the script runs. - -Here is the emit format and support for multi-valued fields for different types: - -| Type | Emit format | Multi-valued | -|-----------|----------------------------------|--------------| -| `date` | `emit(long timeInMilis)` | Yes | -| `geo_point`| `emit(double lat, double lon)` | Yes | -| `ip` | `emit(String ip)` | Yes | -| `keyword` | `emit(String)` | Yes | -| `text` | `emit(String)` | Yes | -| `long` | `emit(long)` | Yes | -| `double` | `emit(double)` | Yes | -| `float` | `emit(float)` | Yes | -| `boolean` | `emit(boolean)` | No | -| `object` | `emit(String json)` (valid JSON) | Yes | - -In case of a mismatch between the `type` of the derived field and its emitted value, it will result in an `IllegalArgumentException`, and the search request will fail. However, if `ignore_malformed` is set, the document for which the failure occurred will be skipped, and the search request will not fail. - -**Note:** There is a hard limit of `1 MB` on the size of emitted values per document. - -## Search on derived fields - -Searching on derived fields follows the same syntax as for regular fields. The following examples illustrate the use of derived fields defined in search requests: - -```json -POST /logs/_search -{ - "query": { - "range": { - "timestamp": { - "gte": "1970-01-11T08:20:30.400Z", - "lte": "1970-01-11T08:26:00.400Z" - } - } - }, - "fields": ["timestamp"] -} -``` -{% include copy-curl.html %} - -Since the `timestamp` field is defined as a `date` type in the derived field definition, you can also specify the desired date format in the search request using `format` parameter. 
- -```json -POST /logs/_search -{ - "query": { - "range": { - "timestamp": { - "gte": "1970-01-11T08:20:30.400Z", - "lte": "1970-01-11T08:26:00.400Z" - } - } - }, - "fields": ["timestamp"] -} -``` -{% include copy-curl.html %} - -## Defining mappings in search request -You can also define derived fields directly in a search request and query on them in conjunction with regular indexed fields. Here's an example: - -```json -POST /logs/_search -{ - "derived": { - "url": { - "type": "text", - "script": { - "source": """ - emit(doc["request"].value.splitOnToken(" ")[2]) - """ - } - }, - "status": { - "type": "keyword", - "script": { - "source": """ - emit(doc["request"].value.splitOnToken(" ")[4]) - """ - } - } - }, - "query": { - "bool": { - "must": [ - { - "term": { - "clientip": "61.177.2.0" - } - }, - { - "match": { - "url": "images" - } - }, - { - "term": { - "status": "200" - } - } - ] - } - }, - "fields": ["request", "clientip", "url", "status"] -} -``` -{% include copy-curl.html %} - -**Note:** Derived fields use the default analyzer set in the index analysis settings during searches. You can override the default analyzer or specify a search analyzer in the search request, similar to how it's done for regular fields. -**Note:** When both index mapping and search definition are present for a field, the search definition takes precedence. - -**Retrieving Fields** -Fields can be retrieved using the `fields` parameter in the search request, similar to regular fields as shown in the preceding examples. Wildcards can also be used to retrieve all derived fields that match a given pattern. 
- - -**Highlight** -For derived fields of type `text` where highlighting makes sense, the currently supported highlighter is the [Unified Highlighter]({{site.url}}{{site.baseurl}}/opensearch/search/highlight#the-unified-highlighter) - -```json -POST /logs/_search -{ - "derived": { - "url": { - "type": "text", - "script": { - "source": """ - emit(doc["request"].value.splitOnToken(" " )[2]) - """ - } - } - }, - "query": { - "bool": { - "must": [ - { - "term": { - "clientip": "61.177.2.0" - } - }, - { - "match": { - "url": "images" - } - } - ] - } - }, - "fields": ["request", "clientip", "url"], - "highlight": { - "fields": { - "url": {} - } - } -} -``` -{% include copy-curl.html %} - -## Performance -Derived fields are not indexed and are computed on-the-fly by retrieving values from `_source` field or doc values. Consequently, they can be slow to execute. To improve performance: - -- Prune the search space by adding query filters on indexed fields in conjunction with derived fields. -- Use doc values in the script wherever available for faster access compared to `_source`. -- Consider using `prefilter_field` to automatically prune the search space without explicit filters in the search request. - -### Prefilter field -This technique helps prune the search space automatically without adding explicit filters in the search request. It implicitly adds a filter on the specified indexed field (`prefilter_field`) when constructing the query. `prefilter_field ` must be of text family types (`text`, `match_only_text`). 
- -For example, lets update the mapping for derived field `method` with `"prefilter_field": "request"`: - -```json -PUT /logs/_mapping -{ - "derived": { - "method": { - "type": "keyword", - "script": { - "source": """ - emit(doc["request.keyword"].value.splitOnToken(" ")[1]) - """ - }, - "prefilter_field": "request" - } - } -} -``` -{% include copy-curl.html %} - -Now, a search with a query on the `method` derived field will implicitly add a filter on the `request` field: - -```json -POST /logs/_search -{ - "profile": true, - "query": { - "term": { - "method": { - "value": "GET" - } - } - }, - "fields": ["method"] -} -``` -{% include copy-curl.html %} - -The resulting query includes the filter on prefiltered field: -```json -"#request:GET #DerivedFieldQuery (Query: [ method:GET])" -``` - -**Note:** `profile` option can be used to analyze the performance of derived fields as well. - -## Object type -A valid JSON object can be emitted from the script, allowing queries on subfields just like regular fields without the need to index them. The subfield type will be inferred if not explicitly provided. This is useful for large JSON objects where occasional searches on some subfields are required, but indexing them is costly, and defining derived fields for each subfield is a lot of overhead. 
- -Here is an example of its usage: - -```json -PUT logs_object -{ - "mappings": { - "properties": { - "request_object": { "type": "text" } - }, - "derived": { - "derived_request_object": { - "type": "object", - "script": { - "source": "emit(params._source[\"request_object\"])" - } - } - } - } -} -``` - -Consider the following documents: - -```json -POST _bulk -{ "index" : { "_index" : "logs_object", "_id" : "1" } } -{ "request_object": "{\"@timestamp\": 894030400, \"clientip\":\"61.177.2.0\", \"request\": \"GET /english/venues/images/venue_header.gif HTTP/1.0\", \"status\": 200, \"size\": 711}" } -{ "index" : { "_index" : "logs_object", "_id" : "2" } } -{ "request_object": "{\"@timestamp\": 894140400, \"clientip\":\"129.178.2.0\", \"request\": \"GET /images/home_fr_button.gif HTTP/1.1\", \"status\": 200, \"size\": 2140}" } -{ "index" : { "_index" : "logs_object", "_id" : "3" } } -{ "request_object": "{\"@timestamp\": 894240400, \"clientip\":\"227.177.2.0\", \"request\": \"GET /images/102384s.gif HTTP/1.0\", \"status\": 400, \"size\": 785}" } -{ "index" : { "_index" : "logs_object", "_id" : "4" } } -{ "request_object": "{\"@timestamp\": 894340400, \"clientip\":\"61.177.2.0\", \"request\": \"GET /english/images/venue_bu_city_on.gif HTTP/1.0\", \"status\": 400, \"size\": 1397}\n" } -{ "index" : { "_index" : "logs_object", "_id" : "5" } } -{ "request_object": "{\"@timestamp\": 894440400, \"clientip\":\"132.176.2.0\", \"request\": \"GET /french/news/11354.htm HTTP/1.0\", \"status\": 200, \"size\": 3460, \"is_active\": true}" } -``` - -**Search** -```json -POST /logs_object/_search -{ - "query": { - "range": { - "derived_request_object.@timestamp": { - "gte": "894030400", - "lte": "894140400" - } - } - }, - "fields": ["derived_request_object.@timestamp"] -} -``` - -```json -POST /logs_object/_search -{ - "query": { - "bool": { - "must": [ - { - "term": { - "derived_request_object.clientip": "61.177.2.0" - } - }, - { - "match": { - "derived_request_object.request": 
"images" - } - } - ] - } - }, - "fields": ["derived_request_object.*"], - "highlight": { - "fields": { - "derived_request_object.request": {} - } - } -} -``` - -### Subfields type inference -Type inference is based on the same logic as [Dynamic mapping]({{site.url}}{{site.baseurl}}/opensearch/mappings#dynamic-mapping). Instead of inferring the type from the first document, it generates a random sample of documents. If the type isn't found in the first document, it iterates over the random sample until the subfield is found or the list is exhausted. If it's a rare field, consider defining the explicit type for the subfield, as it may result in 0 results, similar to the behavior of a missing field. A warning will be logged related to inference failure. - -### Subfield explicit type -Let's define an explicit type for `derived_logs_object.is_active` as `boolean`. Since this field is only present in one of the documents, its type inference might fail, so it's important to define the explicit type using `properties` section. - -```json -POST /logs_object/_search -{ - "derived": { - "derived_request_object": { - "type": "object", - "script": { - "source": "emit(params._source[\"request_object\"])" - }, - "properties": { - "is_active": "boolean" - } - } - }, - "query": { - "term": { - "derived_request_object.is_active": true - } - }, - "fields": ["derived_request_object.is_active"] -} -``` \ No newline at end of file diff --git a/_field-types/supported-field-types/derived.md b/_field-types/supported-field-types/derived.md new file mode 100644 index 0000000000..4609499a76 --- /dev/null +++ b/_field-types/supported-field-types/derived.md @@ -0,0 +1,903 @@ +--- +layout: default +title: Derived +nav_order: 62 +has_children: false +parent: Supported field types +--- + +## Derived field type +**Introduced 2.14** +{: .label .label-purple } + +Derived fields allow you to create new fields dynamically by executing scripts on existing fields. 
The existing fields can be either retrieved from the `_source` field, which contains the original document, or from a field's doc values for faster retrieval. Once you define a derived field either in the index mapping or within a search request, you can use this field in a query in the same way you use regular fields. + +## When to use derived fields + +Derived fields offer flexibility in field manipulation and prioritize storage efficiency. However, +because they are computed at query time, they can reduce query performance. Derived fields are particularly useful in scenarios requiring real-time data transformation, such as: + +- **Log analysis**: Extracting timestamps and log levels from log messages. +- **Performance metrics**: Calculating response times from start and end timestamps. +- **Security analytics**: Real-time IP geolocation and user-agent parsing for threat detection. +- **Experimental use cases**: Testing new data transformations, creating temporary fields for A/B testing, or generating one-time reports without altering mappings or reindexing data. + +Despite the potential performance impact of query-time computations, the flexibility and storage efficiency of derived fields make them a valuable tool for these applications. + +## Current limitations + +Currently, derived fields have the following limitations: + +- **Aggregation, scoring, and sorting**: Not supported yet. +- **Dashboard support**: These fields are not displayed in the list of available fields in OpenSearch Dashboards. However, you can still use them for filtering if you know the derived field name. +- **Chained derived fields**: One derived field cannot be used to define another derived field. +- **Join field type**: Derived fields are not supported with [join field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/join/). 
+- **Concurrent segment search**: Derived fields are not supported with [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). + +We are planning to address these limitations in future versions. + +## Prerequisites + +Before using a derived field, be sure to satisfy the following prerequisites: + +- **Enable `_source` or `doc_values`**: Ensure that either the `_source` field or doc values are enabled for the fields used in your script. +- **Enable expensive queries**: Ensure [`search.allow_expensive_queries`]({{site.url}}{{site.baseurl}}/query-dsl/index/#expensive-queries) is set to `true`. +- **Feature control**: Derived fields are enabled by default. You can enable or disable derived fields using the following settings: + - **Index level**: Update the `index.query.derived_field.enabled` setting. + - **Cluster level**: Update the `search.derived_field.enabled` setting. + Both settings are dynamic, so they can be changed without requiring reindexing or node restarts. +- **Performance considerations**: Before using derived fields, evaluate the [performance implications](#performance) to ensure derived fields meet your scale requirements. + +## Defining derived fields + +Derived fields can be defined [in index mappings](#defining-derived-fields-in-index-mappings) or [directly within the search request](#defining-and-searching-derived-fields-in-a-search-request). 
+ +## Example setup + +To try out the examples on this page, first create the following `logs` index: + +```json +PUT logs +{ + "mappings": { + "properties": { + "request": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "clientip": { + "type": "keyword" + } + } + } +} +``` +{% include copy-curl.html %} + +Add sample documents to the index: + +```json +POST _bulk +{ "index" : { "_index" : "logs", "_id" : "1" } } +{ "request": "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778", "clientip": "61.177.2.0" } +{ "index" : { "_index" : "logs", "_id" : "2" } } +{ "request": "894140400 GET /french/playing/mascot/mascot.html HTTP/1.1 200 5474", "clientip": "185.92.2.0" } +{ "index" : { "_index" : "logs", "_id" : "3" } } +{ "request": "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711", "clientip": "61.177.2.0" } +{ "index" : { "_index" : "logs", "_id" : "4" } } +{ "request": "894360400 POST /images/home_fr_button.gif HTTP/1.1 200 2140", "clientip": "129.178.2.0" } +{ "index" : { "_index" : "logs", "_id" : "5" } } +{ "request": "894470400 DELETE /images/102384s.gif HTTP/1.0 200 785", "clientip": "227.177.2.0" } +``` +{% include copy-curl.html %} + +## Defining derived fields in index mappings + +To derive the `timestamp`, `method`, and `size` fields from the `request` indexed field in the `logs` index, configure the following mappings: + +```json +PUT /logs/_mapping +{ + "derived": { + "timestamp": { + "type": "date", + "format": "MM/dd/yyyy", + "script": { + "source": """ + emit(Long.parseLong(doc["request.keyword"].value.splitOnToken(" ")[0])) + """ + } + }, + "method": { + "type": "keyword", + "script": { + "source": """ + emit(doc["request.keyword"].value.splitOnToken(" ")[1]) + """ + } + }, + "size": { + "type": "long", + "script": { + "source": """ + emit(Long.parseLong(doc["request.keyword"].value.splitOnToken(" ")[5])) + """ + } + } + } +} +``` +{% include copy-curl.html %} + +Note that the 
`timestamp` field has an additional `format` parameter that specifies the format to display `date` fields. If you don't include a `format` parameter, the format defaults to `strict_date_time_no_millis`. For more information about supported date formats, see [Parameters](#parameters). + +## Parameters + +The following table lists the parameters accepted by `derived` field types. All parameters are dynamic and can be modified without reindexing documents. + +| Parameter | Required/Optional | Description | +| :--- | :--- | :--- | +| `type` | Required | The type of the derived field. Supported types are `boolean`, `date`, `geo_point`, `ip`, `keyword`, `text`, `long`, `double`, `float`, and `object`. | +| `script` | Required | The script associated with derived fields. Any value emitted from the script needs to be emitted using `emit()`. The type of the emitted value must match the `type` of the derived field. Scripts have access to both `doc_values` and `_source` fields if those are enabled. The doc value of a field can be accessed using `doc['field_name'].value`, and the source can be accessed using `params._source["field_name"]`. | +| `format` | Optional | The format for parsing dates. Only applicable for the fields whose type is `date`. Valid values are `strict_date_time_no_millis`, `strict_date_optional_time`, or `epoch_millis`. For more information, see [Formats]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/date/#formats).| +| `ignore_malformed`| Optional | A Boolean value that specifies whether to ignore malformed values when running a query on derived fields. Default value is `false` (throw an exception when encountering malformed values). | +| `prefilter_field` | Optional | An indexed text field provided to boost the performance of derived fields. It adds the same query as a filter on this indexed field first and uses only matching documents on derived fields. For more information, see [Prefilter field](#prefilter-field). 
| + +## Emitting values in scripts + +The `emit()` function is available only within the derived field script context. It is used to emit one value or multiple values (for a multi-valued field) from the script for a document on which the script runs. + +The following table lists the emit formats. + +| Type | Emit format | Multi-valued fields supported| +|-----------|----------------------------------|--------------| +| `date` | `emit(long timeInMillis)` | Yes | +| `geo_point`| `emit(double lat, double lon)` | Yes | +| `ip` | `emit(String ip)` | Yes | +| `keyword` | `emit(String)` | Yes | +| `text` | `emit(String)` | Yes | +| `long` | `emit(long)` | Yes | +| `double` | `emit(double)` | Yes | +| `float` | `emit(float)` | Yes | +| `boolean` | `emit(boolean)` | No | +| `object` | `emit(String json)` (valid JSON) | Yes | + +By default, a type mismatch between the derived field and its emitted value will result in the search request failing with an error. If `ignore_malformed` is set to `true`, the failing document is skipped and the search request succeeds. +{: .note} + +The size limit of the emitted values is 1 MB per document. +{: .important} + +## Searching derived fields defined in index mappings + +To search derived fields, use the same syntax as regular fields. For example, the following request searches for documents whose derived `timestamp` field is in the specified range: + +```json +POST /logs/_search +{ + "query": { + "range": { + "timestamp": { + "gte": "1970-01-11T08:20:30.400Z", + "lte": "1970-01-11T08:26:00.400Z" + } + } + }, + "fields": ["timestamp"] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 315, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 4, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 1, + "_source": { + "request": "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778", + "clientip": "61.177.2.0" + }, + "fields": { + "timestamp": [ + "1970-01-11T08:20:30.400Z" + ] + } + }, + { + "_index": "logs", + "_id": "2", + "_score": 1, + "_source": { + "request": "894140400 GET /french/playing/mascot/mascot.html HTTP/1.1 200 5474", + "clientip": "185.92.2.0" + }, + "fields": { + "timestamp": [ + "1970-01-11T08:22:20.400Z" + ] + } + }, + { + "_index": "logs", + "_id": "3", + "_score": 1, + "_source": { + "request": "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711", + "clientip": "61.177.2.0" + }, + "fields": { + "timestamp": [ + "1970-01-11T08:24:10.400Z" + ] + } + }, + { + "_index": "logs", + "_id": "4", + "_score": 1, + "_source": { + "request": "894360400 POST /images/home_fr_button.gif HTTP/1.1 200 2140", + "clientip": "129.178.2.0" + }, + "fields": { + "timestamp": [ + "1970-01-11T08:26:00.400Z" + ] + } + } + ] + } +} +``` +
+ +## Defining and searching derived fields in a search request + +You can also define derived fields directly in a search request and query them along with regular indexed fields. For example, the following request creates the `url` and `status` derived fields and searches those fields along with the regular `request` and `clientip` fields: + +```json +POST /logs/_search +{ + "derived": { + "url": { + "type": "text", + "script": { + "source": """ + emit(doc["request"].value.splitOnToken(" ")[2]) + """ + } + }, + "status": { + "type": "keyword", + "script": { + "source": """ + emit(doc["request"].value.splitOnToken(" ")[4]) + """ + } + } + }, + "query": { + "bool": { + "must": [ + { + "term": { + "clientip": "61.177.2.0" + } + }, + { + "match": { + "url": "images" + } + }, + { + "term": { + "status": "200" + } + } + ] + } + }, + "fields": ["request", "clientip", "url", "status"] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 6, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 2.8754687, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 2.8754687, + "_source": { + "request": "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778", + "clientip": "61.177.2.0" + }, + "fields": { + "request": [ + "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778" + ], + "clientip": [ + "61.177.2.0" + ], + "url": [ + "/english/images/france98_venues.gif" + ], + "status": [ + "200" + ] + } + }, + { + "_index": "logs", + "_id": "3", + "_score": 2.8754687, + "_source": { + "request": "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711", + "clientip": "61.177.2.0" + }, + "fields": { + "request": [ + "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711" + ], + "clientip": [ + "61.177.2.0" + ], + "url": [ + "/english/venues/images/venue_header.gif" + ], + "status": [ + "200" + ] + } + } + ] + } +} +``` +
+ +Derived fields use the default analyzer set in the index analysis settings during search. You can override the default analyzer or specify a search analyzer within a search request in the same way as you do for regular fields. For more information, see [Analyzers]({{site.url}}{{site.baseurl}}/analyzers/). +{: .note} + +When both an index mapping and a search definition are present for a field, the search definition takes precedence. +{: .note} + +### Retrieving fields + +You can retrieve derived fields using the `fields` parameter in the search request in the same way as regular fields, as shown in the preceding examples. You can also use wildcards to retrieve all derived fields that match a given pattern. + +### Highlighting + +Derived fields of type `text` support highlighting using the [unified highlighter]({{site.url}}{{site.baseurl}}/opensearch/search/highlight#the-unified-highlighter). For example, the following request specifies to highlight the derived `url` field: + +```json +POST /logs/_search +{ + "derived": { + "url": { + "type": "text", + "script": { + "source": """ + emit(doc["request"].value.splitOnToken(" " )[2]) + """ + } + } + }, + "query": { + "bool": { + "must": [ + { + "term": { + "clientip": "61.177.2.0" + } + }, + { + "match": { + "url": "images" + } + } + ] + } + }, + "fields": ["request", "clientip", "url"], + "highlight": { + "fields": { + "url": {} + } + } +} +``` +{% include copy-curl.html %} + +The response contains highlighting in the `url` field: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 45, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1.8754687, + "hits": [ + { + "_index": "logs", + "_id": "1", + "_score": 1.8754687, + "_source": { + "request": "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778", + "clientip": "61.177.2.0" + }, + "fields": { + "request": [ + "894030400 GET /english/images/france98_venues.gif HTTP/1.0 200 778" + ], + "clientip": [ + "61.177.2.0" + ], + "url": [ + "/english/images/france98_venues.gif" + ] + }, + "highlight": { + "url": [ + "/english/<em>images</em>/france98_venues.gif" + ] + } + }, + { + "_index": "logs", + "_id": "3", + "_score": 1.8754687, + "_source": { + "request": "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711", + "clientip": "61.177.2.0" + }, + "fields": { + "request": [ + "894250400 POST /english/venues/images/venue_header.gif HTTP/1.0 200 711" + ], + "clientip": [ + "61.177.2.0" + ], + "url": [ + "/english/venues/images/venue_header.gif" + ] + }, + "highlight": { + "url": [ + "/english/venues/<em>images</em>/venue_header.gif" + ] + } + } + ] + } +} +``` +
+ +## Performance + +Derived fields are not indexed but are computed dynamically by retrieving values from the `_source` field or doc values. Thus, they can be slow to execute. To improve performance, try the following: + +- Prune the search space by adding query filters on indexed fields and derived fields. +- Prefer using doc values over `_source` in the script for faster access. +- Consider using a [`prefilter_field`](#prefilter-field) to automatically prune the search space without explicit filters in the search request. + +### Prefilter field + +Specifying a prefilter field helps prune the search space automatically without adding explicit filters in the search request. The prefilter field specifies an existing indexed field (`prefilter_field`) on which to filter implicitly when constructing the query. The `prefilter_field` must be a text field (either [`text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/text/) or [`match_only_text`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/match-only-text/)). + +For example, first update the mapping for the `method` derived field by specifying to prefilter on the `request` field: + +```json +PUT /logs/_mapping +{ + "derived": { + "method": { + "type": "keyword", + "script": { + "source": """ + emit(doc["request.keyword"].value.splitOnToken(" ")[1]) + """ + }, + "prefilter_field": "request" + } + } +} +``` +{% include copy-curl.html %} + +Now search using a query on the `method` derived field: + +```json +POST /logs/_search +{ + "profile": true, + "query": { + "term": { + "method": { + "value": "GET" + } + } + }, + "fields": ["method"] +} +``` +{% include copy-curl.html %} + +OpenSearch implicitly adds a filter on the `request` field to your query: + +```json +"#request:GET #DerivedFieldQuery (Query: [ method:GET])" +``` + +You can use the `profile` option to analyze derived field performance, as shown in the preceding example. 
+{: .tip} + +## Derived object fields + +A script can emit a valid JSON object so you can query subfields without indexing them, in the same way as regular fields. This is useful for large JSON objects where occasional searches on some subfields are required. In this case, indexing the subfields is expensive, while defining derived fields for each subfield also adds a lot of resource overhead. If you don't [explicitly provide the subfield type](#explicit-subfield-type), the subfield type is [inferred](#inferred-subfield-type). + +For example, the following request defines a `derived_request_object` derived field as an `object` type: + +```json +PUT logs_object +{ + "mappings": { + "properties": { + "request_object": { "type": "text" } + }, + "derived": { + "derived_request_object": { + "type": "object", + "script": { + "source": "emit(params._source[\"request_object\"])" + } + } + } + } +} +``` +{% include copy-curl.html %} + +Consider the following documents, in which the `request_object` is a string representation of a JSON object: + +```json +POST _bulk +{ "index" : { "_index" : "logs_object", "_id" : "1" } } +{ "request_object": "{\"@timestamp\": 894030400, \"clientip\":\"61.177.2.0\", \"request\": \"GET /english/venues/images/venue_header.gif HTTP/1.0\", \"status\": 200, \"size\": 711}" } +{ "index" : { "_index" : "logs_object", "_id" : "2" } } +{ "request_object": "{\"@timestamp\": 894140400, \"clientip\":\"129.178.2.0\", \"request\": \"GET /images/home_fr_button.gif HTTP/1.1\", \"status\": 200, \"size\": 2140}" } +{ "index" : { "_index" : "logs_object", "_id" : "3" } } +{ "request_object": "{\"@timestamp\": 894240400, \"clientip\":\"227.177.2.0\", \"request\": \"GET /images/102384s.gif HTTP/1.0\", \"status\": 400, \"size\": 785}" } +{ "index" : { "_index" : "logs_object", "_id" : "4" } } +{ "request_object": "{\"@timestamp\": 894340400, \"clientip\":\"61.177.2.0\", \"request\": \"GET /english/images/venue_bu_city_on.gif HTTP/1.0\", \"status\": 400, \"size\": 
1397}\n" } +{ "index" : { "_index" : "logs_object", "_id" : "5" } } +{ "request_object": "{\"@timestamp\": 894440400, \"clientip\":\"132.176.2.0\", \"request\": \"GET /french/news/11354.htm HTTP/1.0\", \"status\": 200, \"size\": 3460, \"is_active\": true}" } +``` +{% include copy-curl.html %} + +The following query searches the `@timestamp` subfield of the `derived_request_object`: + +```json +POST /logs_object/_search +{ + "query": { + "range": { + "derived_request_object.@timestamp": { + "gte": "894030400", + "lte": "894140400" + } + } + }, + "fields": ["derived_request_object.@timestamp"] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 26, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs_object", + "_id": "1", + "_score": 1, + "_source": { + "request_object": """{"@timestamp": 894030400, "clientip":"61.177.2.0", "request": "GET /english/venues/images/venue_header.gif HTTP/1.0", "status": 200, "size": 711}""" + }, + "fields": { + "derived_request_object.@timestamp": [ + 894030400 + ] + } + }, + { + "_index": "logs_object", + "_id": "2", + "_score": 1, + "_source": { + "request_object": """{"@timestamp": 894140400, "clientip":"129.178.2.0", "request": "GET /images/home_fr_button.gif HTTP/1.1", "status": 200, "size": 2140}""" + }, + "fields": { + "derived_request_object.@timestamp": [ + 894140400 + ] + } + } + ] + } +} +``` + +
+ +You can also specify to highlight a derived object field: + +```json +POST /logs_object/_search +{ + "query": { + "bool": { + "must": [ + { + "term": { + "derived_request_object.clientip": "61.177.2.0" + } + }, + { + "match": { + "derived_request_object.request": "images" + } + } + ] + } + }, + "fields": ["derived_request_object.*"], + "highlight": { + "fields": { + "derived_request_object.request": {} + } + } +} +``` +{% include copy-curl.html %} + +The response adds highlighting to the `derived_request_object.request` field: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 5, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 2, + "hits": [ + { + "_index": "logs_object", + "_id": "1", + "_score": 2, + "_source": { + "request_object": """{"@timestamp": 894030400, "clientip":"61.177.2.0", "request": "GET /english/venues/images/venue_header.gif HTTP/1.0", "status": 200, "size": 711}""" + }, + "fields": { + "derived_request_object.request": [ + "GET /english/venues/images/venue_header.gif HTTP/1.0" + ], + "derived_request_object.clientip": [ + "61.177.2.0" + ] + }, + "highlight": { + "derived_request_object.request": [ + "GET /english/venues/<em>images</em>/venue_header.gif HTTP/1.0" + ] + } + }, + { + "_index": "logs_object", + "_id": "4", + "_score": 2, + "_source": { + "request_object": """{"@timestamp": 894340400, "clientip":"61.177.2.0", "request": "GET /english/images/venue_bu_city_on.gif HTTP/1.0", "status": 400, "size": 1397} +""" + }, + "fields": { + "derived_request_object.request": [ + "GET /english/images/venue_bu_city_on.gif HTTP/1.0" + ], + "derived_request_object.clientip": [ + "61.177.2.0" + ] + }, + "highlight": { + "derived_request_object.request": [ + "GET /english/<em>images</em>/venue_bu_city_on.gif HTTP/1.0" + ] + } + } + ] + } +} +``` + +
+ +### Inferred subfield type + +Type inference is based on the same logic as [Dynamic mapping]({{site.url}}{{site.baseurl}}/opensearch/mappings#dynamic-mapping). Instead of inferring the subfield type from the first document, a random sample of documents is used to infer the type. If the subfield isn't found in any documents from the random sample, type inference fails and logs a warning. For subfields that seldom occur in documents, consider defining the explicit field type. Using dynamic type inference for such subfields may result in a query returning no results, a behavior similar to that of a missing field. + +### Explicit subfield type + +To define the explicit subfield type, provide the `type` parameter in the `properties` object. The following example defines an explicit type for the `derived_request_object.is_active` field as `boolean`. Because this field is only present in one of the documents, its type inference might fail, so it's important to define the explicit type: + +```json +POST /logs_object/_search +{ + "derived": { + "derived_request_object": { + "type": "object", + "script": { + "source": "emit(params._source[\"request_object\"])" + }, + "properties": { + "is_active": "boolean" + } + } + }, + "query": { + "term": { + "derived_request_object.is_active": true + } + }, + "fields": ["derived_request_object.is_active"] +} +``` +{% include copy-curl.html %} + +The response contains the matching documents: + +
+ + Response + + {: .text-delta} + +```json +{ + "took": 13, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1, + "hits": [ + { + "_index": "logs_object", + "_id": "5", + "_score": 1, + "_source": { + "request_object": """{"@timestamp": 894440400, "clientip":"132.176.2.0", "request": "GET /french/news/11354.htm HTTP/1.0", "status": 200, "size": 3460, "is_active": true}""" + }, + "fields": { + "derived_request_object.is_active": [ + true + ] + } + } + ] + } +} +``` + +
\ No newline at end of file diff --git a/_field-types/supported-field-types/index.md b/_field-types/supported-field-types/index.md index 69ca0032be..be0963e976 100644 --- a/_field-types/supported-field-types/index.md +++ b/_field-types/supported-field-types/index.md @@ -29,6 +29,7 @@ IP | [`ip`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/ip/): [Rank]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/rank/) | Boosts or decreases the relevance score of documents (`rank_feature`, `rank_features`). k-NN vector | [`knn_vector`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/knn-vector/): Allows indexing a k-NN vector into OpenSearch and performing different kinds of k-NN search. Percolator | [`percolator`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/percolator/): Specifies to treat this field as a query. +Derived | [`derived`]({{site.url}}{{site.baseurl}}/field-types/supported-field-types/derived/): Creates new fields dynamically by executing scripts on existing fields. ## Arrays